yt_dlp/extractor/common.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import base64
   5 import collections
   6 import datetime
   7 import hashlib
   8 import itertools
   9 import json
  10 import netrc
  11 import os
  12 import random
  13 import re
  14 import sys
  15 import time
  16 import math
  17
  18 from ..compat import (
  19     compat_cookiejar_Cookie,
  20     compat_cookies_SimpleCookie,
  21     compat_etree_Element,
  22     compat_etree_fromstring,
  23     compat_expanduser,
  24     compat_getpass,
  25     compat_http_client,
  26     compat_os_name,
  27     compat_str,
  28     compat_urllib_error,
  29     compat_urllib_parse_unquote,
  30     compat_urllib_parse_urlencode,
  31     compat_urllib_request,
  32     compat_urlparse,
  33     compat_xml_parse_error,
  34 )
  35 from ..downloader import FileDownloader
  36 from ..downloader.f4m import (
  37     get_base_url,
  38     remove_encrypted_media,
  39 )
  40 from ..utils import (
  41     age_restricted,
  42     base_url,
  43     bug_reports_message,
  44     clean_html,
  45     compiled_regex_type,
  46     determine_ext,
  47     determine_protocol,
  48     dict_get,
  49     error_to_compat_str,
  50     extract_attributes,
  51     ExtractorError,
  52     fix_xml_ampersands,
  53     float_or_none,
  54     format_field,
  55     GeoRestrictedError,
  56     GeoUtils,
  57     int_or_none,
  58     join_nonempty,
  59     js_to_json,
  60     JSON_LD_RE,
  61     mimetype2ext,
  62     network_exceptions,
  63     NO_DEFAULT,
  64     orderedSet,
  65     parse_bitrate,
  66     parse_codecs,
  67     parse_duration,
  68     parse_iso8601,
  69     parse_m3u8_attributes,
  70     parse_resolution,
  71     RegexNotFoundError,
  72     sanitize_filename,
  73     sanitized_Request,
  74     str_or_none,
  75     str_to_int,
  76     strip_or_none,
  77     traverse_obj,
  78     unescapeHTML,
  79     UnsupportedError,
  80     unified_strdate,
  81     unified_timestamp,
  82     update_Request,
  83     update_url_query,
  84     url_basename,
  85     url_or_none,
  86     urljoin,
  87     variadic,
  88     xpath_element,
  89     xpath_text,
  90     xpath_with_ns,
  91 )
  92
  93
  94 class InfoExtractor(object):
  95     """Information Extractor class.
  96
  97     Information extractors are the classes that, given a URL, extract
  98     information about the video (or videos) the URL refers to. This
  99     information includes the real video URL, the video title, author and
 100     others. The information is stored in a dictionary which is then
 101     passed to the YoutubeDL. The YoutubeDL processes this
 102     information possibly downloading the video to the file system, among
 103     other possible outcomes.
 104
 105     The type field determines the type of the result.
 106     By far the most common value (and the default if _type is missing) is
 107     "video", which indicates a single video.
 108
 109     For a video, the dictionaries must include the following fields:
 110
 111     id:             Video identifier.
 112     title:          Video title, unescaped.
 113
 114     Additionally, it must contain either a formats entry or a url one:
 115
 116     formats:        A list of dictionaries for each format available, ordered
 117                     from worst to best quality.
 118
 119                     Potential fields:
 120                     * url        The mandatory URL representing the media:
 121                                    for plain file media - HTTP URL of this file,
 122                                    for RTMP - RTMP URL,
 123                                    for HLS - URL of the M3U8 media playlist,
 124                                    for HDS - URL of the F4M manifest,
 125                                    for DASH
 126                                      - HTTP URL to plain file media (in case of
 127                                        unfragmented media)
 128                                      - URL of the MPD manifest or base URL
 129                                        representing the media if MPD manifest
 130                                        is parsed from a string (in case of
 131                                        fragmented media)
 132                                    for MSS - URL of the ISM manifest.
 133                     * manifest_url
 134                                  The URL of the manifest file in case of
 135                                  fragmented media:
 136                                    for HLS - URL of the M3U8 master playlist,
 137                                    for HDS - URL of the F4M manifest,
 138                                    for DASH - URL of the MPD manifest,
 139                                    for MSS - URL of the ISM manifest.
 140                     * ext        Will be calculated from URL if missing
 141                     * format     A human-readable description of the format
 142                                  ("mp4 container with h264/opus").
 143                                  Calculated from the format_id, width, height,
 144                                  and format_note fields if missing.
 145                     * format_id  A short description of the format
 146                                  ("mp4_h264_opus" or "19").
 147                                 Technically optional, but strongly recommended.
 148                     * format_note Additional info about the format
 149                                  ("3D" or "DASH video")
 150                     * width      Width of the video, if known
 151                     * height     Height of the video, if known
 152                     * resolution Textual description of width and height
 153                     * dynamic_range The dynamic range of the video. One of:
 154                                  "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
 155                     * tbr        Average bitrate of audio and video in KBit/s
 156                     * abr        Average audio bitrate in KBit/s
 157                     * acodec     Name of the audio codec in use
 158                     * asr        Audio sampling rate in Hertz
 159                     * vbr        Average video bitrate in KBit/s
 160                     * fps        Frame rate
 161                     * vcodec     Name of the video codec in use
 162                     * container  Name of the container format
 163                     * filesize   The number of bytes, if known in advance
 164                     * filesize_approx  An estimate for the number of bytes
 165                     * player_url SWF Player URL (used for rtmpdump).
 166                     * protocol   The protocol that will be used for the actual
 167                                  download, lower-case.
 168                                  "http", "https", "rtsp", "rtmp", "rtmp_ffmpeg", "rtmpe",
 169                                  "m3u8", "m3u8_native" or "http_dash_segments".
 170                     * fragment_base_url
 171                                  Base URL for fragments. Each fragment's path
 172                                  value (if present) will be relative to
 173                                  this URL.
 174                     * fragments  A list of fragments of a fragmented media.
 175                                  Each fragment entry must contain either an url
 176                                  or a path. If an url is present it should be
 177                                  considered by a client. Otherwise both path and
 178                                  fragment_base_url must be present. Here is
 179                                  the list of all potential fields:
 180                                  * "url" - fragment's URL
 181                                  * "path" - fragment's path relative to
 182                                             fragment_base_url
 183                                  * "duration" (optional, int or float)
 184                                  * "filesize" (optional, int)
 185                     * preference Order number of this format. If this field is
 186                                  present and not None, the formats get sorted
 187                                  by this field, regardless of all other values.
 188                                  -1 for default (order by other properties),
 189                                  -2 or smaller for less than default.
 190                                  < -1000 to hide the format (if there is
 191                                     another one which is strictly better)
 192                     * language   Language code, e.g. "de" or "en-US".
 193                     * language_preference  Is this in the language mentioned in
 194                                  the URL?
 195                                  10 if it's what the URL is about,
 196                                  -1 for default (don't know),
 197                                  -10 otherwise, other values reserved for now.
 198                     * quality    Order number of the video quality of this
 199                                  format, irrespective of the file format.
 200                                  -1 for default (order by other properties),
 201                                  -2 or smaller for less than default.
 202                     * source_preference  Order number for this video source
 203                                   (quality takes higher priority)
 204                                  -1 for default (order by other properties),
 205                                  -2 or smaller for less than default.
 206                     * http_headers  A dictionary of additional HTTP headers
 207                                  to add to the request.
 208                     * stretched_ratio  If given and not 1, indicates that the
 209                                  video's pixels are not square.
 210                                  width : height ratio as float.
 211                     * no_resume  The server does not support resuming the
 212                                  (HTTP or RTMP) download. Boolean.
 213                     * has_drm    The format has DRM and cannot be downloaded. Boolean
 214                     * downloader_options  A dictionary of downloader options as
 215                                  described in FileDownloader
 216                     RTMP formats can also have the additional fields: page_url,
 217                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 218                     rtmp_protocol, rtmp_real_time
 219
 220     url:            Final video URL.
 221     ext:            Video filename extension.
 222     format:         The video format, defaults to ext (used for --get-format)
 223     player_url:     SWF Player URL (used for rtmpdump).
 224
 225     The following fields are optional:
 226
 227     alt_title:      A secondary title of the video.
 228     display_id      An alternative identifier for the video, not necessarily
 229                     unique, but available before title. Typically, id is
 230                     something like "4234987", title "Dancing naked mole rats",
 231                     and display_id "dancing-naked-mole-rats"
 232     thumbnails:     A list of dictionaries, with the following entries:
 233                         * "id" (optional, string) - Thumbnail format ID
 234                         * "url"
 235                         * "preference" (optional, int) - quality of the image
 236                         * "width" (optional, int)
 237                         * "height" (optional, int)
 238                         * "resolution" (optional, string "{width}x{height}",
 239                                         deprecated)
 240                         * "filesize" (optional, int)
 241     thumbnail:      Full URL to a video thumbnail image.
 242     description:    Full video description.
 243     uploader:       Full name of the video uploader.
 244     license:        License name the video is licensed under.
 245     creator:        The creator of the video.
 246     release_timestamp: UNIX timestamp of the moment the video was released.
 247     release_date:   The date (YYYYMMDD) when the video was released.
 248     timestamp:      UNIX timestamp of the moment the video was uploaded
 249     upload_date:    Video upload date (YYYYMMDD).
 250                     If not explicitly set, calculated from timestamp.
 251     uploader_id:    Nickname or id of the video uploader.
 252     uploader_url:   Full URL to a personal webpage of the video uploader.
 253     channel:        Full name of the channel the video is uploaded on.
 254                     Note that channel fields may or may not repeat uploader
 255                     fields. This depends on a particular extractor.
 256     channel_id:     Id of the channel.
 257     channel_url:    Full URL to a channel webpage.
 258     location:       Physical location where the video was filmed.
 259     subtitles:      The available subtitles as a dictionary in the format
 260                     {tag: subformats}. "tag" is usually a language code, and
 261                     "subformats" is a list sorted from lower to higher
 262                     preference, each element is a dictionary with the "ext"
 263                     entry and one of:
 264                         * "data": The subtitles file contents
 265                         * "url": A URL pointing to the subtitles file
 266                     It can optionally also have:
 267                         * "name": Name or description of the subtitles
 268                     "ext" will be calculated from URL if missing
 269     automatic_captions: Like 'subtitles'; contains automatically generated
 270                     captions instead of normal subtitles
 271     duration:       Length of the video in seconds, as an integer or float.
 272     view_count:     How many users have watched the video on the platform.
 273     like_count:     Number of positive ratings of the video
 274     dislike_count:  Number of negative ratings of the video
 275     repost_count:   Number of reposts of the video
 276     average_rating: Average rating given by users, the scale used depends on the webpage
 277     comment_count:  Number of comments on the video
 278     comments:       A list of comments, each with one or more of the following
 279                     properties (all but one of text or html optional):
 280                         * "author" - human-readable name of the comment author
 281                         * "author_id" - user ID of the comment author
 282                         * "author_thumbnail" - The thumbnail of the comment author
 283                         * "id" - Comment ID
 284                         * "html" - Comment as HTML
 285                         * "text" - Plain text of the comment
 286                         * "timestamp" - UNIX timestamp of comment
 287                         * "parent" - ID of the comment this one is replying to.
 288                                      Set to "root" to indicate that this is a
 289                                      comment to the original video.
 290                         * "like_count" - Number of positive ratings of the comment
 291                         * "dislike_count" - Number of negative ratings of the comment
 292                         * "is_favorited" - Whether the comment is marked as
 293                                            favorite by the video uploader
 294                         * "author_is_uploader" - Whether the comment is made by
 295                                                  the video uploader
 296     age_limit:      Age restriction for the video, as an integer (years)
 297     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 298                     should allow to get the same result again. (It will be set
 299                     by YoutubeDL if it's missing)
 300     categories:     A list of categories that the video falls in, for example
 301                     ["Sports", "Berlin"]
 302     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 303     cast:           A list of the video cast
 304     is_live:        True, False, or None (=unknown). Whether this video is a
 305                     live stream that goes on instead of a fixed-length video.
 306     was_live:       True, False, or None (=unknown). Whether this video was
 307                     originally a live stream.
 308     live_status:    'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
 309                     If absent, automatically set from is_live, was_live
 310     start_time:     Time in seconds where the reproduction should start, as
 311                     specified in the URL.
 312     end_time:       Time in seconds where the reproduction should end, as
 313                     specified in the URL.
 314     chapters:       A list of dictionaries, with the following entries:
 315                         * "start_time" - The start time of the chapter in seconds
 316                         * "end_time" - The end time of the chapter in seconds
 317                         * "title" (optional, string)
 318     playable_in_embed: Whether this video is allowed to play in embedded
 319                     players on other sites. Can be True (=always allowed),
 320                     False (=never allowed), None (=unknown), or a string
 321                     specifying the criteria for embedability (Eg: 'whitelist')
 322     availability:   Under what condition the video is available. One of
 323                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 324                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 325                     to set it
 326     __post_extractor: A function to be called just before the metadata is
 327                     written to either disk, logger or console. The function
 328                     must return a dict which will be added to the info_dict.
 329                     This is useful for additional information that is
 330                     time-consuming to extract. Note that the fields thus
 331                     extracted will not be available to output template and
 332                     match_filter. So, only "comments" and "comment_count" are
 333                     currently allowed to be extracted via this method.
 334
 335     The following fields should only be used when the video belongs to some logical
 336     chapter or section:
 337
 338     chapter:        Name or title of the chapter the video belongs to.
 339     chapter_number: Number of the chapter the video belongs to, as an integer.
 340     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 341
 342     The following fields should only be used when the video is an episode of some
 343     series, programme or podcast:
 344
 345     series:         Title of the series or programme the video episode belongs to.
 346     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 347     season:         Title of the season the video episode belongs to.
 348     season_number:  Number of the season the video episode belongs to, as an integer.
 349     season_id:      Id of the season the video episode belongs to, as a unicode string.
 350     episode:        Title of the video episode. Unlike mandatory video title field,
 351                     this field should denote the exact title of the video episode
 352                     without any kind of decoration.
 353     episode_number: Number of the video episode within a season, as an integer.
 354     episode_id:     Id of the video episode, as a unicode string.
 355
 356     The following fields should only be used when the media is a track or a part of
 357     a music album:
 358
 359     track:          Title of the track.
 360     track_number:   Number of the track within an album or a disc, as an integer.
 361     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 362                     as a unicode string.
 363     artist:         Artist(s) of the track.
 364     genre:          Genre(s) of the track.
 365     album:          Title of the album the track belongs to.
 366     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 367     album_artist:   List of all artists that appeared on the album (e.g.
 368                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 369                     and compilations).
 370     disc_number:    Number of the disc or other physical medium the track belongs to,
 371                     as an integer.
 372     release_year:   Year (YYYY) when the album was released.
 373
 374     Unless mentioned otherwise, the fields should be Unicode strings.
 375
 376     Unless mentioned otherwise, None is equivalent to absence of information.
 377
 378
 379     _type "playlist" indicates multiple videos.
 380     There must be a key "entries", which is a list, an iterable, or a PagedList
 381     object, each element of which is a valid dictionary by this specification.
 382
 383     Additionally, playlists can have "id", "title", and any other relevant
 384     attributes with the same semantics as videos (see above).
 385
 386
 387     _type "multi_video" indicates that there are multiple videos that
 388     form a single show, for example multiple acts of an opera or TV episode.
 389     It must have an entries key like a playlist and contain all the keys
 390     required for a video at the same time.
 391
 392
 393     _type "url" indicates that the video must be extracted from another
 394     location, possibly by a different extractor. Its only required key is:
 395     "url" - the next URL to extract.
 396     The key "ie_key" can be set to the class name (minus the trailing "IE",
 397     e.g. "Youtube") if the extractor class is known in advance.
 398     Additionally, the dictionary may have any properties of the resolved entity
 399     known in advance, for example "title" if the title of the referred video is
 400     known ahead of time.
 401
 402
 403     _type "url_transparent" entities have the same specification as "url", but
 404     indicate that the given additional information is more precise than the one
 405     associated with the resolved URL.
 406     This is useful when a site employs a video service that hosts the video and
 407     its technical metadata, but that video service does not embed a useful
 408     title, description etc.
 409
 410
 411     Subclasses of this one should re-define the _real_initialize() and
 412     _real_extract() methods and define a _VALID_URL regexp.
 413     Probably, they should also be added to the list of extractors.
 414
 415     Subclasses may also override suitable() if necessary, but ensure the function
 416     signature is preserved and that this function imports everything it needs
 417     (except other extractors), so that lazy_extractors works correctly
 418
 419     _GEO_BYPASS attribute may be set to False in order to disable
 420     geo restriction bypass mechanisms for a particular extractor.
 421     Though it won't disable explicit geo restriction bypass based on
 422     country code provided with geo_bypass_country.
 423
 424     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 425     countries for this extractor. One of these countries will be used by
 426     geo restriction bypass mechanism right away in order to bypass
 427     geo restriction, of course, if the mechanism is not disabled.
 428
 429     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 430     IP blocks in CIDR notation for this extractor. One of these IP blocks
 431     will be used by geo restriction bypass mechanism similarly
 432     to _GEO_COUNTRIES.
 433
 434     The _WORKING attribute should be set to False for broken IEs
 435     in order to warn the users and skip the tests.
 436     """
 437
    # Flipped to True by initialize() once _real_initialize() has completed
    _ready = False
    # The downloader this extractor is bound to (a YoutubeDL instance according
    # to the constructor docstring); presumably assigned by set_downloader() -
    # TODO confirm, that method is not visible from here
    _downloader = None
    # Fake IP chosen by the geo bypass mechanism for the X-Forwarded-For
    # header; None while no fake IP is in use
    _x_forwarded_for_ip = None
    # Geo bypass knobs, described in the class docstring: whether bypass is
    # enabled for this extractor, and the country codes / CIDR blocks that
    # are presumed to be geo unrestricted
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    # Set to False for broken IEs (see class docstring); read via working()
    _WORKING = True

    # Hint texts telling the user how to supply credentials, keyed by the kind
    # of authentication ('any', 'cookies' or 'password').
    # NOTE(review): the code that displays these is not visible here
    _LOGIN_HINTS = {
        'any': 'Use --cookies, --username and --password, or --netrc to provide account credentials',
        'cookies': (
            'Use --cookies-from-browser or --cookies for the authentication. '
            'See  https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl  for how to manually pass cookies'),
        'password': 'Use --username and --password, or --netrc to provide account credentials',
    }
 453
 454     def __init__(self, downloader=None):
 455         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 456         If a downloader is not passed during initialization,
 457         it must be set using "set_downloader()" before "extract()" is called"""
 458         self._ready = False
 459         self._x_forwarded_for_ip = None
 460         self._printed_messages = set()
 461         self.set_downloader(downloader)
 462
 463     @classmethod
 464     def _match_valid_url(cls, url):
 465         # This does not use has/getattr intentionally - we want to know whether
 466         # we have cached the regexp for *this* class, whereas getattr would also
 467         # match the superclass
 468         if '_VALID_URL_RE' not in cls.__dict__:
 469             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 470         return cls._VALID_URL_RE.match(url)
 471
 472     @classmethod
 473     def suitable(cls, url):
 474         """Receives a URL and returns True if suitable for this IE."""
 475         # This function must import everything it needs (except other extractors),
 476         # so that lazy_extractors works correctly
 477         return cls._match_valid_url(url) is not None
 478
 479     @classmethod
 480     def _match_id(cls, url):
 481         return cls._match_valid_url(url).group('id')
 482
 483     @classmethod
 484     def get_temp_id(cls, url):
 485         try:
 486             return cls._match_id(url)
 487         except (IndexError, AttributeError):
 488             return None
 489
 490     @classmethod
 491     def working(cls):
 492         """Getter method for _WORKING."""
 493         return cls._WORKING
 494
 495     def initialize(self):
 496         """Initializes an instance (authentication, etc)."""
 497         self._printed_messages = set()
 498         self._initialize_geo_bypass({
 499             'countries': self._GEO_COUNTRIES,
 500             'ip_blocks': self._GEO_IP_BLOCKS,
 501         })
 502         if not self._ready:
 503             self._real_initialize()
 504             self._ready = True
 505
 506     def _initialize_geo_bypass(self, geo_bypass_context):
 507         """
 508         Initialize geo restriction bypass mechanism.
 509
 510         This method is used to initialize geo bypass mechanism based on faking
 511         X-Forwarded-For HTTP header. A random country from provided country list
 512         is selected and a random IP belonging to this country is generated. This
 513         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 514         HTTP requests.
 515
 516         This method will be used for initial geo bypass mechanism initialization
 517         during the instance initialization with _GEO_COUNTRIES and
 518         _GEO_IP_BLOCKS.
 519
 520         You may also manually call it from extractor's code if geo bypass
 521         information is not available beforehand (e.g. obtained during
 522         extraction) or due to some other reason. In this case you should pass
 523         this information in geo bypass context passed as first argument. It may
 524         contain following fields:
 525
 526         countries:  List of geo unrestricted countries (similar
 527                     to _GEO_COUNTRIES)
 528         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 529                     (similar to _GEO_IP_BLOCKS)
 530
 531         """
 532         if not self._x_forwarded_for_ip:
 533
 534             # Geo bypass mechanism is explicitly disabled by user
 535             if not self.get_param('geo_bypass', True):
 536                 return
 537
 538             if not geo_bypass_context:
 539                 geo_bypass_context = {}
 540
 541             # Backward compatibility: previously _initialize_geo_bypass
 542             # expected a list of countries, some 3rd party code may still use
 543             # it this way
 544             if isinstance(geo_bypass_context, (list, tuple)):
 545                 geo_bypass_context = {
 546                     'countries': geo_bypass_context,
 547                 }
 548
 549             # The whole point of geo bypass mechanism is to fake IP
 550             # as X-Forwarded-For HTTP header based on some IP block or
 551             # country code.
 552
 553             # Path 1: bypassing based on IP block in CIDR notation
 554
 555             # Explicit IP block specified by user, use it right away
 556             # regardless of whether extractor is geo bypassable or not
 557             ip_block = self.get_param('geo_bypass_ip_block', None)
 558
 559             # Otherwise use random IP block from geo bypass context but only
 560             # if extractor is known as geo bypassable
 561             if not ip_block:
 562                 ip_blocks = geo_bypass_context.get('ip_blocks')
 563                 if self._GEO_BYPASS and ip_blocks:
 564                     ip_block = random.choice(ip_blocks)
 565
 566             if ip_block:
 567                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 568                 self._downloader.write_debug(
 569                     '[debug] Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
 570                 return
 571
 572             # Path 2: bypassing based on country code
 573
 574             # Explicit country code specified by user, use it right away
 575             # regardless of whether extractor is geo bypassable or not
 576             country = self.get_param('geo_bypass_country', None)
 577
 578             # Otherwise use random country code from geo bypass context but
 579             # only if extractor is known as geo bypassable
 580             if not country:
 581                 countries = geo_bypass_context.get('countries')
 582                 if self._GEO_BYPASS and countries:
 583                     country = random.choice(countries)
 584
 585             if country:
 586                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 587                 self._downloader.write_debug(
 588                     'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))
 589
 590     def extract(self, url):
 591         """Extracts URL information and returns it in list of dicts."""
 592         try:
 593             for _ in range(2):
 594                 try:
 595                     self.initialize()
 596                     self.write_debug('Extracting URL: %s' % url)
 597                     ie_result = self._real_extract(url)
 598                     if ie_result is None:
 599                         return None
 600                     if self._x_forwarded_for_ip:
 601                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 602                     subtitles = ie_result.get('subtitles')
 603                     if (subtitles and 'live_chat' in subtitles
 604                             and 'no-live-chat' in self.get_param('compat_opts', [])):
 605                         del subtitles['live_chat']
 606                     return ie_result
 607                 except GeoRestrictedError as e:
 608                     if self.__maybe_fake_ip_and_retry(e.countries):
 609                         continue
 610                     raise
 611         except UnsupportedError:
 612             raise
 613         except ExtractorError as e:
 614             kwargs = {
 615                 'video_id': e.video_id or self.get_temp_id(url),
 616                 'ie': self.IE_NAME,
 617                 'tb': e.traceback,
 618                 'expected': e.expected,
 619                 'cause': e.cause
 620             }
 621             if hasattr(e, 'countries'):
 622                 kwargs['countries'] = e.countries
 623             raise type(e)(e.msg, **kwargs)
 624         except compat_http_client.IncompleteRead as e:
 625             raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
 626         except (KeyError, StopIteration) as e:
 627             raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 628
 629     def __maybe_fake_ip_and_retry(self, countries):
 630         if (not self.get_param('geo_bypass_country', None)
 631                 and self._GEO_BYPASS
 632                 and self.get_param('geo_bypass', True)
 633                 and not self._x_forwarded_for_ip
 634                 and countries):
 635             country_code = random.choice(countries)
 636             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 637             if self._x_forwarded_for_ip:
 638                 self.report_warning(
 639                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 640                     % (self._x_forwarded_for_ip, country_code.upper()))
 641                 return True
 642         return False
 643
 644     def set_downloader(self, downloader):
 645         """Sets the downloader for this IE."""
 646         self._downloader = downloader
 647
 648     def _real_initialize(self):
 649         """Real initialization process. Redefine in subclasses."""
 650         pass
 651
 652     def _real_extract(self, url):
 653         """Real extraction process. Redefine in subclasses."""
 654         pass
 655
 656     @classmethod
 657     def ie_key(cls):
 658         """A string for getting the InfoExtractor with get_info_extractor"""
 659         return cls.__name__[:-2]
 660
 661     @property
 662     def IE_NAME(self):
 663         return compat_str(type(self).__name__[:-2])
 664
 665     @staticmethod
 666     def __can_accept_status_code(err, expected_status):
 667         assert isinstance(err, compat_urllib_error.HTTPError)
 668         if expected_status is None:
 669             return False
 670         elif callable(expected_status):
 671             return expected_status(err.code) is True
 672         else:
 673             return err.code in variadic(expected_status)
 674
 675     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
 676         """
 677         Return the response handle.
 678
 679         See _download_webpage docstring for arguments specification.
 680         """
 681         if not self._downloader._first_webpage_request:
 682             sleep_interval = self.get_param('sleep_interval_requests') or 0
 683             if sleep_interval > 0:
 684                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 685                 time.sleep(sleep_interval)
 686         else:
 687             self._downloader._first_webpage_request = False
 688
 689         if note is None:
 690             self.report_download_webpage(video_id)
 691         elif note is not False:
 692             if video_id is None:
 693                 self.to_screen('%s' % (note,))
 694             else:
 695                 self.to_screen('%s: %s' % (video_id, note))
 696
 697         # Some sites check X-Forwarded-For HTTP header in order to figure out
 698         # the origin of the client behind proxy. This allows bypassing geo
 699         # restriction by faking this header's value to IP that belongs to some
 700         # geo unrestricted country. We will do so once we encounter any
 701         # geo restriction error.
 702         if self._x_forwarded_for_ip:
 703             if 'X-Forwarded-For' not in headers:
 704                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 705
 706         if isinstance(url_or_request, compat_urllib_request.Request):
 707             url_or_request = update_Request(
 708                 url_or_request, data=data, headers=headers, query=query)
 709         else:
 710             if query:
 711                 url_or_request = update_url_query(url_or_request, query)
 712             if data is not None or headers:
 713                 url_or_request = sanitized_Request(url_or_request, data, headers)
 714         try:
 715             return self._downloader.urlopen(url_or_request)
 716         except network_exceptions as err:
 717             if isinstance(err, compat_urllib_error.HTTPError):
 718                 if self.__can_accept_status_code(err, expected_status):
 719                     # Retain reference to error to prevent file object from
 720                     # being closed before it can be read. Works around the
 721                     # effects of <https://bugs.python.org/issue15002>
 722                     # introduced in Python 3.4.1.
 723                     err.fp._error = err
 724                     return err.fp
 725
 726             if errnote is False:
 727                 return False
 728             if errnote is None:
 729                 errnote = 'Unable to download webpage'
 730
 731             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 732             if fatal:
 733                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 734             else:
 735                 self.report_warning(errmsg)
 736                 return False
 737
 738     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
 739         """
 740         Return a tuple (page content as string, URL handle).
 741
 742         See _download_webpage docstring for arguments specification.
 743         """
 744         # Strip hashes from the URL (#1038)
 745         if isinstance(url_or_request, (compat_str, str)):
 746             url_or_request = url_or_request.partition('#')[0]
 747
 748         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 749         if urlh is False:
 750             assert not fatal
 751             return False
 752         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 753         return (content, urlh)
 754
 755     @staticmethod
 756     def _guess_encoding_from_content(content_type, webpage_bytes):
 757         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 758         if m:
 759             encoding = m.group(1)
 760         else:
 761             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 762                           webpage_bytes[:1024])
 763             if m:
 764                 encoding = m.group(1).decode('ascii')
 765             elif webpage_bytes.startswith(b'\xff\xfe'):
 766                 encoding = 'utf-16'
 767             else:
 768                 encoding = 'utf-8'
 769
 770         return encoding
 771
 772     def __check_blocked(self, content):
 773         first_block = content[:512]
 774         if ('<title>Access to this site is blocked</title>' in content
 775                 and 'Websense' in first_block):
 776             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 777             blocked_iframe = self._html_search_regex(
 778                 r'<iframe src="([^"]+)"', content,
 779                 'Websense information URL', default=None)
 780             if blocked_iframe:
 781                 msg += ' Visit %s for more details' % blocked_iframe
 782             raise ExtractorError(msg, expected=True)
 783         if '<title>The URL you requested has been blocked</title>' in first_block:
 784             msg = (
 785                 'Access to this webpage has been blocked by Indian censorship. '
 786                 'Use a VPN or proxy server (with --proxy) to route around it.')
 787             block_msg = self._html_search_regex(
 788                 r'</h1><p>(.*?)</p>',
 789                 content, 'block message', default=None)
 790             if block_msg:
 791                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 792             raise ExtractorError(msg, expected=True)
 793         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 794                 and 'blocklist.rkn.gov.ru' in content):
 795             raise ExtractorError(
 796                 'Access to this webpage has been blocked by decision of the Russian government. '
 797                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 798                 expected=True)
 799
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        """Read the whole response body from urlh and return it decoded.

        prefix, if given, is prepended to the bytes read from urlh. When
        encoding is not given it is guessed from the Content-Type header and
        the page bytes. Depending on the 'dump_intermediate_pages' and
        'write_pages' params the raw bytes are additionally printed
        (base64-encoded) or saved to a '.dump' file. The decoded content is
        checked for known "blocked" pages before being returned.

        url_or_request, note, errnote and fatal are not used in this method.
        """
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self.get_param('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self.get_param('write_pages', False):
            basen = '%s_%s' % (video_id, urlh.geturl())
            trim_length = self.get_param('trim_file_name') or 240
            # Over-long names are cut and get an MD5 suffix of the full name,
            # so that the result is exactly trim_length characters long
            if len(basen) > trim_length:
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:trim_length - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        # An unknown encoding name falls back to UTF-8; undecodable bytes are
        # replaced rather than raising
        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content
 837
 838     def _download_webpage(
 839             self, url_or_request, video_id, note=None, errnote=None,
 840             fatal=True, tries=1, timeout=5, encoding=None, data=None,
 841             headers={}, query={}, expected_status=None):
 842         """
 843         Return the data of the page as a string.
 844
 845         Arguments:
 846         url_or_request -- plain text URL as a string or
 847             a compat_urllib_request.Requestobject
 848         video_id -- Video/playlist/item identifier (string)
 849
 850         Keyword arguments:
 851         note -- note printed before downloading (string)
 852         errnote -- note printed in case of an error (string)
 853         fatal -- flag denoting whether error should be considered fatal,
 854             i.e. whether it should cause ExtractionError to be raised,
 855             otherwise a warning will be reported and extraction continued
 856         tries -- number of tries
 857         timeout -- sleep interval between tries
 858         encoding -- encoding for a page content decoding, guessed automatically
 859             when not explicitly specified
 860         data -- POST data (bytes)
 861         headers -- HTTP headers (dict)
 862         query -- URL query (dict)
 863         expected_status -- allows to accept failed HTTP requests (non 2xx
 864             status code) by explicitly specifying a set of accepted status
 865             codes. Can be any of the following entities:
 866                 - an integer type specifying an exact failed status code to
 867                   accept
 868                 - a list or a tuple of integer types specifying a list of
 869                   failed status codes to accept
 870                 - a callable accepting an actual failed status code and
 871                   returning True if it should be accepted
 872             Note that this argument does not affect success status codes (2xx)
 873             which are always accepted.
 874         """
 875
 876         success = False
 877         try_count = 0
 878         while success is False:
 879             try:
 880                 res = self._download_webpage_handle(
 881                     url_or_request, video_id, note, errnote, fatal,
 882                     encoding=encoding, data=data, headers=headers, query=query,
 883                     expected_status=expected_status)
 884                 success = True
 885             except compat_http_client.IncompleteRead as e:
 886                 try_count += 1
 887                 if try_count >= tries:
 888                     raise e
 889                 self._sleep(timeout, video_id)
 890         if res is False:
 891             return res
 892         else:
 893             content, _ = res
 894             return content
 895
 896     def _download_xml_handle(
 897             self, url_or_request, video_id, note='Downloading XML',
 898             errnote='Unable to download XML', transform_source=None,
 899             fatal=True, encoding=None, data=None, headers={}, query={},
 900             expected_status=None):
 901         """
 902         Return a tuple (xml as an compat_etree_Element, URL handle).
 903
 904         See _download_webpage docstring for arguments specification.
 905         """
 906         res = self._download_webpage_handle(
 907             url_or_request, video_id, note, errnote, fatal=fatal,
 908             encoding=encoding, data=data, headers=headers, query=query,
 909             expected_status=expected_status)
 910         if res is False:
 911             return res
 912         xml_string, urlh = res
 913         return self._parse_xml(
 914             xml_string, video_id, transform_source=transform_source,
 915             fatal=fatal), urlh
 916
 917     def _download_xml(
 918             self, url_or_request, video_id,
 919             note='Downloading XML', errnote='Unable to download XML',
 920             transform_source=None, fatal=True, encoding=None,
 921             data=None, headers={}, query={}, expected_status=None):
 922         """
 923         Return the xml as an compat_etree_Element.
 924
 925         See _download_webpage docstring for arguments specification.
 926         """
 927         res = self._download_xml_handle(
 928             url_or_request, video_id, note=note, errnote=errnote,
 929             transform_source=transform_source, fatal=fatal, encoding=encoding,
 930             data=data, headers=headers, query=query,
 931             expected_status=expected_status)
 932         return res if res is False else res[0]
 933
 934     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 935         if transform_source:
 936             xml_string = transform_source(xml_string)
 937         try:
 938             return compat_etree_fromstring(xml_string.encode('utf-8'))
 939         except compat_xml_parse_error as ve:
 940             errmsg = '%s: Failed to parse XML ' % video_id
 941             if fatal:
 942                 raise ExtractorError(errmsg, cause=ve)
 943             else:
 944                 self.report_warning(errmsg + str(ve))
 945
 946     def _download_json_handle(
 947             self, url_or_request, video_id, note='Downloading JSON metadata',
 948             errnote='Unable to download JSON metadata', transform_source=None,
 949             fatal=True, encoding=None, data=None, headers={}, query={},
 950             expected_status=None):
 951         """
 952         Return a tuple (JSON object, URL handle).
 953
 954         See _download_webpage docstring for arguments specification.
 955         """
 956         res = self._download_webpage_handle(
 957             url_or_request, video_id, note, errnote, fatal=fatal,
 958             encoding=encoding, data=data, headers=headers, query=query,
 959             expected_status=expected_status)
 960         if res is False:
 961             return res
 962         json_string, urlh = res
 963         return self._parse_json(
 964             json_string, video_id, transform_source=transform_source,
 965             fatal=fatal), urlh
 966
 967     def _download_json(
 968             self, url_or_request, video_id, note='Downloading JSON metadata',
 969             errnote='Unable to download JSON metadata', transform_source=None,
 970             fatal=True, encoding=None, data=None, headers={}, query={},
 971             expected_status=None):
 972         """
 973         Return the JSON object as a dict.
 974
 975         See _download_webpage docstring for arguments specification.
 976         """
 977         res = self._download_json_handle(
 978             url_or_request, video_id, note=note, errnote=errnote,
 979             transform_source=transform_source, fatal=fatal, encoding=encoding,
 980             data=data, headers=headers, query=query,
 981             expected_status=expected_status)
 982         return res if res is False else res[0]
 983
 984     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 985         if transform_source:
 986             json_string = transform_source(json_string)
 987         try:
 988             return json.loads(json_string)
 989         except ValueError as ve:
 990             errmsg = '%s: Failed to parse JSON ' % video_id
 991             if fatal:
 992                 raise ExtractorError(errmsg, cause=ve)
 993             else:
 994                 self.report_warning(errmsg + str(ve))
 995
 996     def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
 997         return self._parse_json(
 998             data[data.find('{'):data.rfind('}') + 1],
 999             video_id, transform_source, fatal)
1000
1001     def _download_socket_json_handle(
1002             self, url_or_request, video_id, note='Polling socket',
1003             errnote='Unable to poll socket', transform_source=None,
1004             fatal=True, encoding=None, data=None, headers={}, query={},
1005             expected_status=None):
1006         """
1007         Return a tuple (JSON object, URL handle).
1008
1009         See _download_webpage docstring for arguments specification.
1010         """
1011         res = self._download_webpage_handle(
1012             url_or_request, video_id, note, errnote, fatal=fatal,
1013             encoding=encoding, data=data, headers=headers, query=query,
1014             expected_status=expected_status)
1015         if res is False:
1016             return res
1017         webpage, urlh = res
1018         return self._parse_socket_response_as_json(
1019             webpage, video_id, transform_source=transform_source,
1020             fatal=fatal), urlh
1021
1022     def _download_socket_json(
1023             self, url_or_request, video_id, note='Polling socket',
1024             errnote='Unable to poll socket', transform_source=None,
1025             fatal=True, encoding=None, data=None, headers={}, query={},
1026             expected_status=None):
1027         """
1028         Return the JSON object as a dict.
1029
1030         See _download_webpage docstring for arguments specification.
1031         """
1032         res = self._download_socket_json_handle(
1033             url_or_request, video_id, note=note, errnote=errnote,
1034             transform_source=transform_source, fatal=fatal, encoding=encoding,
1035             data=data, headers=headers, query=query,
1036             expected_status=expected_status)
1037         return res if res is False else res[0]
1038
1039     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1040         idstr = format_field(video_id, template='%s: ')
1041         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1042         if only_once:
1043             if f'WARNING: {msg}' in self._printed_messages:
1044                 return
1045             self._printed_messages.add(f'WARNING: {msg}')
1046         self._downloader.report_warning(msg, *args, **kwargs)
1047
1048     def to_screen(self, msg, *args, **kwargs):
1049         """Print msg to screen, prefixing it with '[ie_name]'"""
1050         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1051
1052     def write_debug(self, msg, *args, **kwargs):
1053         self._downloader.write_debug('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1054
1055     def get_param(self, name, default=None, *args, **kwargs):
1056         if self._downloader:
1057             return self._downloader.params.get(name, default, *args, **kwargs)
1058         return default
1059
1060     def report_drm(self, video_id, partial=False):
1061         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1062
1063     def report_extraction(self, id_or_name):
1064         """Report information extraction."""
1065         self.to_screen('%s: Extracting information' % id_or_name)
1066
1067     def report_download_webpage(self, video_id):
1068         """Report webpage download."""
1069         self.to_screen('%s: Downloading webpage' % video_id)
1070
1071     def report_age_confirmation(self):
1072         """Report attempt to confirm age."""
1073         self.to_screen('Confirming age')
1074
1075     def report_login(self):
1076         """Report attempt to log in."""
1077         self.to_screen('Logging in')
1078
1079     def raise_login_required(
1080             self, msg='This video is only available for registered users',
1081             metadata_available=False, method='any'):
1082         if metadata_available and (
1083                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1084             self.report_warning(msg)
1085         if method is not None:
1086             msg = '%s. %s' % (msg, self._LOGIN_HINTS[method])
1087         raise ExtractorError(msg, expected=True)
1088
1089     def raise_geo_restricted(
1090             self, msg='This video is not available from your location due to geo restriction',
1091             countries=None, metadata_available=False):
1092         if metadata_available and (
1093                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1094             self.report_warning(msg)
1095         else:
1096             raise GeoRestrictedError(msg, countries=countries)
1097
1098     def raise_no_formats(self, msg, expected=False, video_id=None):
1099         if expected and (
1100                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1101             self.report_warning(msg, video_id)
1102         elif isinstance(msg, ExtractorError):
1103             raise msg
1104         else:
1105             raise ExtractorError(msg, expected=expected, video_id=video_id)
1106
1107     # Methods for following #608
1108     @staticmethod
1109     def url_result(url, ie=None, video_id=None, video_title=None, **kwargs):
1110         """Returns a URL that points to a page that should be processed"""
1111         # TODO: ie should be the class used for getting the info
1112         video_info = {'_type': 'url',
1113                       'url': url,
1114                       'ie_key': ie}
1115         video_info.update(kwargs)
1116         if video_id is not None:
1117             video_info['id'] = video_id
1118         if video_title is not None:
1119             video_info['title'] = video_title
1120         return video_info
1121
1122     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
1123         urls = orderedSet(
1124             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
1125             for m in matches)
1126         return self.playlist_result(
1127             urls, playlist_id=playlist_id, playlist_title=playlist_title)
1128
1129     @staticmethod
1130     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
1131         """Returns a playlist"""
1132         video_info = {'_type': 'playlist',
1133                       'entries': entries}
1134         video_info.update(kwargs)
1135         if playlist_id:
1136             video_info['id'] = playlist_id
1137         if playlist_title:
1138             video_info['title'] = playlist_title
1139         if playlist_description is not None:
1140             video_info['description'] = playlist_description
1141         return video_info
1142
1143     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1144         """
1145         Perform a regex search on the given string, using a single or a list of
1146         patterns returning the first matching group.
1147         In case of failure return a default value or raise a WARNING or a
1148         RegexNotFoundError, depending on fatal, specifying the field name.
1149         """
1150         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
1151             mobj = re.search(pattern, string, flags)
1152         else:
1153             for p in pattern:
1154                 mobj = re.search(p, string, flags)
1155                 if mobj:
1156                     break
1157
1158         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1159
1160         if mobj:
1161             if group is None:
1162                 # return the first matching group
1163                 return next(g for g in mobj.groups() if g is not None)
1164             elif isinstance(group, (list, tuple)):
1165                 return tuple(mobj.group(g) for g in group)
1166             else:
1167                 return mobj.group(group)
1168         elif default is not NO_DEFAULT:
1169             return default
1170         elif fatal:
1171             raise RegexNotFoundError('Unable to extract %s' % _name)
1172         else:
1173             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1174             return None
1175
1176     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1177         """
1178         Like _search_regex, but strips HTML tags and unescapes entities.
1179         """
1180         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1181         if res:
1182             return clean_html(res).strip()
1183         else:
1184             return res
1185
1186     def _get_netrc_login_info(self, netrc_machine=None):
1187         username = None
1188         password = None
1189         netrc_machine = netrc_machine or self._NETRC_MACHINE
1190
1191         if self.get_param('usenetrc', False):
1192             try:
1193                 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1194                 if os.path.isdir(netrc_file):
1195                     netrc_file = os.path.join(netrc_file, '.netrc')
1196                 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1197                 if info is not None:
1198                     username = info[0]
1199                     password = info[2]
1200                 else:
1201                     raise netrc.NetrcParseError(
1202                         'No authenticators for %s' % netrc_machine)
1203             except (IOError, netrc.NetrcParseError) as err:
1204                 self.report_warning(
1205                     'parsing .netrc: %s' % error_to_compat_str(err))
1206
1207         return username, password
1208
1209     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1210         """
1211         Get the login info as (username, password)
1212         First look for the manually specified credentials using username_option
1213         and password_option as keys in params dictionary. If no such credentials
1214         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1215         value.
1216         If there's no info available, return (None, None)
1217         """
1218
1219         # Attempt to use provided username and password or .netrc data
1220         username = self.get_param(username_option)
1221         if username is not None:
1222             password = self.get_param(password_option)
1223         else:
1224             username, password = self._get_netrc_login_info(netrc_machine)
1225
1226         return username, password
1227
1228     def _get_tfa_info(self, note='two-factor verification code'):
1229         """
1230         Get the two-factor authentication info
1231         TODO - asking the user will be required for sms/phone verify
1232         currently just uses the command line option
1233         If there's no info available, return None
1234         """
1235
1236         tfa = self.get_param('twofactor')
1237         if tfa is not None:
1238             return tfa
1239
1240         return compat_getpass('Type %s and press [Return]: ' % note)
1241
1242     # Helper functions for extracting OpenGraph info
1243     @staticmethod
1244     def _og_regexes(prop):
1245         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1246         property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
1247                        % {'prop': re.escape(prop)})
1248         template = r'<meta[^>]+?%s[^>]+?%s'
1249         return [
1250             template % (property_re, content_re),
1251             template % (content_re, property_re),
1252         ]
1253
1254     @staticmethod
1255     def _meta_regex(prop):
1256         return r'''(?isx)<meta
1257                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1258                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1259
1260     def _og_search_property(self, prop, html, name=None, **kargs):
1261         prop = variadic(prop)
1262         if name is None:
1263             name = 'OpenGraph %s' % prop[0]
1264         og_regexes = []
1265         for p in prop:
1266             og_regexes.extend(self._og_regexes(p))
1267         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1268         if escaped is None:
1269             return None
1270         return unescapeHTML(escaped)
1271
1272     def _og_search_thumbnail(self, html, **kargs):
1273         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1274
1275     def _og_search_description(self, html, **kargs):
1276         return self._og_search_property('description', html, fatal=False, **kargs)
1277
1278     def _og_search_title(self, html, **kargs):
1279         return self._og_search_property('title', html, **kargs)
1280
1281     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1282         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1283         if secure:
1284             regexes = self._og_regexes('video:secure_url') + regexes
1285         return self._html_search_regex(regexes, html, name, **kargs)
1286
1287     def _og_search_url(self, html, **kargs):
1288         return self._og_search_property('url', html, **kargs)
1289
1290     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1291         name = variadic(name)
1292         if display_name is None:
1293             display_name = name[0]
1294         return self._html_search_regex(
1295             [self._meta_regex(n) for n in name],
1296             html, display_name, fatal=fatal, group='content', **kwargs)
1297
1298     def _dc_search_uploader(self, html):
1299         return self._html_search_meta('dc.creator', html, 'uploader')
1300
1301     def _rta_search(self, html):
1302         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1303         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1304                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1305                      html):
1306             return 18
1307         return 0
1308
1309     def _media_rating_search(self, html):
1310         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1311         rating = self._html_search_meta('rating', html)
1312
1313         if not rating:
1314             return None
1315
1316         RATING_TABLE = {
1317             'safe for kids': 0,
1318             'general': 8,
1319             '14 years': 14,
1320             'mature': 17,
1321             'restricted': 19,
1322         }
1323         return RATING_TABLE.get(rating.lower())
1324
1325     def _family_friendly_search(self, html):
1326         # See http://schema.org/VideoObject
1327         family_friendly = self._html_search_meta(
1328             'isFamilyFriendly', html, default=None)
1329
1330         if not family_friendly:
1331             return None
1332
1333         RATING_TABLE = {
1334             '1': 0,
1335             'true': 0,
1336             '0': 18,
1337             'false': 18,
1338         }
1339         return RATING_TABLE.get(family_friendly.lower())
1340
1341     def _twitter_search_player(self, html):
1342         return self._html_search_meta('twitter:player', html,
1343                                       'twitter card player')
1344
1345     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1346         json_ld_list = list(re.finditer(JSON_LD_RE, html))
1347         default = kwargs.get('default', NO_DEFAULT)
1348         # JSON-LD may be malformed and thus `fatal` should be respected.
1349         # At the same time `default` may be passed that assumes `fatal=False`
1350         # for _search_regex. Let's simulate the same behavior here as well.
1351         fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
1352         json_ld = []
1353         for mobj in json_ld_list:
1354             json_ld_item = self._parse_json(
1355                 mobj.group('json_ld'), video_id, fatal=fatal)
1356             if not json_ld_item:
1357                 continue
1358             if isinstance(json_ld_item, dict):
1359                 json_ld.append(json_ld_item)
1360             elif isinstance(json_ld_item, (list, tuple)):
1361                 json_ld.extend(json_ld_item)
1362         if json_ld:
1363             json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1364         if json_ld:
1365             return json_ld
1366         if default is not NO_DEFAULT:
1367             return default
1368         elif fatal:
1369             raise RegexNotFoundError('Unable to extract JSON-LD')
1370         else:
1371             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1372             return {}
1373
1374     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1375         if isinstance(json_ld, compat_str):
1376             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1377         if not json_ld:
1378             return {}
1379         info = {}
1380         if not isinstance(json_ld, (list, tuple, dict)):
1381             return info
1382         if isinstance(json_ld, dict):
1383             json_ld = [json_ld]
1384
1385         INTERACTION_TYPE_MAP = {
1386             'CommentAction': 'comment',
1387             'AgreeAction': 'like',
1388             'DisagreeAction': 'dislike',
1389             'LikeAction': 'like',
1390             'DislikeAction': 'dislike',
1391             'ListenAction': 'view',
1392             'WatchAction': 'view',
1393             'ViewAction': 'view',
1394         }
1395
1396         def extract_interaction_type(e):
1397             interaction_type = e.get('interactionType')
1398             if isinstance(interaction_type, dict):
1399                 interaction_type = interaction_type.get('@type')
1400             return str_or_none(interaction_type)
1401
1402         def extract_interaction_statistic(e):
1403             interaction_statistic = e.get('interactionStatistic')
1404             if isinstance(interaction_statistic, dict):
1405                 interaction_statistic = [interaction_statistic]
1406             if not isinstance(interaction_statistic, list):
1407                 return
1408             for is_e in interaction_statistic:
1409                 if not isinstance(is_e, dict):
1410                     continue
1411                 if is_e.get('@type') != 'InteractionCounter':
1412                     continue
1413                 interaction_type = extract_interaction_type(is_e)
1414                 if not interaction_type:
1415                     continue
1416                 # For interaction count some sites provide string instead of
1417                 # an integer (as per spec) with non digit characters (e.g. ",")
1418                 # so extracting count with more relaxed str_to_int
1419                 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1420                 if interaction_count is None:
1421                     continue
1422                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1423                 if not count_kind:
1424                     continue
1425                 count_key = '%s_count' % count_kind
1426                 if info.get(count_key) is not None:
1427                     continue
1428                 info[count_key] = interaction_count
1429
1430         def extract_video_object(e):
1431             assert e['@type'] == 'VideoObject'
1432             author = e.get('author')
1433             info.update({
1434                 'url': url_or_none(e.get('contentUrl')),
1435                 'title': unescapeHTML(e.get('name')),
1436                 'description': unescapeHTML(e.get('description')),
1437                 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
1438                 'duration': parse_duration(e.get('duration')),
1439                 'timestamp': unified_timestamp(e.get('uploadDate')),
1440                 # author can be an instance of 'Organization' or 'Person' types.
1441                 # both types can have 'name' property(inherited from 'Thing' type). [1]
1442                 # however some websites are using 'Text' type instead.
1443                 # 1. https://schema.org/VideoObject
1444                 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
1445                 'filesize': float_or_none(e.get('contentSize')),
1446                 'tbr': int_or_none(e.get('bitrate')),
1447                 'width': int_or_none(e.get('width')),
1448                 'height': int_or_none(e.get('height')),
1449                 'view_count': int_or_none(e.get('interactionCount')),
1450             })
1451             extract_interaction_statistic(e)
1452
1453         for e in json_ld:
1454             if '@context' in e:
1455                 item_type = e.get('@type')
1456                 if expected_type is not None and expected_type != item_type:
1457                     continue
1458                 rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
1459                 if rating is not None:
1460                     info['average_rating'] = rating
1461                 if item_type in ('TVEpisode', 'Episode'):
1462                     episode_name = unescapeHTML(e.get('name'))
1463                     info.update({
1464                         'episode': episode_name,
1465                         'episode_number': int_or_none(e.get('episodeNumber')),
1466                         'description': unescapeHTML(e.get('description')),
1467                     })
1468                     if not info.get('title') and episode_name:
1469                         info['title'] = episode_name
1470                     part_of_season = e.get('partOfSeason')
1471                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1472                         info.update({
1473                             'season': unescapeHTML(part_of_season.get('name')),
1474                             'season_number': int_or_none(part_of_season.get('seasonNumber')),
1475                         })
1476                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1477                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1478                         info['series'] = unescapeHTML(part_of_series.get('name'))
1479                 elif item_type == 'Movie':
1480                     info.update({
1481                         'title': unescapeHTML(e.get('name')),
1482                         'description': unescapeHTML(e.get('description')),
1483                         'duration': parse_duration(e.get('duration')),
1484                         'timestamp': unified_timestamp(e.get('dateCreated')),
1485                     })
1486                 elif item_type in ('Article', 'NewsArticle'):
1487                     info.update({
1488                         'timestamp': parse_iso8601(e.get('datePublished')),
1489                         'title': unescapeHTML(e.get('headline')),
1490                         'description': unescapeHTML(e.get('articleBody')),
1491                     })
1492                 elif item_type == 'VideoObject':
1493                     extract_video_object(e)
1494                     if expected_type is None:
1495                         continue
1496                     else:
1497                         break
1498                 video = e.get('video')
1499                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1500                     extract_video_object(video)
1501                 if expected_type is None:
1502                     continue
1503                 else:
1504                     break
1505         return dict((k, v) for k, v in info.items() if v is not None)
1506
1507     def _search_nextjs_data(self, webpage, video_id, **kw):
1508         return self._parse_json(
1509             self._search_regex(
1510                 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1511                 webpage, 'next.js data', **kw),
1512             video_id, **kw)
1513
1514     @staticmethod
1515     def _hidden_inputs(html):
1516         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1517         hidden_inputs = {}
1518         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1519             attrs = extract_attributes(input)
1520             if not input:
1521                 continue
1522             if attrs.get('type') not in ('hidden', 'submit'):
1523                 continue
1524             name = attrs.get('name') or attrs.get('id')
1525             value = attrs.get('value')
1526             if name and value is not None:
1527                 hidden_inputs[name] = value
1528         return hidden_inputs
1529
1530     def _form_hidden_inputs(self, form_id, html):
1531         form = self._search_regex(
1532             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1533             html, '%s form' % form_id, group='form')
1534         return self._hidden_inputs(form)
1535
    class FormatSort:
        """
        Helper that turns format sorting specifications into sort keys

        evaluate_params() resolves the forced, priority, user-given,
        extractor-given and default sort items into self._order, and
        calculate_preference() returns, for a format dict, a tuple with one
        entry per field of that order.

        NOTE: `settings` is a class attribute which the methods of this class
        update in place (cached defaults, resolved limits), so that state is
        not kept per instance.
        """
        # Syntax of one sort item: optional '+' (reverse), the field name and
        # optionally a ':' or '~' separator followed by a limit
        regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

        # Sort items that evaluate_params() always appends after the user's
        # and the extractor's items
        default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
                   'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
                   'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
        # Not referenced by the methods of this class
        # NOTE(review): presumably the youtube-dl compatible order, selected by callers - confirm
        ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                        'height', 'width', 'proto', 'vext', 'abr', 'aext',
                        'fps', 'fs_approx', 'source', 'id')

        # Per-field configuration, read through _get_field_setting():
        #   type         'field' (default), 'ordered', 'boolean', 'extractor',
        #                'multiple', 'combined' or 'alias'
        #   field        key of the format dict to read, tuple of sort fields
        #                for 'combined'/'multiple', or alias target for 'alias'
        #   convert      how a limit/value is converted in _resolve_field_value()
        #   order        ranking for 'ordered' fields, best first; the entries
        #                are regexes if 'regex' is set. '' marks the position
        #                given to values that are not listed
        #   order_free   ranking used instead of 'order' when prefer_free_formats is set
        #   forced, priority   whether evaluate_params() puts the field before
        #                the user's items (priority only without format_sort_force)
        #   visible      whether print_verbose_info() lists the field
        #   function     combines the values of a 'multiple' field
        #   default, max, in_list, not_in_list, same_limit
        #                see _calculate_field_preference_from_value() and evaluate_params()
        settings = {
            'vcodec': {'type': 'ordered', 'regex': True,
                       'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
            'acodec': {'type': 'ordered', 'regex': True,
                       'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
            'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                    'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
            'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                      'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
            'vext': {'type': 'ordered', 'field': 'video_ext',
                     'order': ('mp4', 'webm', 'flv', '', 'none'),
                     'order_free': ('webm', 'mp4', 'flv', '', 'none')},
            'aext': {'type': 'ordered', 'field': 'audio_ext',
                     'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
                     'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
            'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
            'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                           'field': ('vcodec', 'acodec'),
                           'function': lambda it: int(any(v != 'none' for v in it))},
            'ie_pref': {'priority': True, 'type': 'extractor'},
            'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
            'quality': {'convert': 'float', 'default': -1},
            'filesize': {'convert': 'bytes'},
            'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
            'id': {'convert': 'string', 'field': 'format_id'},
            'height': {'convert': 'float_none'},
            'width': {'convert': 'float_none'},
            'fps': {'convert': 'float_none'},
            'tbr': {'convert': 'float_none'},
            'vbr': {'convert': 'float_none'},
            'abr': {'convert': 'float_none'},
            'asr': {'convert': 'float_none'},
            'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

            # 'combined' items expand to one sort item per listed field;
            # 'res' is the smallest non-empty value of height and width
            'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
            'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
            'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
            'ext': {'type': 'combined', 'field': ('vext', 'aext')},
            'res': {'type': 'multiple', 'field': ('height', 'width'),
                    'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

            # Deprecated
            'dimension': {'type': 'alias', 'field': 'res'},
            'resolution': {'type': 'alias', 'field': 'res'},
            'extension': {'type': 'alias', 'field': 'ext'},
            'bitrate': {'type': 'alias', 'field': 'br'},
            'total_bitrate': {'type': 'alias', 'field': 'tbr'},
            'video_bitrate': {'type': 'alias', 'field': 'vbr'},
            'audio_bitrate': {'type': 'alias', 'field': 'abr'},
            'framerate': {'type': 'alias', 'field': 'fps'},
            'language_preference': {'type': 'alias', 'field': 'lang'},
            'protocol': {'type': 'alias', 'field': 'proto'},
            'source_preference': {'type': 'alias', 'field': 'source'},
            'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
            'filesize_estimate': {'type': 'alias', 'field': 'size'},
            'samplerate': {'type': 'alias', 'field': 'asr'},
            'video_ext': {'type': 'alias', 'field': 'vext'},
            'audio_ext': {'type': 'alias', 'field': 'aext'},
            'video_codec': {'type': 'alias', 'field': 'vcodec'},
            'audio_codec': {'type': 'alias', 'field': 'acodec'},
            'video': {'type': 'alias', 'field': 'hasvid'},
            'has_video': {'type': 'alias', 'field': 'hasvid'},
            'audio': {'type': 'alias', 'field': 'hasaud'},
            'has_audio': {'type': 'alias', 'field': 'hasaud'},
            'extractor': {'type': 'alias', 'field': 'ie_pref'},
            'preference': {'type': 'alias', 'field': 'ie_pref'},
            'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
            'format_id': {'type': 'alias', 'field': 'id'},
        }
1617
1618         def __init__(self, ie, field_preference):
1619             self._order = []
1620             self.ydl = ie._downloader
1621             self.evaluate_params(self.ydl.params, field_preference)
1622             if ie.get_param('verbose'):
1623                 self.print_verbose_info(self.ydl.write_debug)
1624
1625         def _get_field_setting(self, field, key):
1626             if field not in self.settings:
1627                 if key in ('forced', 'priority'):
1628                     return False
1629                 self.ydl.deprecation_warning(
1630                     f'Using arbitrary fields ({field}) for format sorting is deprecated '
1631                     'and may be removed in a future version')
1632                 self.settings[field] = {}
1633             propObj = self.settings[field]
1634             if key not in propObj:
1635                 type = propObj.get('type')
1636                 if key == 'field':
1637                     default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1638                 elif key == 'convert':
1639                     default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1640                 else:
1641                     default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1642                 propObj[key] = default
1643             return propObj[key]
1644
1645         def _resolve_field_value(self, field, value, convertNone=False):
1646             if value is None:
1647                 if not convertNone:
1648                     return None
1649             else:
1650                 value = value.lower()
1651             conversion = self._get_field_setting(field, 'convert')
1652             if conversion == 'ignore':
1653                 return None
1654             if conversion == 'string':
1655                 return value
1656             elif conversion == 'float_none':
1657                 return float_or_none(value)
1658             elif conversion == 'bytes':
1659                 return FileDownloader.parse_bytes(value)
1660             elif conversion == 'order':
1661                 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1662                 use_regex = self._get_field_setting(field, 'regex')
1663                 list_length = len(order_list)
1664                 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1665                 if use_regex and value is not None:
1666                     for i, regex in enumerate(order_list):
1667                         if regex and re.match(regex, value):
1668                             return list_length - i
1669                     return list_length - empty_pos  # not in list
1670                 else:  # not regex or  value = None
1671                     return list_length - (order_list.index(value) if value in order_list else empty_pos)
1672             else:
1673                 if value.isnumeric():
1674                     return float(value)
1675                 else:
1676                     self.settings[field]['convert'] = 'string'
1677                     return value
1678
        def evaluate_params(self, params, sort_extractor):
            """
            Build the sort order (self._order) from the given options

            @param params          Options dict; 'prefer_free_formats',
                                   'format_sort' and 'format_sort_force' are read
            @param sort_extractor  Sort items given by the extractor

            Raises ExtractorError for an item that does not match self.regex.
            The reverse/closest/limit data of each field is stored in
            self.settings.
            """
            self._use_free_order = params.get('prefer_free_formats', False)
            self._sort_user = params.get('format_sort', [])
            self._sort_extractor = sort_extractor

            def add_item(field, reverse, closest, limit_text):
                # Register a single field; only its first occurrence counts
                field = field.lower()
                if field in self._order:
                    return
                self._order.append(field)
                limit = self._resolve_field_value(field, limit_text)
                data = {
                    'reverse': reverse,
                    # "closest" is meaningless without a limit
                    'closest': False if limit is None else closest,
                    'limit_text': limit_text,
                    'limit': limit}
                if field in self.settings:
                    self.settings[field].update(data)
                else:
                    self.settings[field] = data

            # Precedence: forced fields, priority fields (unless
            # format_sort_force is set), user's items, extractor's items and
            # finally the defaults. Since add_item ignores repeated fields,
            # earlier items win
            sort_list = (
                tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
                + (tuple() if params.get('format_sort_force', False)
                   else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
                + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

            for item in sort_list:
                match = re.match(self.regex, item)
                if match is None:
                    raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
                field = match.group('field')
                # The regex also matches blank items, which have no field
                if field is None:
                    continue
                if self._get_field_setting(field, 'type') == 'alias':
                    alias, field = field, self._get_field_setting(field, 'field')
                    self.ydl.deprecation_warning(
                        f'Format sorting alias {alias} is deprecated '
                        f'and may be removed in a future version. Please use {field} instead')
                reverse = match.group('reverse') is not None
                closest = match.group('separator') == '~'
                limit_text = match.group('limit')

                has_limit = limit_text is not None
                has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
                # A combined field without 'same_limit' takes one ':'-separated limit per sub-field
                has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

                fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
                limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
                limit_count = len(limits)
                for (i, f) in enumerate(fields):
                    # Sub-fields beyond the given limits share the single
                    # limit ('same_limit' case) or get no limit at all
                    add_item(f, reverse, closest,
                             limits[i] if i < limit_count
                             else limits[0] if has_limit and not has_multiple_limits
                             else None)
1734
1735         def print_verbose_info(self, write_debug):
1736             if self._sort_user:
1737                 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1738             if self._sort_extractor:
1739                 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1740             write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1741                 '+' if self._get_field_setting(field, 'reverse') else '', field,
1742                 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1743                               self._get_field_setting(field, 'limit_text'),
1744                               self._get_field_setting(field, 'limit'))
1745                 if self._get_field_setting(field, 'limit_text') is not None else '')
1746                 for field in self._order if self._get_field_setting(field, 'visible')]))
1747
        def _calculate_field_preference_from_value(self, format, field, type, value):
            """
            Turn the value of a sort field into a tuple used as sort key

            @param format  The format dict (not used here)
            @param field   Name of the sort field
            @param type    Type of the field ('extractor', 'boolean',
                           'ordered' or anything else for a plain value)
            @param value   The field's value for the format
            """
            reverse = self._get_field_setting(field, 'reverse')
            closest = self._get_field_setting(field, 'closest')
            limit = self._get_field_setting(field, 'limit')

            if type == 'extractor':
                # A missing value, or one that reaches the field's 'max' (if set), counts as -1
                maximum = self._get_field_setting(field, 'max')
                if value is None or (maximum is not None and value >= maximum):
                    value = -1
            elif type == 'boolean':
                # 0 if the value passes both the in_list and the not_in_list check, else -1
                in_list = self._get_field_setting(field, 'in_list')
                not_in_list = self._get_field_setting(field, 'not_in_list')
                value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
            elif type == 'ordered':
                # Replace the value by its rank in the field's order list
                value = self._resolve_field_value(field, value, True)

            # try to convert to number
            val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
            is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
            if is_num:
                value = val_num

            # The first element groups the results: -10 for missing values,
            # 1 for strings, 0 for numbers within the limit and -1 for
            # numbers which are on the wrong side of the limit.
            # With "closest", the distance to the limit decides
            return ((-10, 0) if value is None
                    else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
                    else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
                    else (0, value, 0) if not reverse and (limit is None or value <= limit)
                    else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
                    else (-1, value, 0))
1776
1777         def _calculate_field_preference(self, format, field):
1778             type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
1779             get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1780             if type == 'multiple':
1781                 type = 'field'  # Only 'field' is allowed in multiple for now
1782                 actual_fields = self._get_field_setting(field, 'field')
1783
1784                 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1785             else:
1786                 value = get_value(field)
1787             return self._calculate_field_preference_from_value(format, field, type, value)
1788
1789         def calculate_preference(self, format):
1790             # Determine missing protocol
1791             if not format.get('protocol'):
1792                 format['protocol'] = determine_protocol(format)
1793
1794             # Determine missing ext
1795             if not format.get('ext') and 'url' in format:
1796                 format['ext'] = determine_ext(format['url'])
1797             if format.get('vcodec') == 'none':
1798                 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1799                 format['video_ext'] = 'none'
1800             else:
1801                 format['video_ext'] = format['ext']
1802                 format['audio_ext'] = 'none'
1803             # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
1804             #    format['preference'] = -1000
1805
1806             # Determine missing bitrates
1807             if format.get('tbr') is None:
1808                 if format.get('vbr') is not None and format.get('abr') is not None:
1809                     format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1810             else:
1811                 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1812                     format['vbr'] = format.get('tbr') - format.get('abr', 0)
1813                 if format.get('acodec') != 'none' and format.get('abr') is None:
1814                     format['abr'] = format.get('tbr') - format.get('vbr', 0)
1815
1816             return tuple(self._calculate_field_preference(format, field) for field in self._order)
1817
1818     def _sort_formats(self, formats, field_preference=[]):
1819         if not formats:
1820             return
1821         format_sort = self.FormatSort(self, field_preference)
1822         formats.sort(key=lambda f: format_sort.calculate_preference(f))
1823
1824     def _check_formats(self, formats, video_id):
1825         if formats:
1826             formats[:] = filter(
1827                 lambda f: self._is_valid_url(
1828                     f['url'], video_id,
1829                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1830                 formats)
1831
1832     @staticmethod
1833     def _remove_duplicate_formats(formats):
1834         format_urls = set()
1835         unique_formats = []
1836         for f in formats:
1837             if f['url'] not in format_urls:
1838                 format_urls.add(f['url'])
1839                 unique_formats.append(f)
1840         formats[:] = unique_formats
1841
1842     def _is_valid_url(self, url, video_id, item='video', headers={}):
1843         url = self._proto_relative_url(url, scheme='http:')
1844         # For now assume non HTTP(S) URLs always valid
1845         if not (url.startswith('http://') or url.startswith('https://')):
1846             return True
1847         try:
1848             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1849             return True
1850         except ExtractorError as e:
1851             self.to_screen(
1852                 '%s: %s URL is invalid, skipping: %s'
1853                 % (video_id, item, error_to_compat_str(e.cause)))
1854             return False
1855
1856     def http_scheme(self):
1857         """ Either "http:" or "https:", depending on the user's preferences """
1858         return (
1859             'http:'
1860             if self.get_param('prefer_insecure', False)
1861             else 'https:')
1862
1863     def _proto_relative_url(self, url, scheme=None):
1864         if url is None:
1865             return url
1866         if url.startswith('//'):
1867             if scheme is None:
1868                 scheme = self.http_scheme()
1869             return scheme + url
1870         else:
1871             return url
1872
1873     def _sleep(self, timeout, video_id, msg_template=None):
1874         if msg_template is None:
1875             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1876         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1877         self.to_screen(msg)
1878         time.sleep(timeout)
1879
1880     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1881                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1882                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1883         manifest = self._download_xml(
1884             manifest_url, video_id, 'Downloading f4m manifest',
1885             'Unable to download f4m manifest',
1886             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1887             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1888             transform_source=transform_source,
1889             fatal=fatal, data=data, headers=headers, query=query)
1890
1891         if manifest is False:
1892             return []
1893
1894         return self._parse_f4m_formats(
1895             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1896             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1897
1898     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1899                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1900                            fatal=True, m3u8_id=None):
1901         if not isinstance(manifest, compat_etree_Element) and not fatal:
1902             return []
1903
1904         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1905         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1906         if akamai_pv is not None and ';' in akamai_pv.text:
1907             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1908             if playerVerificationChallenge.strip() != '':
1909                 return []
1910
1911         formats = []
1912         manifest_version = '1.0'
1913         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1914         if not media_nodes:
1915             manifest_version = '2.0'
1916             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1917         # Remove unsupported DRM protected media from final formats
1918         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1919         media_nodes = remove_encrypted_media(media_nodes)
1920         if not media_nodes:
1921             return formats
1922
1923         manifest_base_url = get_base_url(manifest)
1924
1925         bootstrap_info = xpath_element(
1926             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1927             'bootstrap info', default=None)
1928
1929         vcodec = None
1930         mime_type = xpath_text(
1931             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1932             'base URL', default=None)
1933         if mime_type and mime_type.startswith('audio/'):
1934             vcodec = 'none'
1935
1936         for i, media_el in enumerate(media_nodes):
1937             tbr = int_or_none(media_el.attrib.get('bitrate'))
1938             width = int_or_none(media_el.attrib.get('width'))
1939             height = int_or_none(media_el.attrib.get('height'))
1940             format_id = join_nonempty(f4m_id, tbr or i)
1941             # If <bootstrapInfo> is present, the specified f4m is a
1942             # stream-level manifest, and only set-level manifests may refer to
1943             # external resources.  See section 11.4 and section 4 of F4M spec
1944             if bootstrap_info is None:
1945                 media_url = None
1946                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1947                 if manifest_version == '2.0':
1948                     media_url = media_el.attrib.get('href')
1949                 if media_url is None:
1950                     media_url = media_el.attrib.get('url')
1951                 if not media_url:
1952                     continue
1953                 manifest_url = (
1954                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1955                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1956                 # If media_url is itself a f4m manifest do the recursive extraction
1957                 # since bitrates in parent manifest (this one) and media_url manifest
1958                 # may differ leading to inability to resolve the format by requested
1959                 # bitrate in f4m downloader
1960                 ext = determine_ext(manifest_url)
1961                 if ext == 'f4m':
1962                     f4m_formats = self._extract_f4m_formats(
1963                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1964                         transform_source=transform_source, fatal=fatal)
1965                     # Sometimes stream-level manifest contains single media entry that
1966                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1967                     # At the same time parent's media entry in set-level manifest may
1968                     # contain it. We will copy it from parent in such cases.
1969                     if len(f4m_formats) == 1:
1970                         f = f4m_formats[0]
1971                         f.update({
1972                             'tbr': f.get('tbr') or tbr,
1973                             'width': f.get('width') or width,
1974                             'height': f.get('height') or height,
1975                             'format_id': f.get('format_id') if not tbr else format_id,
1976                             'vcodec': vcodec,
1977                         })
1978                     formats.extend(f4m_formats)
1979                     continue
1980                 elif ext == 'm3u8':
1981                     formats.extend(self._extract_m3u8_formats(
1982                         manifest_url, video_id, 'mp4', preference=preference,
1983                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1984                     continue
1985             formats.append({
1986                 'format_id': format_id,
1987                 'url': manifest_url,
1988                 'manifest_url': manifest_url,
1989                 'ext': 'flv' if bootstrap_info is not None else None,
1990                 'protocol': 'f4m',
1991                 'tbr': tbr,
1992                 'width': width,
1993                 'height': height,
1994                 'vcodec': vcodec,
1995                 'preference': preference,
1996                 'quality': quality,
1997             })
1998         return formats
1999
2000     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
2001         return {
2002             'format_id': join_nonempty(m3u8_id, 'meta'),
2003             'url': m3u8_url,
2004             'ext': ext,
2005             'protocol': 'm3u8',
2006             'preference': preference - 100 if preference else -100,
2007             'quality': quality,
2008             'resolution': 'multiple',
2009             'format_note': 'Quality selection URL',
2010         }
2011
2012     def _report_ignoring_subs(self, name):
2013         self.report_warning(bug_reports_message(
2014             f'Ignoring subtitle tracks found in the {name} manifest; '
2015             'if any subtitle tracks are missing,'
2016         ), only_once=True)
2017
2018     def _extract_m3u8_formats(self, *args, **kwargs):
2019         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2020         if subs:
2021             self._report_ignoring_subs('HLS')
2022         return fmts
2023
2024     def _extract_m3u8_formats_and_subtitles(
2025             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2026             preference=None, quality=None, m3u8_id=None, note=None,
2027             errnote=None, fatal=True, live=False, data=None, headers={},
2028             query={}):
2029
2030         res = self._download_webpage_handle(
2031             m3u8_url, video_id,
2032             note='Downloading m3u8 information' if note is None else note,
2033             errnote='Failed to download m3u8 information' if errnote is None else errnote,
2034             fatal=fatal, data=data, headers=headers, query=query)
2035
2036         if res is False:
2037             return [], {}
2038
2039         m3u8_doc, urlh = res
2040         m3u8_url = urlh.geturl()
2041
2042         return self._parse_m3u8_formats_and_subtitles(
2043             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2044             preference=preference, quality=quality, m3u8_id=m3u8_id,
2045             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2046             headers=headers, query=query, video_id=video_id)
2047
2048     def _parse_m3u8_formats_and_subtitles(
2049             self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8_native',
2050             preference=None, quality=None, m3u8_id=None, live=False, note=None,
2051             errnote=None, fatal=True, data=None, headers={}, query={},
2052             video_id=None):
2053         formats, subtitles = [], {}
2054
2055         has_drm = re.search('|'.join([
2056             r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
2057             r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
2058         ]), m3u8_doc)
2059
2060         def format_url(url):
2061             return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
2062
2063         if self.get_param('hls_split_discontinuity', False):
2064             def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2065                 if not m3u8_doc:
2066                     if not manifest_url:
2067                         return []
2068                     m3u8_doc = self._download_webpage(
2069                         manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2070                         note=False, errnote='Failed to download m3u8 playlist information')
2071                     if m3u8_doc is False:
2072                         return []
2073                 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2074
2075         else:
2076             def _extract_m3u8_playlist_indices(*args, **kwargs):
2077                 return [None]
2078
2079         # References:
2080         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2081         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2082         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2083
2084         # We should try extracting formats only from master playlists [1, 4.3.4],
2085         # i.e. playlists that describe available qualities. On the other hand
2086         # media playlists [1, 4.3.3] should be returned as is since they contain
2087         # just the media without qualities renditions.
2088         # Fortunately, master playlist can be easily distinguished from media
2089         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
2090         # master playlist tags MUST NOT appear in a media playlist and vice versa.
2091         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2092         # media playlist and MUST NOT appear in master playlist thus we can
2093         # clearly detect media playlist with this criterion.
2094
2095         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
2096             formats = [{
2097                 'format_id': join_nonempty(m3u8_id, idx),
2098                 'format_index': idx,
2099                 'url': m3u8_url,
2100                 'ext': ext,
2101                 'protocol': entry_protocol,
2102                 'preference': preference,
2103                 'quality': quality,
2104                 'has_drm': has_drm,
2105             } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2106
2107             return formats, subtitles
2108
2109         groups = {}
2110         last_stream_inf = {}
2111
2112         def extract_media(x_media_line):
2113             media = parse_m3u8_attributes(x_media_line)
2114             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2115             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2116             if not (media_type and group_id and name):
2117                 return
2118             groups.setdefault(group_id, []).append(media)
2119             # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2120             if media_type == 'SUBTITLES':
2121                 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2122                 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2123                 # However, lack of URI has been spotted in the wild.
2124                 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2125                 if not media.get('URI'):
2126                     return
2127                 url = format_url(media['URI'])
2128                 sub_info = {
2129                     'url': url,
2130                     'ext': determine_ext(url),
2131                 }
2132                 if sub_info['ext'] == 'm3u8':
2133                     # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2134                     # files may contain is WebVTT:
2135                     # <https://tools.ietf.org/html/rfc8216#section-3.1>
2136                     sub_info['ext'] = 'vtt'
2137                     sub_info['protocol'] = 'm3u8_native'
2138                 lang = media.get('LANGUAGE') or 'und'
2139                 subtitles.setdefault(lang, []).append(sub_info)
2140             if media_type not in ('VIDEO', 'AUDIO'):
2141                 return
2142             media_url = media.get('URI')
2143             if media_url:
2144                 manifest_url = format_url(media_url)
2145                 formats.extend({
2146                     'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2147                     'format_note': name,
2148                     'format_index': idx,
2149                     'url': manifest_url,
2150                     'manifest_url': m3u8_url,
2151                     'language': media.get('LANGUAGE'),
2152                     'ext': ext,
2153                     'protocol': entry_protocol,
2154                     'preference': preference,
2155                     'quality': quality,
2156                     'vcodec': 'none' if media_type == 'AUDIO' else None,
2157                 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2158
2159         def build_stream_name():
2160             # Despite specification does not mention NAME attribute for
2161             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2162             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2163             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2164             stream_name = last_stream_inf.get('NAME')
2165             if stream_name:
2166                 return stream_name
2167             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2168             # from corresponding rendition group
2169             stream_group_id = last_stream_inf.get('VIDEO')
2170             if not stream_group_id:
2171                 return
2172             stream_group = groups.get(stream_group_id)
2173             if not stream_group:
2174                 return stream_group_id
2175             rendition = stream_group[0]
2176             return rendition.get('NAME') or stream_group_id
2177
2178         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2179         # chance to detect video only formats when EXT-X-STREAM-INF tags
2180         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2181         for line in m3u8_doc.splitlines():
2182             if line.startswith('#EXT-X-MEDIA:'):
2183                 extract_media(line)
2184
2185         for line in m3u8_doc.splitlines():
2186             if line.startswith('#EXT-X-STREAM-INF:'):
2187                 last_stream_inf = parse_m3u8_attributes(line)
2188             elif line.startswith('#') or not line.strip():
2189                 continue
2190             else:
2191                 tbr = float_or_none(
2192                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2193                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2194                 manifest_url = format_url(line.strip())
2195
2196                 for idx in _extract_m3u8_playlist_indices(manifest_url):
2197                     format_id = [m3u8_id, None, idx]
2198                     # Bandwidth of live streams may differ over time thus making
2199                     # format_id unpredictable. So it's better to keep provided
2200                     # format_id intact.
2201                     if not live:
2202                         stream_name = build_stream_name()
2203                         format_id[1] = stream_name or '%d' % (tbr or len(formats))
2204                     f = {
2205                         'format_id': join_nonempty(*format_id),
2206                         'format_index': idx,
2207                         'url': manifest_url,
2208                         'manifest_url': m3u8_url,
2209                         'tbr': tbr,
2210                         'ext': ext,
2211                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2212                         'protocol': entry_protocol,
2213                         'preference': preference,
2214                         'quality': quality,
2215                     }
2216                     resolution = last_stream_inf.get('RESOLUTION')
2217                     if resolution:
2218                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2219                         if mobj:
2220                             f['width'] = int(mobj.group('width'))
2221                             f['height'] = int(mobj.group('height'))
2222                     # Unified Streaming Platform
2223                     mobj = re.search(
2224                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2225                     if mobj:
2226                         abr, vbr = mobj.groups()
2227                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2228                         f.update({
2229                             'vbr': vbr,
2230                             'abr': abr,
2231                         })
2232                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2233                     f.update(codecs)
2234                     audio_group_id = last_stream_inf.get('AUDIO')
2235                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2236                     # references a rendition group MUST have a CODECS attribute.
2237                     # However, this is not always respected, for example, [2]
2238                     # contains EXT-X-STREAM-INF tag which references AUDIO
2239                     # rendition group but does not have CODECS and despite
2240                     # referencing an audio group it represents a complete
2241                     # (with audio and video) format. So, for such cases we will
2242                     # ignore references to rendition groups and treat them
2243                     # as complete formats.
2244                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2245                         audio_group = groups.get(audio_group_id)
2246                         if audio_group and audio_group[0].get('URI'):
2247                             # TODO: update acodec for audio only formats with
2248                             # the same GROUP-ID
2249                             f['acodec'] = 'none'
2250                     if not f.get('ext'):
2251                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2252                     formats.append(f)
2253
2254                     # for DailyMotion
2255                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2256                     if progressive_uri:
2257                         http_f = f.copy()
2258                         del http_f['manifest_url']
2259                         http_f.update({
2260                             'format_id': f['format_id'].replace('hls-', 'http-'),
2261                             'protocol': 'http',
2262                             'url': progressive_uri,
2263                         })
2264                         formats.append(http_f)
2265
2266                 last_stream_inf = {}
2267         return formats, subtitles
2268
2269     def _extract_m3u8_vod_duration(
2270             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2271
2272         m3u8_vod = self._download_webpage(
2273             m3u8_vod_url, video_id,
2274             note='Downloading m3u8 VOD manifest' if note is None else note,
2275             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2276             fatal=False, data=data, headers=headers, query=query)
2277
2278         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2279
2280     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2281         if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2282             return None
2283
2284         return int(sum(
2285             float(line[len('#EXTINF:'):].split(',')[0])
2286             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2287
2288     @staticmethod
2289     def _xpath_ns(path, namespace=None):
2290         if not namespace:
2291             return path
2292         out = []
2293         for c in path.split('/'):
2294             if not c or c == '.':
2295                 out.append(c)
2296             else:
2297                 out.append('{%s}%s' % (namespace, c))
2298         return '/'.join(out)
2299
2300     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2301         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2302
2303         if smil is False:
2304             assert not fatal
2305             return []
2306
2307         namespace = self._parse_smil_namespace(smil)
2308
2309         fmts = self._parse_smil_formats(
2310             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2311         subs = self._parse_smil_subtitles(
2312             smil, namespace=namespace)
2313
2314         return fmts, subs
2315
2316     def _extract_smil_formats(self, *args, **kwargs):
2317         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2318         if subs:
2319             self._report_ignoring_subs('SMIL')
2320         return fmts
2321
2322     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2323         smil = self._download_smil(smil_url, video_id, fatal=fatal)
2324         if smil is False:
2325             return {}
2326         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2327
2328     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2329         return self._download_xml(
2330             smil_url, video_id, 'Downloading SMIL file',
2331             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2332
2333     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2334         namespace = self._parse_smil_namespace(smil)
2335
2336         formats = self._parse_smil_formats(
2337             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2338         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2339
2340         video_id = os.path.splitext(url_basename(smil_url))[0]
2341         title = None
2342         description = None
2343         upload_date = None
2344         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2345             name = meta.attrib.get('name')
2346             content = meta.attrib.get('content')
2347             if not name or not content:
2348                 continue
2349             if not title and name == 'title':
2350                 title = content
2351             elif not description and name in ('description', 'abstract'):
2352                 description = content
2353             elif not upload_date and name == 'date':
2354                 upload_date = unified_strdate(content)
2355
2356         thumbnails = [{
2357             'id': image.get('type'),
2358             'url': image.get('src'),
2359             'width': int_or_none(image.get('width')),
2360             'height': int_or_none(image.get('height')),
2361         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2362
2363         return {
2364             'id': video_id,
2365             'title': title or video_id,
2366             'description': description,
2367             'upload_date': upload_date,
2368             'thumbnails': thumbnails,
2369             'formats': formats,
2370             'subtitles': subtitles,
2371         }
2372
2373     def _parse_smil_namespace(self, smil):
2374         return self._search_regex(
2375             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2376
2377     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2378         base = smil_url
2379         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2380             b = meta.get('base') or meta.get('httpBase')
2381             if b:
2382                 base = b
2383                 break
2384
2385         formats = []
2386         rtmp_count = 0
2387         http_count = 0
2388         m3u8_count = 0
2389         imgs_count = 0
2390
2391         srcs = set()
2392         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2393         for medium in media:
2394             src = medium.get('src')
2395             if not src or src in srcs:
2396                 continue
2397             srcs.add(src)
2398
2399             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2400             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2401             width = int_or_none(medium.get('width'))
2402             height = int_or_none(medium.get('height'))
2403             proto = medium.get('proto')
2404             ext = medium.get('ext')
2405             src_ext = determine_ext(src)
2406             streamer = medium.get('streamer') or base
2407
2408             if proto == 'rtmp' or streamer.startswith('rtmp'):
2409                 rtmp_count += 1
2410                 formats.append({
2411                     'url': streamer,
2412                     'play_path': src,
2413                     'ext': 'flv',
2414                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2415                     'tbr': bitrate,
2416                     'filesize': filesize,
2417                     'width': width,
2418                     'height': height,
2419                 })
2420                 if transform_rtmp_url:
2421                     streamer, src = transform_rtmp_url(streamer, src)
2422                     formats[-1].update({
2423                         'url': streamer,
2424                         'play_path': src,
2425                     })
2426                 continue
2427
2428             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2429             src_url = src_url.strip()
2430
2431             if proto == 'm3u8' or src_ext == 'm3u8':
2432                 m3u8_formats = self._extract_m3u8_formats(
2433                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2434                 if len(m3u8_formats) == 1:
2435                     m3u8_count += 1
2436                     m3u8_formats[0].update({
2437                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2438                         'tbr': bitrate,
2439                         'width': width,
2440                         'height': height,
2441                     })
2442                 formats.extend(m3u8_formats)
2443             elif src_ext == 'f4m':
2444                 f4m_url = src_url
2445                 if not f4m_params:
2446                     f4m_params = {
2447                         'hdcore': '3.2.0',
2448                         'plugin': 'flowplayer-3.2.0.1',
2449                     }
2450                 f4m_url += '&' if '?' in f4m_url else '?'
2451                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2452                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2453             elif src_ext == 'mpd':
2454                 formats.extend(self._extract_mpd_formats(
2455                     src_url, video_id, mpd_id='dash', fatal=False))
2456             elif re.search(r'\.ism/[Mm]anifest', src_url):
2457                 formats.extend(self._extract_ism_formats(
2458                     src_url, video_id, ism_id='mss', fatal=False))
2459             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2460                 http_count += 1
2461                 formats.append({
2462                     'url': src_url,
2463                     'ext': ext or src_ext or 'flv',
2464                     'format_id': 'http-%d' % (bitrate or http_count),
2465                     'tbr': bitrate,
2466                     'filesize': filesize,
2467                     'width': width,
2468                     'height': height,
2469                 })
2470
2471         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2472             src = medium.get('src')
2473             if not src or src in srcs:
2474                 continue
2475             srcs.add(src)
2476
2477             imgs_count += 1
2478             formats.append({
2479                 'format_id': 'imagestream-%d' % (imgs_count),
2480                 'url': src,
2481                 'ext': mimetype2ext(medium.get('type')),
2482                 'acodec': 'none',
2483                 'vcodec': 'none',
2484                 'width': int_or_none(medium.get('width')),
2485                 'height': int_or_none(medium.get('height')),
2486                 'format_note': 'SMIL storyboards',
2487             })
2488
2489         return formats
2490
2491     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2492         urls = []
2493         subtitles = {}
2494         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2495             src = textstream.get('src')
2496             if not src or src in urls:
2497                 continue
2498             urls.append(src)
2499             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2500             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2501             subtitles.setdefault(lang, []).append({
2502                 'url': src,
2503                 'ext': ext,
2504             })
2505         return subtitles
2506
2507     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2508         xspf = self._download_xml(
2509             xspf_url, playlist_id, 'Downloading xpsf playlist',
2510             'Unable to download xspf manifest', fatal=fatal)
2511         if xspf is False:
2512             return []
2513         return self._parse_xspf(
2514             xspf, playlist_id, xspf_url=xspf_url,
2515             xspf_base_url=base_url(xspf_url))
2516
2517     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2518         NS_MAP = {
2519             'xspf': 'http://xspf.org/ns/0/',
2520             's1': 'http://static.streamone.nl/player/ns/0',
2521         }
2522
2523         entries = []
2524         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2525             title = xpath_text(
2526                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2527             description = xpath_text(
2528                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2529             thumbnail = xpath_text(
2530                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2531             duration = float_or_none(
2532                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2533
2534             formats = []
2535             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2536                 format_url = urljoin(xspf_base_url, location.text)
2537                 if not format_url:
2538                     continue
2539                 formats.append({
2540                     'url': format_url,
2541                     'manifest_url': xspf_url,
2542                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2543                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2544                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2545                 })
2546             self._sort_formats(formats)
2547
2548             entries.append({
2549                 'id': playlist_id,
2550                 'title': title,
2551                 'description': description,
2552                 'thumbnail': thumbnail,
2553                 'duration': duration,
2554                 'formats': formats,
2555             })
2556         return entries
2557
2558     def _extract_mpd_formats(self, *args, **kwargs):
2559         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2560         if subs:
2561             self._report_ignoring_subs('DASH')
2562         return fmts
2563
2564     def _extract_mpd_formats_and_subtitles(
2565             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2566             fatal=True, data=None, headers={}, query={}):
2567         res = self._download_xml_handle(
2568             mpd_url, video_id,
2569             note='Downloading MPD manifest' if note is None else note,
2570             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2571             fatal=fatal, data=data, headers=headers, query=query)
2572         if res is False:
2573             return [], {}
2574         mpd_doc, urlh = res
2575         if mpd_doc is None:
2576             return [], {}
2577         mpd_base_url = base_url(urlh.geturl())
2578
2579         return self._parse_mpd_formats_and_subtitles(
2580             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2581
2582     def _parse_mpd_formats(self, *args, **kwargs):
2583         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2584         if subs:
2585             self._report_ignoring_subs('DASH')
2586         return fmts
2587
2588     def _parse_mpd_formats_and_subtitles(
2589             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2590         """
2591         Parse formats from MPD manifest.
2592         References:
2593          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2594             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2595          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2596         """
2597         if not self.get_param('dynamic_mpd', True):
2598             if mpd_doc.get('type') == 'dynamic':
2599                 return [], {}
2600
2601         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2602
2603         def _add_ns(path):
2604             return self._xpath_ns(path, namespace)
2605
2606         def is_drm_protected(element):
2607             return element.find(_add_ns('ContentProtection')) is not None
2608
2609         def extract_multisegment_info(element, ms_parent_info):
2610             ms_info = ms_parent_info.copy()
2611
2612             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2613             # common attributes and elements.  We will only extract relevant
2614             # for us.
2615             def extract_common(source):
2616                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2617                 if segment_timeline is not None:
2618                     s_e = segment_timeline.findall(_add_ns('S'))
2619                     if s_e:
2620                         ms_info['total_number'] = 0
2621                         ms_info['s'] = []
2622                         for s in s_e:
2623                             r = int(s.get('r', 0))
2624                             ms_info['total_number'] += 1 + r
2625                             ms_info['s'].append({
2626                                 't': int(s.get('t', 0)),
2627                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2628                                 'd': int(s.attrib['d']),
2629                                 'r': r,
2630                             })
2631                 start_number = source.get('startNumber')
2632                 if start_number:
2633                     ms_info['start_number'] = int(start_number)
2634                 timescale = source.get('timescale')
2635                 if timescale:
2636                     ms_info['timescale'] = int(timescale)
2637                 segment_duration = source.get('duration')
2638                 if segment_duration:
2639                     ms_info['segment_duration'] = float(segment_duration)
2640
2641             def extract_Initialization(source):
2642                 initialization = source.find(_add_ns('Initialization'))
2643                 if initialization is not None:
2644                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2645
2646             segment_list = element.find(_add_ns('SegmentList'))
2647             if segment_list is not None:
2648                 extract_common(segment_list)
2649                 extract_Initialization(segment_list)
2650                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2651                 if segment_urls_e:
2652                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2653             else:
2654                 segment_template = element.find(_add_ns('SegmentTemplate'))
2655                 if segment_template is not None:
2656                     extract_common(segment_template)
2657                     media = segment_template.get('media')
2658                     if media:
2659                         ms_info['media'] = media
2660                     initialization = segment_template.get('initialization')
2661                     if initialization:
2662                         ms_info['initialization'] = initialization
2663                     else:
2664                         extract_Initialization(segment_template)
2665             return ms_info
2666
2667         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2668         formats, subtitles = [], {}
2669         stream_numbers = collections.defaultdict(int)
2670         for period in mpd_doc.findall(_add_ns('Period')):
2671             period_duration = parse_duration(period.get('duration')) or mpd_duration
2672             period_ms_info = extract_multisegment_info(period, {
2673                 'start_number': 1,
2674                 'timescale': 1,
2675             })
2676             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2677                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2678                 for representation in adaptation_set.findall(_add_ns('Representation')):
2679                     representation_attrib = adaptation_set.attrib.copy()
2680                     representation_attrib.update(representation.attrib)
2681                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2682                     mime_type = representation_attrib['mimeType']
2683                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2684
2685                     codecs = representation_attrib.get('codecs', '')
2686                     if content_type not in ('video', 'audio', 'text'):
2687                         if mime_type == 'image/jpeg':
2688                             content_type = mime_type
2689                         elif codecs.split('.')[0] == 'stpp':
2690                             content_type = 'text'
2691                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2692                             content_type = 'text'
2693                         else:
2694                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2695                             continue
2696
2697                     base_url = ''
2698                     for element in (representation, adaptation_set, period, mpd_doc):
2699                         base_url_e = element.find(_add_ns('BaseURL'))
2700                         if base_url_e is not None:
2701                             base_url = base_url_e.text + base_url
2702                             if re.match(r'^https?://', base_url):
2703                                 break
2704                     if mpd_base_url and base_url.startswith('/'):
2705                         base_url = compat_urlparse.urljoin(mpd_base_url, base_url)
2706                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2707                         if not mpd_base_url.endswith('/'):
2708                             mpd_base_url += '/'
2709                         base_url = mpd_base_url + base_url
2710                     representation_id = representation_attrib.get('id')
2711                     lang = representation_attrib.get('lang')
2712                     url_el = representation.find(_add_ns('BaseURL'))
2713                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2714                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2715                     if representation_id is not None:
2716                         format_id = representation_id
2717                     else:
2718                         format_id = content_type
2719                     if mpd_id:
2720                         format_id = mpd_id + '-' + format_id
2721                     if content_type in ('video', 'audio'):
2722                         f = {
2723                             'format_id': format_id,
2724                             'manifest_url': mpd_url,
2725                             'ext': mimetype2ext(mime_type),
2726                             'width': int_or_none(representation_attrib.get('width')),
2727                             'height': int_or_none(representation_attrib.get('height')),
2728                             'tbr': float_or_none(bandwidth, 1000),
2729                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2730                             'fps': int_or_none(representation_attrib.get('frameRate')),
2731                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2732                             'format_note': 'DASH %s' % content_type,
2733                             'filesize': filesize,
2734                             'container': mimetype2ext(mime_type) + '_dash',
2735                         }
2736                         f.update(parse_codecs(codecs))
2737                     elif content_type == 'text':
2738                         f = {
2739                             'ext': mimetype2ext(mime_type),
2740                             'manifest_url': mpd_url,
2741                             'filesize': filesize,
2742                         }
2743                     elif content_type == 'image/jpeg':
2744                         # See test case in VikiIE
2745                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2746                         f = {
2747                             'format_id': format_id,
2748                             'ext': 'mhtml',
2749                             'manifest_url': mpd_url,
2750                             'format_note': 'DASH storyboards (jpeg)',
2751                             'acodec': 'none',
2752                             'vcodec': 'none',
2753                         }
2754                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2755                         f['has_drm'] = True
2756                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2757
2758                     def prepare_template(template_name, identifiers):
2759                         tmpl = representation_ms_info[template_name]
2760                         # First of, % characters outside $...$ templates
2761                         # must be escaped by doubling for proper processing
2762                         # by % operator string formatting used further (see
2763                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2764                         t = ''
2765                         in_template = False
2766                         for c in tmpl:
2767                             t += c
2768                             if c == '$':
2769                                 in_template = not in_template
2770                             elif c == '%' and not in_template:
2771                                 t += c
2772                         # Next, $...$ templates are translated to their
2773                         # %(...) counterparts to be used with % operator
2774                         if representation_id is not None:
2775                             t = t.replace('$RepresentationID$', representation_id)
2776                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2777                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2778                         t.replace('$$', '$')
2779                         return t
2780
2781                     # @initialization is a regular template like @media one
2782                     # so it should be handled just the same way (see
2783                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2784                     if 'initialization' in representation_ms_info:
2785                         initialization_template = prepare_template(
2786                             'initialization',
2787                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2788                             # $Time$ shall not be included for @initialization thus
2789                             # only $Bandwidth$ remains
2790                             ('Bandwidth', ))
2791                         representation_ms_info['initialization_url'] = initialization_template % {
2792                             'Bandwidth': bandwidth,
2793                         }
2794
2795                     def location_key(location):
2796                         return 'url' if re.match(r'^https?://', location) else 'path'
2797
2798                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2799
2800                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2801                         media_location_key = location_key(media_template)
2802
2803                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2804                         # can't be used at the same time
2805                         if '%(Number' in media_template and 's' not in representation_ms_info:
2806                             segment_duration = None
2807                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2808                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2809                                 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2810                             representation_ms_info['fragments'] = [{
2811                                 media_location_key: media_template % {
2812                                     'Number': segment_number,
2813                                     'Bandwidth': bandwidth,
2814                                 },
2815                                 'duration': segment_duration,
2816                             } for segment_number in range(
2817                                 representation_ms_info['start_number'],
2818                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2819                         else:
2820                             # $Number*$ or $Time$ in media template with S list available
2821                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2822                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2823                             representation_ms_info['fragments'] = []
2824                             segment_time = 0
2825                             segment_d = None
2826                             segment_number = representation_ms_info['start_number']
2827
2828                             def add_segment_url():
2829                                 segment_url = media_template % {
2830                                     'Time': segment_time,
2831                                     'Bandwidth': bandwidth,
2832                                     'Number': segment_number,
2833                                 }
2834                                 representation_ms_info['fragments'].append({
2835                                     media_location_key: segment_url,
2836                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2837                                 })
2838
2839                             for num, s in enumerate(representation_ms_info['s']):
2840                                 segment_time = s.get('t') or segment_time
2841                                 segment_d = s['d']
2842                                 add_segment_url()
2843                                 segment_number += 1
2844                                 for r in range(s.get('r', 0)):
2845                                     segment_time += segment_d
2846                                     add_segment_url()
2847                                     segment_number += 1
2848                                 segment_time += segment_d
2849                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2850                         # No media template
2851                         # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2852                         # or any YouTube dashsegments video
2853                         fragments = []
2854                         segment_index = 0
2855                         timescale = representation_ms_info['timescale']
2856                         for s in representation_ms_info['s']:
2857                             duration = float_or_none(s['d'], timescale)
2858                             for r in range(s.get('r', 0) + 1):
2859                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
2860                                 fragments.append({
2861                                     location_key(segment_uri): segment_uri,
2862                                     'duration': duration,
2863                                 })
2864                                 segment_index += 1
2865                         representation_ms_info['fragments'] = fragments
2866                     elif 'segment_urls' in representation_ms_info:
2867                         # Segment URLs with no SegmentTimeline
2868                         # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2869                         # https://github.com/ytdl-org/youtube-dl/pull/14844
2870                         fragments = []
2871                         segment_duration = float_or_none(
2872                             representation_ms_info['segment_duration'],
2873                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2874                         for segment_url in representation_ms_info['segment_urls']:
2875                             fragment = {
2876                                 location_key(segment_url): segment_url,
2877                             }
2878                             if segment_duration:
2879                                 fragment['duration'] = segment_duration
2880                             fragments.append(fragment)
2881                         representation_ms_info['fragments'] = fragments
2882                     # If there is a fragments key available then we correctly recognized fragmented media.
2883                     # Otherwise we will assume unfragmented media with direct access. Technically, such
2884                     # assumption is not necessarily correct since we may simply have no support for
2885                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2886                     if 'fragments' in representation_ms_info:
2887                         f.update({
2888                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
2889                             'url': mpd_url or base_url,
2890                             'fragment_base_url': base_url,
2891                             'fragments': [],
2892                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2893                         })
2894                         if 'initialization_url' in representation_ms_info:
2895                             initialization_url = representation_ms_info['initialization_url']
2896                             if not f.get('url'):
2897                                 f['url'] = initialization_url
2898                             f['fragments'].append({location_key(initialization_url): initialization_url})
2899                         f['fragments'].extend(representation_ms_info['fragments'])
2900                     else:
2901                         # Assuming direct URL to unfragmented media.
2902                         f['url'] = base_url
2903                     if content_type in ('video', 'audio', 'image/jpeg'):
2904                         f['manifest_stream_number'] = stream_numbers[f['url']]
2905                         stream_numbers[f['url']] += 1
2906                         formats.append(f)
2907                     elif content_type == 'text':
2908                         subtitles.setdefault(lang or 'und', []).append(f)
2909
2910         return formats, subtitles
2911
2912     def _extract_ism_formats(self, *args, **kwargs):
2913         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2914         if subs:
2915             self._report_ignoring_subs('ISM')
2916         return fmts
2917
2918     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2919         res = self._download_xml_handle(
2920             ism_url, video_id,
2921             note='Downloading ISM manifest' if note is None else note,
2922             errnote='Failed to download ISM manifest' if errnote is None else errnote,
2923             fatal=fatal, data=data, headers=headers, query=query)
2924         if res is False:
2925             return [], {}
2926         ism_doc, urlh = res
2927         if ism_doc is None:
2928             return [], {}
2929
2930         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
2931
2932     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
2933         """
2934         Parse formats from ISM manifest.
2935         References:
2936          1. [MS-SSTR]: Smooth Streaming Protocol,
2937             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2938         """
2939         if ism_doc.get('IsLive') == 'TRUE':
2940             return [], {}
2941
2942         duration = int(ism_doc.attrib['Duration'])
2943         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2944
2945         formats = []
2946         subtitles = {}
2947         for stream in ism_doc.findall('StreamIndex'):
2948             stream_type = stream.get('Type')
2949             if stream_type not in ('video', 'audio', 'text'):
2950                 continue
2951             url_pattern = stream.attrib['Url']
2952             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2953             stream_name = stream.get('Name')
2954             stream_language = stream.get('Language', 'und')
2955             for track in stream.findall('QualityLevel'):
2956                 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
2957                 # TODO: add support for WVC1 and WMAP
2958                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
2959                     self.report_warning('%s is not a supported codec' % fourcc)
2960                     continue
2961                 tbr = int(track.attrib['Bitrate']) // 1000
2962                 # [1] does not mention Width and Height attributes. However,
2963                 # they're often present while MaxWidth and MaxHeight are
2964                 # missing, so should be used as fallbacks
2965                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2966                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2967                 sampling_rate = int_or_none(track.get('SamplingRate'))
2968
2969                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2970                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2971
2972                 fragments = []
2973                 fragment_ctx = {
2974                     'time': 0,
2975                 }
2976                 stream_fragments = stream.findall('c')
2977                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2978                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2979                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2980                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2981                     if not fragment_ctx['duration']:
2982                         try:
2983                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2984                         except IndexError:
2985                             next_fragment_time = duration
2986                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2987                     for _ in range(fragment_repeat):
2988                         fragments.append({
2989                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2990                             'duration': fragment_ctx['duration'] / stream_timescale,
2991                         })
2992                         fragment_ctx['time'] += fragment_ctx['duration']
2993
2994                 if stream_type == 'text':
2995                     subtitles.setdefault(stream_language, []).append({
2996                         'ext': 'ismt',
2997                         'protocol': 'ism',
2998                         'url': ism_url,
2999                         'manifest_url': ism_url,
3000                         'fragments': fragments,
3001                         '_download_params': {
3002                             'stream_type': stream_type,
3003                             'duration': duration,
3004                             'timescale': stream_timescale,
3005                             'fourcc': fourcc,
3006                             'language': stream_language,
3007                             'codec_private_data': track.get('CodecPrivateData'),
3008                         }
3009                     })
3010                 elif stream_type in ('video', 'audio'):
3011                     formats.append({
3012                         'format_id': join_nonempty(ism_id, stream_name, tbr),
3013                         'url': ism_url,
3014                         'manifest_url': ism_url,
3015                         'ext': 'ismv' if stream_type == 'video' else 'isma',
3016                         'width': width,
3017                         'height': height,
3018                         'tbr': tbr,
3019                         'asr': sampling_rate,
3020                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
3021                         'acodec': 'none' if stream_type == 'video' else fourcc,
3022                         'protocol': 'ism',
3023                         'fragments': fragments,
3024                         'has_drm': ism_doc.find('Protection') is not None,
3025                         '_download_params': {
3026                             'stream_type': stream_type,
3027                             'duration': duration,
3028                             'timescale': stream_timescale,
3029                             'width': width or 0,
3030                             'height': height or 0,
3031                             'fourcc': fourcc,
3032                             'language': stream_language,
3033                             'codec_private_data': track.get('CodecPrivateData'),
3034                             'sampling_rate': sampling_rate,
3035                             'channels': int_or_none(track.get('Channels', 2)),
3036                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3037                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3038                         },
3039                     })
3040         return formats, subtitles
3041
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
        """
        Collect media entries from the <video>/<audio> tags (and their amp-*
        and dl8-* variants) found in webpage.

        Each returned entry is a dict with the keys 'formats', 'subtitles' and
        'thumbnail', built from the tag's own src/poster attributes and from
        the <source> and <track> tags nested inside it. Tags that yield
        neither formats nor subtitles are dropped.

        @param base_url             URL that relative media, poster and subtitle
                                    URLs are resolved against; it is also set as
                                    the Referer header of every format
        @param webpage              HTML to scan
        @param video_id             Passed through to the m3u8/mpd extractors
        @param m3u8_id              Passed as m3u8_id to _extract_m3u8_formats
        @param m3u8_entry_protocol  Passed as entry_protocol to _extract_m3u8_formats
        @param mpd_id               Passed as mpd_id to _extract_mpd_formats
        @param preference           Passed as preference to _extract_m3u8_formats
        @param quality              Passed as quality to _extract_m3u8_formats
        @returns                    A list of entry dicts, one per usable media tag
        """
        def absolute_url(item_url):
            # Resolve a (possibly relative) URL against base_url
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            # Turn a "type" attribute such as 'video/mp4; codecs="avc1, mp4a"'
            # into a partial format dict: the result of parse_codecs() plus
            # an 'ext' derived from the mimetype. Returns {} when the
            # attribute is missing or does not look like a mimetype.
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type, type_info={}):
            # Returns (is_plain_url, formats). Manifests (m3u8/mpd) are
            # expanded through the dedicated extractors with fatal=False;
            # anything else becomes a single direct-URL format.
            # type_info is only read here, never modified.
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, quality=quality, fatal=False)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                }]
            return is_plain_url, formats

        entries = []
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we will include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
        # Self-closing tags have no inner content, so an empty string is added
        # to give them the same 4-tuple shape as the open/close matches below:
        # (opening tag, tag name, media type, inner content)
        media_tags = [(media_tag, media_tag_name, media_type, '')
                      for media_tag, media_tag_name, media_type
                      in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
        for media_tag, _, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            # A src attribute directly on the media tag
            src = strip_or_none(media_attributes.get('src'))
            if src:
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            if media_content:
                # Nested <source> tags: one or more formats each
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    s_attr = extract_attributes(source_tag)
                    # data-video-src and data-src are non standard but seen
                    # several times in the wild
                    src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
                    if not src:
                        continue
                    f = parse_content_type(s_attr.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                    if is_plain_url:
                        # width, height, res, label and title attributes are
                        # all not standard but seen several times in the wild
                        labels = [
                            s_attr.get(lbl)
                            for lbl in ('label', 'title')
                            if str_or_none(s_attr.get(lbl))
                        ]
                        width = int_or_none(s_attr.get('width'))
                        height = (int_or_none(s_attr.get('height'))
                                  or int_or_none(s_attr.get('res')))
                        # Fall back to a resolution parsed from the labels for
                        # whichever dimension is still unknown
                        if not width or not height:
                            for lbl in labels:
                                resolution = parse_resolution(lbl)
                                if not resolution:
                                    continue
                                width = width or resolution.get('width')
                                height = height or resolution.get('height')
                        # The first label that yields a bitrate wins; the
                        # for/else leaves tbr as None when none of them does
                        for lbl in labels:
                            tbr = parse_bitrate(lbl)
                            if tbr:
                                break
                        else:
                            tbr = None
                        f.update({
                            'width': width,
                            'height': height,
                            'tbr': tbr,
                            'format_id': s_attr.get('label') or s_attr.get('title'),
                        })
                        # Applied last, so the 'url' and 'vcodec' of the plain
                        # format override anything parsed from the type attribute
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                # Nested <track> tags: subtitles/captions (a track without a
                # kind attribute is accepted as well)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    if not kind or kind in ('subtitles', 'captions'):
                        src = strip_or_none(track_attributes.get('src'))
                        if not src:
                            continue
                        # lang is None when none of the three attributes is set
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            for f in media_info['formats']:
                f.setdefault('http_headers', {})['Referer'] = base_url
            # Only keep tags that produced something downloadable
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
3164
3165     def _extract_akamai_formats(self, *args, **kwargs):
3166         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3167         if subs:
3168             self._report_ignoring_subs('akamai')
3169         return fmts
3170
3171     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3172         signed = 'hdnea=' in manifest_url
3173         if not signed:
3174             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3175             manifest_url = re.sub(
3176                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3177                 '', manifest_url).strip('?')
3178
3179         formats = []
3180         subtitles = {}
3181
3182         hdcore_sign = 'hdcore=3.7.0'
3183         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3184         hds_host = hosts.get('hds')
3185         if hds_host:
3186             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3187         if 'hdcore=' not in f4m_url:
3188             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3189         f4m_formats = self._extract_f4m_formats(
3190             f4m_url, video_id, f4m_id='hds', fatal=False)
3191         for entry in f4m_formats:
3192             entry.update({'extra_param_to_segment_url': hdcore_sign})
3193         formats.extend(f4m_formats)
3194
3195         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3196         hls_host = hosts.get('hls')
3197         if hls_host:
3198             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3199         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3200             m3u8_url, video_id, 'mp4', 'm3u8_native',
3201             m3u8_id='hls', fatal=False)
3202         formats.extend(m3u8_formats)
3203         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3204
3205         http_host = hosts.get('http')
3206         if http_host and m3u8_formats and not signed:
3207             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3208             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3209             qualities_length = len(qualities)
3210             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3211                 i = 0
3212                 for f in m3u8_formats:
3213                     if f['vcodec'] != 'none':
3214                         for protocol in ('http', 'https'):
3215                             http_f = f.copy()
3216                             del http_f['manifest_url']
3217                             http_url = re.sub(
3218                                 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
3219                             http_f.update({
3220                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3221                                 'url': http_url,
3222                                 'protocol': protocol,
3223                             })
3224                             formats.append(http_f)
3225                         i += 1
3226
3227         return formats, subtitles
3228
3229     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3230         query = compat_urlparse.urlparse(url).query
3231         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3232         mobj = re.search(
3233             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3234         url_base = mobj.group('url')
3235         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3236         formats = []
3237
3238         def manifest_url(manifest):
3239             m_url = '%s/%s' % (http_base_url, manifest)
3240             if query:
3241                 m_url += '?%s' % query
3242             return m_url
3243
3244         if 'm3u8' not in skip_protocols:
3245             formats.extend(self._extract_m3u8_formats(
3246                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3247                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3248         if 'f4m' not in skip_protocols:
3249             formats.extend(self._extract_f4m_formats(
3250                 manifest_url('manifest.f4m'),
3251                 video_id, f4m_id='hds', fatal=False))
3252         if 'dash' not in skip_protocols:
3253             formats.extend(self._extract_mpd_formats(
3254                 manifest_url('manifest.mpd'),
3255                 video_id, mpd_id='dash', fatal=False))
3256         if re.search(r'(?:/smil:|\.smil)', url_base):
3257             if 'smil' not in skip_protocols:
3258                 rtmp_formats = self._extract_smil_formats(
3259                     manifest_url('jwplayer.smil'),
3260                     video_id, fatal=False)
3261                 for rtmp_format in rtmp_formats:
3262                     rtsp_format = rtmp_format.copy()
3263                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3264                     del rtsp_format['play_path']
3265                     del rtsp_format['ext']
3266                     rtsp_format.update({
3267                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3268                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3269                         'protocol': 'rtsp',
3270                     })
3271                     formats.extend([rtmp_format, rtsp_format])
3272         else:
3273             for protocol in ('rtmp', 'rtsp'):
3274                 if protocol not in skip_protocols:
3275                     formats.append({
3276                         'url': '%s:%s' % (protocol, url_base),
3277                         'format_id': protocol,
3278                         'protocol': protocol,
3279                     })
3280         return formats
3281
3282     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3283         mobj = re.search(
3284             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3285             webpage)
3286         if mobj:
3287             try:
3288                 jwplayer_data = self._parse_json(mobj.group('options'),
3289                                                  video_id=video_id,
3290                                                  transform_source=transform_source)
3291             except ExtractorError:
3292                 pass
3293             else:
3294                 if isinstance(jwplayer_data, dict):
3295                     return jwplayer_data
3296
3297     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3298         jwplayer_data = self._find_jwplayer_data(
3299             webpage, video_id, transform_source=js_to_json)
3300         return self._parse_jwplayer_data(
3301             jwplayer_data, video_id, *args, **kwargs)
3302
3303     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3304                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3305         # JWPlayer backward compatibility: flattened playlists
3306         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3307         if 'playlist' not in jwplayer_data:
3308             jwplayer_data = {'playlist': [jwplayer_data]}
3309
3310         entries = []
3311
3312         # JWPlayer backward compatibility: single playlist item
3313         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3314         if not isinstance(jwplayer_data['playlist'], list):
3315             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3316
3317         for video_data in jwplayer_data['playlist']:
3318             # JWPlayer backward compatibility: flattened sources
3319             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3320             if 'sources' not in video_data:
3321                 video_data['sources'] = [video_data]
3322
3323             this_video_id = video_id or video_data['mediaid']
3324
3325             formats = self._parse_jwplayer_formats(
3326                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3327                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3328
3329             subtitles = {}
3330             tracks = video_data.get('tracks')
3331             if tracks and isinstance(tracks, list):
3332                 for track in tracks:
3333                     if not isinstance(track, dict):
3334                         continue
3335                     track_kind = track.get('kind')
3336                     if not track_kind or not isinstance(track_kind, compat_str):
3337                         continue
3338                     if track_kind.lower() not in ('captions', 'subtitles'):
3339                         continue
3340                     track_url = urljoin(base_url, track.get('file'))
3341                     if not track_url:
3342                         continue
3343                     subtitles.setdefault(track.get('label') or 'en', []).append({
3344                         'url': self._proto_relative_url(track_url)
3345                     })
3346
3347             entry = {
3348                 'id': this_video_id,
3349                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3350                 'description': clean_html(video_data.get('description')),
3351                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3352                 'timestamp': int_or_none(video_data.get('pubdate')),
3353                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3354                 'subtitles': subtitles,
3355             }
3356             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3357             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3358                 entry.update({
3359                     '_type': 'url_transparent',
3360                     'url': formats[0]['url'],
3361                 })
3362             else:
3363                 self._sort_formats(formats)
3364                 entry['formats'] = formats
3365             entries.append(entry)
3366         if len(entries) == 1:
3367             return entries[0]
3368         else:
3369             return self.playlist_result(entries)
3370
3371     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3372                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3373         urls = []
3374         formats = []
3375         for source in jwplayer_sources_data:
3376             if not isinstance(source, dict):
3377                 continue
3378             source_url = urljoin(
3379                 base_url, self._proto_relative_url(source.get('file')))
3380             if not source_url or source_url in urls:
3381                 continue
3382             urls.append(source_url)
3383             source_type = source.get('type') or ''
3384             ext = mimetype2ext(source_type) or determine_ext(source_url)
3385             if source_type == 'hls' or ext == 'm3u8':
3386                 formats.extend(self._extract_m3u8_formats(
3387                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3388                     m3u8_id=m3u8_id, fatal=False))
3389             elif source_type == 'dash' or ext == 'mpd':
3390                 formats.extend(self._extract_mpd_formats(
3391                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3392             elif ext == 'smil':
3393                 formats.extend(self._extract_smil_formats(
3394                     source_url, video_id, fatal=False))
3395             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3396             elif source_type.startswith('audio') or ext in (
3397                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3398                 formats.append({
3399                     'url': source_url,
3400                     'vcodec': 'none',
3401                     'ext': ext,
3402                 })
3403             else:
3404                 height = int_or_none(source.get('height'))
3405                 if height is None:
3406                     # Often no height is provided but there is a label in
3407                     # format like "1080p", "720p SD", or 1080.
3408                     height = int_or_none(self._search_regex(
3409                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3410                         'height', default=None))
3411                 a_format = {
3412                     'url': source_url,
3413                     'width': int_or_none(source.get('width')),
3414                     'height': height,
3415                     'tbr': int_or_none(source.get('bitrate')),
3416                     'ext': ext,
3417                 }
3418                 if source_url.startswith('rtmp'):
3419                     a_format['ext'] = 'flv'
3420                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3421                     # of jwplayer.flash.swf
3422                     rtmp_url_parts = re.split(
3423                         r'((?:mp4|mp3|flv):)', source_url, 1)
3424                     if len(rtmp_url_parts) == 3:
3425                         rtmp_url, prefix, play_path = rtmp_url_parts
3426                         a_format.update({
3427                             'url': rtmp_url,
3428                             'play_path': prefix + play_path,
3429                         })
3430                     if rtmp_params:
3431                         a_format.update(rtmp_params)
3432                 formats.append(a_format)
3433         return formats
3434
3435     def _live_title(self, name):
3436         """ Generate the title for a live video """
3437         now = datetime.datetime.now()
3438         now_str = now.strftime('%Y-%m-%d %H:%M')
3439         return name + ' ' + now_str
3440
3441     def _int(self, v, name, fatal=False, **kwargs):
3442         res = int_or_none(v, **kwargs)
3443         if 'get_attr' in kwargs:
3444             print(getattr(v, kwargs['get_attr']))
3445         if res is None:
3446             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3447             if fatal:
3448                 raise ExtractorError(msg)
3449             else:
3450                 self.report_warning(msg)
3451         return res
3452
3453     def _float(self, v, name, fatal=False, **kwargs):
3454         res = float_or_none(v, **kwargs)
3455         if res is None:
3456             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3457             if fatal:
3458                 raise ExtractorError(msg)
3459             else:
3460                 self.report_warning(msg)
3461         return res
3462
3463     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3464                     path='/', secure=False, discard=False, rest={}, **kwargs):
3465         cookie = compat_cookiejar_Cookie(
3466             0, name, value, port, port is not None, domain, True,
3467             domain.startswith('.'), path, True, secure, expire_time,
3468             discard, None, None, rest)
3469         self._downloader.cookiejar.set_cookie(cookie)
3470
3471     def _get_cookies(self, url):
3472         """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3473         req = sanitized_Request(url)
3474         self._downloader.cookiejar.add_cookie_header(req)
3475         return compat_cookies_SimpleCookie(req.get_header('Cookie'))
3476
3477     def _apply_first_set_cookie_header(self, url_handle, cookie):
3478         """
3479         Apply first Set-Cookie header instead of the last. Experimental.
3480
3481         Some sites (e.g. [1-3]) may serve two cookies under the same name
3482         in Set-Cookie header and expect the first (old) one to be set rather
3483         than second (new). However, as of RFC6265 the newer one cookie
3484         should be set into cookie store what actually happens.
3485         We will workaround this issue by resetting the cookie to
3486         the first one manually.
3487         1. https://new.vk.com/
3488         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3489         3. https://learning.oreilly.com/
3490         """
3491         for header, cookies in url_handle.headers.items():
3492             if header.lower() != 'set-cookie':
3493                 continue
3494             if sys.version_info[0] >= 3:
3495                 cookies = cookies.encode('iso-8859-1')
3496             cookies = cookies.decode('utf-8')
3497             cookie_value = re.search(
3498                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3499             if cookie_value:
3500                 value, domain = cookie_value.groups()
3501                 self._set_cookie(domain, cookie, value)
3502                 break
3503
3504     def get_testcases(self, include_onlymatching=False):
3505         t = getattr(self, '_TEST', None)
3506         if t:
3507             assert not hasattr(self, '_TESTS'), \
3508                 '%s has _TEST and _TESTS' % type(self).__name__
3509             tests = [t]
3510         else:
3511             tests = getattr(self, '_TESTS', [])
3512         for t in tests:
3513             if not include_onlymatching and t.get('only_matching', False):
3514                 continue
3515             t['name'] = type(self).__name__[:-len('IE')]
3516             yield t
3517
3518     def is_suitable(self, age_limit):
3519         """ Test whether the extractor is generally suitable for the given
3520         age limit (i.e. pornographic sites are not, all others usually are) """
3521
3522         any_restricted = False
3523         for tc in self.get_testcases(include_onlymatching=False):
3524             if tc.get('playlist', []):
3525                 tc = tc['playlist'][0]
3526             is_restricted = age_restricted(
3527                 tc.get('info_dict', {}).get('age_limit'), age_limit)
3528             if not is_restricted:
3529                 return True
3530             any_restricted = any_restricted or is_restricted
3531         return not any_restricted
3532
3533     def extract_subtitles(self, *args, **kwargs):
3534         if (self.get_param('writesubtitles', False)
3535                 or self.get_param('listsubtitles')):
3536             return self._get_subtitles(*args, **kwargs)
3537         return {}
3538
3539     def _get_subtitles(self, *args, **kwargs):
3540         raise NotImplementedError('This method must be implemented by subclasses')
3541
3542     def extract_comments(self, *args, **kwargs):
3543         if not self.get_param('getcomments'):
3544             return None
3545         generator = self._get_comments(*args, **kwargs)
3546
3547         def extractor():
3548             comments = []
3549             try:
3550                 while True:
3551                     comments.append(next(generator))
3552             except KeyboardInterrupt:
3553                 interrupted = True
3554                 self.to_screen('Interrupted by user')
3555             except StopIteration:
3556                 interrupted = False
3557             comment_count = len(comments)
3558             self.to_screen(f'Extracted {comment_count} comments')
3559             return {
3560                 'comments': comments,
3561                 'comment_count': None if interrupted else comment_count
3562             }
3563         return extractor
3564
3565     def _get_comments(self, *args, **kwargs):
3566         raise NotImplementedError('This method must be implemented by subclasses')
3567
3568     @staticmethod
3569     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3570         """ Merge subtitle items for one language. Items with duplicated URLs
3571         will be dropped. """
3572         list1_urls = set([item['url'] for item in subtitle_list1])
3573         ret = list(subtitle_list1)
3574         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
3575         return ret
3576
3577     @classmethod
3578     def _merge_subtitles(cls, *dicts, target=None):
3579         """ Merge subtitle dictionaries, language by language. """
3580         if target is None:
3581             target = {}
3582         for d in dicts:
3583             for lang, subs in d.items():
3584                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3585         return target
3586
3587     def extract_automatic_captions(self, *args, **kwargs):
3588         if (self.get_param('writeautomaticsub', False)
3589                 or self.get_param('listsubtitles')):
3590             return self._get_automatic_captions(*args, **kwargs)
3591         return {}
3592
3593     def _get_automatic_captions(self, *args, **kwargs):
3594         raise NotImplementedError('This method must be implemented by subclasses')
3595
3596     def mark_watched(self, *args, **kwargs):
3597         if not self.get_param('mark_watched', False):
3598             return
3599         if (self._get_login_info()[0] is not None
3600                 or self.get_param('cookiefile')
3601                 or self.get_param('cookiesfrombrowser')):
3602             self._mark_watched(*args, **kwargs)
3603
3604     def _mark_watched(self, *args, **kwargs):
3605         raise NotImplementedError('This method must be implemented by subclasses')
3606
3607     def geo_verification_headers(self):
3608         headers = {}
3609         geo_verification_proxy = self.get_param('geo_verification_proxy')
3610         if geo_verification_proxy:
3611             headers['Ytdl-request-proxy'] = geo_verification_proxy
3612         return headers
3613
3614     def _generic_id(self, url):
3615         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3616
3617     def _generic_title(self, url):
3618         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3619
3620     @staticmethod
3621     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3622         all_known = all(map(
3623             lambda x: x is not None,
3624             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3625         return (
3626             'private' if is_private
3627             else 'premium_only' if needs_premium
3628             else 'subscriber_only' if needs_subscription
3629             else 'needs_auth' if needs_auth
3630             else 'unlisted' if is_unlisted
3631             else 'public' if all_known
3632             else None)
3633
3634     def _configuration_arg(self, key, default=NO_DEFAULT, casesense=False):
3635         '''
3636         @returns            A list of values for the extractor argument given by "key"
3637                             or "default" if no such key is present
3638         @param default      The default value to return when the key is not present (default: [])
3639         @param casesense    When false, the values are converted to lower case
3640         '''
3641         val = traverse_obj(
3642             self._downloader.params, ('extractor_args', self.ie_key().lower(), key))
3643         if val is None:
3644             return [] if default is NO_DEFAULT else default
3645         return list(val) if casesense else [x.lower() for x in val]
3646
3647
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
    An empty prefix requests a single result, "all" requests _MAX_RESULTS
    results and a positive number requests that many results.
    Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
    """

    # Largest number of results a single query may ask for; unlimited by default
    _MAX_RESULTS = float('inf')

    @classmethod
    def _make_valid_url(cls):
        """Return the regex matching "<_SEARCH_KEY><prefix>:<query>" strings"""
        # _SEARCH_KEY is a literal token, not a pattern: escape it so that any
        # regex metacharacter in the key cannot change what is matched
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % re.escape(cls._SEARCH_KEY)

    @classmethod
    def suitable(cls, url):
        """Return True when url is a search query carrying this extractor's key"""
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """
        Parse a search string and return the requested number of results.

        @param query    The full search string, including _SEARCH_KEY and prefix
        @returns        Whatever _get_n_results returns for the parsed query
        @raises ExtractorError  If the string does not match the search format
                                or the requested count is not positive
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            # No count given: fetch just one result
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                # The default pattern only admits [1-9][0-9]*, so this guards
                # subclasses that override _make_valid_url
                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp to the limit and tell the user why fewer results come back
                self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses

        The default implementation takes the first n items of _search_results
        (all of them when n is infinite) and wraps them with playlist_result,
        passing the query as both remaining positional arguments."""
        return self.playlist_result(
            # islice needs None, not float('inf'), to mean "no upper bound"
            itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
            query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        """The search key (_SEARCH_KEY) of this extractor"""
        return self._SEARCH_KEY