yt_dlp/extractor/common.py

   1 import base64
   2 import collections
   3 import hashlib
   4 import itertools
   5 import json
   6 import math
   7 import netrc
   8 import os
   9 import random
  10 import sys
  11 import time
  12 import xml.etree.ElementTree
  13
  14 from ..compat import (
  15     compat_cookiejar_Cookie,
  16     compat_cookies_SimpleCookie,
  17     compat_etree_fromstring,
  18     compat_expanduser,
  19     compat_getpass,
  20     compat_http_client,
  21     compat_os_name,
  22     compat_str,
  23     compat_urllib_error,
  24     compat_urllib_parse_unquote,
  25     compat_urllib_parse_urlencode,
  26     compat_urllib_request,
  27     compat_urlparse,
  28     re,
  29 )
  30 from ..downloader import FileDownloader
  31 from ..downloader.f4m import get_base_url, remove_encrypted_media
  32 from ..utils import (
  33     JSON_LD_RE,
  34     NO_DEFAULT,
  35     ExtractorError,
  36     GeoRestrictedError,
  37     GeoUtils,
  38     RegexNotFoundError,
  39     UnsupportedError,
  40     age_restricted,
  41     base_url,
  42     bug_reports_message,
  43     clean_html,
  44     determine_ext,
  45     determine_protocol,
  46     dict_get,
  47     encode_data_uri,
  48     error_to_compat_str,
  49     extract_attributes,
  50     filter_dict,
  51     fix_xml_ampersands,
  52     float_or_none,
  53     format_field,
  54     int_or_none,
  55     join_nonempty,
  56     js_to_json,
  57     mimetype2ext,
  58     network_exceptions,
  59     orderedSet,
  60     parse_bitrate,
  61     parse_codecs,
  62     parse_duration,
  63     parse_iso8601,
  64     parse_m3u8_attributes,
  65     parse_resolution,
  66     sanitize_filename,
  67     sanitized_Request,
  68     str_or_none,
  69     str_to_int,
  70     strip_or_none,
  71     traverse_obj,
  72     try_get,
  73     unescapeHTML,
  74     unified_strdate,
  75     unified_timestamp,
  76     update_Request,
  77     update_url_query,
  78     url_basename,
  79     url_or_none,
  80     urljoin,
  81     variadic,
  82     xpath_element,
  83     xpath_text,
  84     xpath_with_ns,
  85 )
  86
  87
  88 class InfoExtractor:
  89     """Information Extractor class.
  90
  91     Information extractors are the classes that, given a URL, extract
  92     information about the video (or videos) the URL refers to. This
  93     information includes the real video URL, the video title, author and
  94     others. The information is stored in a dictionary which is then
  95     passed to the YoutubeDL. The YoutubeDL processes this
  96     information possibly downloading the video to the file system, among
  97     other possible outcomes.
  98
  99     The type field determines the type of the result.
 100     By far the most common value (and the default if _type is missing) is
 101     "video", which indicates a single video.
 102
 103     For a video, the dictionaries must include the following fields:
 104
 105     id:             Video identifier.
 106     title:          Video title, unescaped.
 107
 108     Additionally, it must contain either a formats entry or a url one:
 109
 110     formats:        A list of dictionaries for each format available, ordered
 111                     from worst to best quality.
 112
 113                     Potential fields:
 114                     * url        The mandatory URL representing the media:
 115                                    for plain file media - HTTP URL of this file,
 116                                    for RTMP - RTMP URL,
 117                                    for HLS - URL of the M3U8 media playlist,
 118                                    for HDS - URL of the F4M manifest,
 119                                    for DASH
 120                                      - HTTP URL to plain file media (in case of
 121                                        unfragmented media)
 122                                      - URL of the MPD manifest or base URL
 123                                        representing the media if MPD manifest
 124                                        is parsed from a string (in case of
 125                                        fragmented media)
 126                                    for MSS - URL of the ISM manifest.
 127                     * manifest_url
 128                                  The URL of the manifest file in case of
 129                                  fragmented media:
 130                                    for HLS - URL of the M3U8 master playlist,
 131                                    for HDS - URL of the F4M manifest,
 132                                    for DASH - URL of the MPD manifest,
 133                                    for MSS - URL of the ISM manifest.
 134                     * manifest_stream_number  (For internal use only)
 135                                  The index of the stream in the manifest file
 136                     * ext        Will be calculated from URL if missing
 137                     * format     A human-readable description of the format
 138                                  ("mp4 container with h264/opus").
 139                                  Calculated from the format_id, width, height.
 140                                  and format_note fields if missing.
 141                     * format_id  A short description of the format
 142                                  ("mp4_h264_opus" or "19").
 143                                 Technically optional, but strongly recommended.
 144                     * format_note Additional info about the format
 145                                  ("3D" or "DASH video")
 146                     * width      Width of the video, if known
 147                     * height     Height of the video, if known
 148                     * resolution Textual description of width and height
 149                     * dynamic_range The dynamic range of the video. One of:
  150                                  "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
 151                     * tbr        Average bitrate of audio and video in KBit/s
 152                     * abr        Average audio bitrate in KBit/s
 153                     * acodec     Name of the audio codec in use
 154                     * asr        Audio sampling rate in Hertz
 155                     * vbr        Average video bitrate in KBit/s
 156                     * fps        Frame rate
 157                     * vcodec     Name of the video codec in use
 158                     * container  Name of the container format
 159                     * filesize   The number of bytes, if known in advance
 160                     * filesize_approx  An estimate for the number of bytes
 161                     * player_url SWF Player URL (used for rtmpdump).
 162                     * protocol   The protocol that will be used for the actual
 163                                  download, lower-case. One of "http", "https" or
 164                                  one of the protocols defined in downloader.PROTOCOL_MAP
 165                     * fragment_base_url
 166                                  Base URL for fragments. Each fragment's path
 167                                  value (if present) will be relative to
 168                                  this URL.
 169                     * fragments  A list of fragments of a fragmented media.
 170                                  Each fragment entry must contain either an url
 171                                  or a path. If an url is present it should be
 172                                  considered by a client. Otherwise both path and
 173                                  fragment_base_url must be present. Here is
 174                                  the list of all potential fields:
 175                                  * "url" - fragment's URL
 176                                  * "path" - fragment's path relative to
 177                                             fragment_base_url
 178                                  * "duration" (optional, int or float)
 179                                  * "filesize" (optional, int)
 180                     * is_from_start  Is a live format that can be downloaded
 181                                 from the start. Boolean
 182                     * preference Order number of this format. If this field is
 183                                  present and not None, the formats get sorted
 184                                  by this field, regardless of all other values.
 185                                  -1 for default (order by other properties),
 186                                  -2 or smaller for less than default.
 187                                  < -1000 to hide the format (if there is
 188                                     another one which is strictly better)
 189                     * language   Language code, e.g. "de" or "en-US".
 190                     * language_preference  Is this in the language mentioned in
 191                                  the URL?
 192                                  10 if it's what the URL is about,
 193                                  -1 for default (don't know),
 194                                  -10 otherwise, other values reserved for now.
 195                     * quality    Order number of the video quality of this
 196                                  format, irrespective of the file format.
 197                                  -1 for default (order by other properties),
 198                                  -2 or smaller for less than default.
 199                     * source_preference  Order number for this video source
 200                                   (quality takes higher priority)
 201                                  -1 for default (order by other properties),
 202                                  -2 or smaller for less than default.
 203                     * http_headers  A dictionary of additional HTTP headers
 204                                  to add to the request.
 205                     * stretched_ratio  If given and not 1, indicates that the
 206                                  video's pixels are not square.
 207                                  width : height ratio as float.
 208                     * no_resume  The server does not support resuming the
 209                                  (HTTP or RTMP) download. Boolean.
 210                     * has_drm    The format has DRM and cannot be downloaded. Boolean
 211                     * downloader_options  A dictionary of downloader options as
 212                                  described in FileDownloader (For internal use only)
 213                     RTMP formats can also have the additional fields: page_url,
 214                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 215                     rtmp_protocol, rtmp_real_time
 216
 217     url:            Final video URL.
 218     ext:            Video filename extension.
 219     format:         The video format, defaults to ext (used for --get-format)
 220     player_url:     SWF Player URL (used for rtmpdump).
 221
 222     The following fields are optional:
 223
 224     direct:         True if a direct video file was given (must only be set by GenericIE)
 225     alt_title:      A secondary title of the video.
 226     display_id      An alternative identifier for the video, not necessarily
 227                     unique, but available before title. Typically, id is
 228                     something like "4234987", title "Dancing naked mole rats",
 229                     and display_id "dancing-naked-mole-rats"
 230     thumbnails:     A list of dictionaries, with the following entries:
 231                         * "id" (optional, string) - Thumbnail format ID
 232                         * "url"
 233                         * "preference" (optional, int) - quality of the image
 234                         * "width" (optional, int)
 235                         * "height" (optional, int)
 236                         * "resolution" (optional, string "{width}x{height}",
 237                                         deprecated)
 238                         * "filesize" (optional, int)
 239                         * "http_headers" (dict) - HTTP headers for the request
 240     thumbnail:      Full URL to a video thumbnail image.
 241     description:    Full video description.
 242     uploader:       Full name of the video uploader.
 243     license:        License name the video is licensed under.
 244     creator:        The creator of the video.
 245     timestamp:      UNIX timestamp of the moment the video was uploaded
 246     upload_date:    Video upload date in UTC (YYYYMMDD).
 247                     If not explicitly set, calculated from timestamp
 248     release_timestamp: UNIX timestamp of the moment the video was released.
 249                     If it is not clear whether to use timestamp or this, use the former
 250     release_date:   The date (YYYYMMDD) when the video was released in UTC.
 251                     If not explicitly set, calculated from release_timestamp
 252     modified_timestamp: UNIX timestamp of the moment the video was last modified.
 253     modified_date:   The date (YYYYMMDD) when the video was last modified in UTC.
 254                     If not explicitly set, calculated from modified_timestamp
 255     uploader_id:    Nickname or id of the video uploader.
 256     uploader_url:   Full URL to a personal webpage of the video uploader.
 257     channel:        Full name of the channel the video is uploaded on.
 258                     Note that channel fields may or may not repeat uploader
 259                     fields. This depends on a particular extractor.
 260     channel_id:     Id of the channel.
 261     channel_url:    Full URL to a channel webpage.
 262     channel_follower_count: Number of followers of the channel.
 263     location:       Physical location where the video was filmed.
 264     subtitles:      The available subtitles as a dictionary in the format
 265                     {tag: subformats}. "tag" is usually a language code, and
 266                     "subformats" is a list sorted from lower to higher
 267                     preference, each element is a dictionary with the "ext"
 268                     entry and one of:
 269                         * "data": The subtitles file contents
 270                         * "url": A URL pointing to the subtitles file
 271                     It can optionally also have:
 272                         * "name": Name or description of the subtitles
 273                         * "http_headers": A dictionary of additional HTTP headers
 274                                   to add to the request.
 275                     "ext" will be calculated from URL if missing
 276     automatic_captions: Like 'subtitles'; contains automatically generated
 277                     captions instead of normal subtitles
 278     duration:       Length of the video in seconds, as an integer or float.
 279     view_count:     How many users have watched the video on the platform.
 280     like_count:     Number of positive ratings of the video
 281     dislike_count:  Number of negative ratings of the video
 282     repost_count:   Number of reposts of the video
  283     average_rating: Average rating given by users, the scale used depends on the webpage
 284     comment_count:  Number of comments on the video
 285     comments:       A list of comments, each with one or more of the following
 286                     properties (all but one of text or html optional):
 287                         * "author" - human-readable name of the comment author
 288                         * "author_id" - user ID of the comment author
 289                         * "author_thumbnail" - The thumbnail of the comment author
 290                         * "id" - Comment ID
 291                         * "html" - Comment as HTML
 292                         * "text" - Plain text of the comment
 293                         * "timestamp" - UNIX timestamp of comment
 294                         * "parent" - ID of the comment this one is replying to.
 295                                      Set to "root" to indicate that this is a
 296                                      comment to the original video.
 297                         * "like_count" - Number of positive ratings of the comment
 298                         * "dislike_count" - Number of negative ratings of the comment
 299                         * "is_favorited" - Whether the comment is marked as
 300                                            favorite by the video uploader
 301                         * "author_is_uploader" - Whether the comment is made by
 302                                                  the video uploader
 303     age_limit:      Age restriction for the video, as an integer (years)
 304     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 305                     should allow to get the same result again. (It will be set
 306                     by YoutubeDL if it's missing)
 307     categories:     A list of categories that the video falls in, for example
 308                     ["Sports", "Berlin"]
 309     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 310     cast:           A list of the video cast
 311     is_live:        True, False, or None (=unknown). Whether this video is a
 312                     live stream that goes on instead of a fixed-length video.
 313     was_live:       True, False, or None (=unknown). Whether this video was
 314                     originally a live stream.
 315     live_status:    'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
 316                     If absent, automatically set from is_live, was_live
 317     start_time:     Time in seconds where the reproduction should start, as
 318                     specified in the URL.
 319     end_time:       Time in seconds where the reproduction should end, as
 320                     specified in the URL.
 321     chapters:       A list of dictionaries, with the following entries:
 322                         * "start_time" - The start time of the chapter in seconds
 323                         * "end_time" - The end time of the chapter in seconds
 324                         * "title" (optional, string)
 325     playable_in_embed: Whether this video is allowed to play in embedded
 326                     players on other sites. Can be True (=always allowed),
 327                     False (=never allowed), None (=unknown), or a string
 328                     specifying the criteria for embedability (Eg: 'whitelist')
 329     availability:   Under what condition the video is available. One of
 330                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 331                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 332                     to set it
 333     __post_extractor: A function to be called just before the metadata is
 334                     written to either disk, logger or console. The function
 335                     must return a dict which will be added to the info_dict.
  336                     This is useful for additional information that is
 337                     time-consuming to extract. Note that the fields thus
 338                     extracted will not be available to output template and
 339                     match_filter. So, only "comments" and "comment_count" are
 340                     currently allowed to be extracted via this method.
 341
 342     The following fields should only be used when the video belongs to some logical
 343     chapter or section:
 344
 345     chapter:        Name or title of the chapter the video belongs to.
 346     chapter_number: Number of the chapter the video belongs to, as an integer.
 347     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 348
 349     The following fields should only be used when the video is an episode of some
 350     series, programme or podcast:
 351
 352     series:         Title of the series or programme the video episode belongs to.
 353     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 354     season:         Title of the season the video episode belongs to.
 355     season_number:  Number of the season the video episode belongs to, as an integer.
 356     season_id:      Id of the season the video episode belongs to, as a unicode string.
 357     episode:        Title of the video episode. Unlike mandatory video title field,
 358                     this field should denote the exact title of the video episode
 359                     without any kind of decoration.
 360     episode_number: Number of the video episode within a season, as an integer.
 361     episode_id:     Id of the video episode, as a unicode string.
 362
 363     The following fields should only be used when the media is a track or a part of
 364     a music album:
 365
 366     track:          Title of the track.
 367     track_number:   Number of the track within an album or a disc, as an integer.
 368     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 369                     as a unicode string.
 370     artist:         Artist(s) of the track.
 371     genre:          Genre(s) of the track.
 372     album:          Title of the album the track belongs to.
 373     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 374     album_artist:   List of all artists appeared on the album (e.g.
 375                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 376                     and compilations).
 377     disc_number:    Number of the disc or other physical medium the track belongs to,
 378                     as an integer.
 379     release_year:   Year (YYYY) when the album was released.
 380     composer:       Composer of the piece
 381
 382     Unless mentioned otherwise, the fields should be Unicode strings.
 383
 384     Unless mentioned otherwise, None is equivalent to absence of information.
 385
 386
 387     _type "playlist" indicates multiple videos.
 388     There must be a key "entries", which is a list, an iterable, or a PagedList
 389     object, each element of which is a valid dictionary by this specification.
 390
  391     Additionally, playlists can have "id", "title", and any other relevant
 392     attributes with the same semantics as videos (see above).
 393
 394     It can also have the following optional fields:
 395
 396     playlist_count: The total number of videos in a playlist. If not given,
 397                     YoutubeDL tries to calculate it from "entries"
 398
 399
 400     _type "multi_video" indicates that there are multiple videos that
  401     form a single show, for example multiple acts of an opera or TV episode.
 402     It must have an entries key like a playlist and contain all the keys
 403     required for a video at the same time.
 404
 405
 406     _type "url" indicates that the video must be extracted from another
 407     location, possibly by a different extractor. Its only required key is:
 408     "url" - the next URL to extract.
 409     The key "ie_key" can be set to the class name (minus the trailing "IE",
 410     e.g. "Youtube") if the extractor class is known in advance.
 411     Additionally, the dictionary may have any properties of the resolved entity
 412     known in advance, for example "title" if the title of the referred video is
 413     known ahead of time.
 414
 415
 416     _type "url_transparent" entities have the same specification as "url", but
 417     indicate that the given additional information is more precise than the one
 418     associated with the resolved URL.
 419     This is useful when a site employs a video service that hosts the video and
 420     its technical metadata, but that video service does not embed a useful
 421     title, description etc.
 422
 423
 424     Subclasses of this should define a _VALID_URL regexp and, re-define the
 425     _real_extract() and (optionally) _real_initialize() methods.
 426     Probably, they should also be added to the list of extractors.
 427
 428     Subclasses may also override suitable() if necessary, but ensure the function
 429     signature is preserved and that this function imports everything it needs
 430     (except other extractors), so that lazy_extractors works correctly.
 431
 432     To support username + password (or netrc) login, the extractor must define a
 433     _NETRC_MACHINE and re-define _perform_login(username, password) and
 434     (optionally) _initialize_pre_login() methods. The _perform_login method will
 435     be called between _initialize_pre_login and _real_initialize if credentials
 436     are passed by the user. In cases where it is necessary to have the login
 437     process as part of the extraction rather than initialization, _perform_login
 438     can be left undefined.
 439
 440     _GEO_BYPASS attribute may be set to False in order to disable
 441     geo restriction bypass mechanisms for a particular extractor.
 442     Though it won't disable explicit geo restriction bypass based on
 443     country code provided with geo_bypass_country.
 444
 445     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 446     countries for this extractor. One of these countries will be used by
 447     geo restriction bypass mechanism right away in order to bypass
 448     geo restriction, of course, if the mechanism is not disabled.
 449
 450     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 451     IP blocks in CIDR notation for this extractor. One of these IP blocks
 452     will be used by geo restriction bypass mechanism similarly
 453     to _GEO_COUNTRIES.
 454
 455     The _WORKING attribute should be set to False for broken IEs
 456     in order to warn the users and skip the tests.
 457     """
 458
    # Class-level defaults; __init__() and initialize() override some of them per instance.

    # Becomes True at the end of initialize(), so the login/initialization steps run only once
    _ready = False
    # Downloader this extractor reports to (the constructor docstring calls it a YoutubeDL instance)
    _downloader = None
    # Fake source IP picked by _initialize_geo_bypass(); None while no bypass IP has been chosen
    _x_forwarded_for_ip = None
    # Geo bypass switches, see the class docstring (_GEO_BYPASS, _GEO_COUNTRIES, _GEO_IP_BLOCKS)
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    # Set to False in broken extractors (see the class docstring)
    _WORKING = True
    # A truthy value makes supports_login() return True
    _NETRC_MACHINE = None
    # NOTE(review): presumably a human-readable description of the extractor; initialize()
    # only checks whether it is False before warning about unsupported password login
    IE_DESC = None

    # Canned hints telling the user how to supply credentials, keyed by the kind of
    # authentication that is accepted. initialize() appends the 'cookies' hint to its warning.
    _LOGIN_HINTS = {
        'any': 'Use --cookies, --cookies-from-browser, --username and --password, or --netrc to provide account credentials',
        'cookies': (
            'Use --cookies-from-browser or --cookies for the authentication. '
            'See  https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl  for how to manually pass cookies'),
        'password': 'Use --username and --password, or --netrc to provide account credentials',
    }
 476
 477     def __init__(self, downloader=None):
 478         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 479         If a downloader is not passed during initialization,
 480         it must be set using "set_downloader()" before "extract()" is called"""
 481         self._ready = False
 482         self._x_forwarded_for_ip = None
 483         self._printed_messages = set()
 484         self.set_downloader(downloader)
 485
 486     @classmethod
 487     def _match_valid_url(cls, url):
 488         # This does not use has/getattr intentionally - we want to know whether
 489         # we have cached the regexp for *this* class, whereas getattr would also
 490         # match the superclass
 491         if '_VALID_URL_RE' not in cls.__dict__:
 492             if '_VALID_URL' not in cls.__dict__:
 493                 cls._VALID_URL = cls._make_valid_url()
 494             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 495         return cls._VALID_URL_RE.match(url)
 496
 497     @classmethod
 498     def suitable(cls, url):
 499         """Receives a URL and returns True if suitable for this IE."""
 500         # This function must import everything it needs (except other extractors),
 501         # so that lazy_extractors works correctly
 502         return cls._match_valid_url(url) is not None
 503
 504     @classmethod
 505     def _match_id(cls, url):
 506         return cls._match_valid_url(url).group('id')
 507
 508     @classmethod
 509     def get_temp_id(cls, url):
 510         try:
 511             return cls._match_id(url)
 512         except (IndexError, AttributeError):
 513             return None
 514
 515     @classmethod
 516     def working(cls):
 517         """Getter method for _WORKING."""
 518         return cls._WORKING
 519
 520     @classmethod
 521     def supports_login(cls):
 522         return bool(cls._NETRC_MACHINE)
 523
 524     def initialize(self):
 525         """Initializes an instance (authentication, etc)."""
 526         self._printed_messages = set()
 527         self._initialize_geo_bypass({
 528             'countries': self._GEO_COUNTRIES,
 529             'ip_blocks': self._GEO_IP_BLOCKS,
 530         })
 531         if not self._ready:
 532             self._initialize_pre_login()
 533             if self.supports_login():
 534                 username, password = self._get_login_info()
 535                 if username:
 536                     self._perform_login(username, password)
 537             elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
 538                 self.report_warning(f'Login with password is not supported for this website. {self._LOGIN_HINTS["cookies"]}')
 539             self._real_initialize()
 540             self._ready = True
 541
 542     def _initialize_geo_bypass(self, geo_bypass_context):
 543         """
 544         Initialize geo restriction bypass mechanism.
 545
 546         This method is used to initialize geo bypass mechanism based on faking
 547         X-Forwarded-For HTTP header. A random country from provided country list
 548         is selected and a random IP belonging to this country is generated. This
 549         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 550         HTTP requests.
 551
 552         This method will be used for initial geo bypass mechanism initialization
 553         during the instance initialization with _GEO_COUNTRIES and
 554         _GEO_IP_BLOCKS.
 555
 556         You may also manually call it from extractor's code if geo bypass
 557         information is not available beforehand (e.g. obtained during
 558         extraction) or due to some other reason. In this case you should pass
 559         this information in geo bypass context passed as first argument. It may
 560         contain following fields:
 561
 562         countries:  List of geo unrestricted countries (similar
 563                     to _GEO_COUNTRIES)
 564         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 565                     (similar to _GEO_IP_BLOCKS)
 566
 567         """
 568         if not self._x_forwarded_for_ip:
 569
 570             # Geo bypass mechanism is explicitly disabled by user
 571             if not self.get_param('geo_bypass', True):
 572                 return
 573
 574             if not geo_bypass_context:
 575                 geo_bypass_context = {}
 576
 577             # Backward compatibility: previously _initialize_geo_bypass
 578             # expected a list of countries, some 3rd party code may still use
 579             # it this way
 580             if isinstance(geo_bypass_context, (list, tuple)):
 581                 geo_bypass_context = {
 582                     'countries': geo_bypass_context,
 583                 }
 584
 585             # The whole point of geo bypass mechanism is to fake IP
 586             # as X-Forwarded-For HTTP header based on some IP block or
 587             # country code.
 588
 589             # Path 1: bypassing based on IP block in CIDR notation
 590
 591             # Explicit IP block specified by user, use it right away
 592             # regardless of whether extractor is geo bypassable or not
 593             ip_block = self.get_param('geo_bypass_ip_block', None)
 594
 595             # Otherwise use random IP block from geo bypass context but only
 596             # if extractor is known as geo bypassable
 597             if not ip_block:
 598                 ip_blocks = geo_bypass_context.get('ip_blocks')
 599                 if self._GEO_BYPASS and ip_blocks:
 600                     ip_block = random.choice(ip_blocks)
 601
 602             if ip_block:
 603                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 604                 self._downloader.write_debug(
 605                     '[debug] Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
 606                 return
 607
 608             # Path 2: bypassing based on country code
 609
 610             # Explicit country code specified by user, use it right away
 611             # regardless of whether extractor is geo bypassable or not
 612             country = self.get_param('geo_bypass_country', None)
 613
 614             # Otherwise use random country code from geo bypass context but
 615             # only if extractor is known as geo bypassable
 616             if not country:
 617                 countries = geo_bypass_context.get('countries')
 618                 if self._GEO_BYPASS and countries:
 619                     country = random.choice(countries)
 620
 621             if country:
 622                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 623                 self._downloader.write_debug(
 624                     f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
 625
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts.

        Runs initialize() followed by _real_extract(url). If a
        GeoRestrictedError is raised and a fake X-Forwarded-For IP can be
        picked from the countries carried by that error, the whole sequence
        is retried once.

        The value returned by _real_extract is passed back to the caller
        (None included). A non-None result gets the '__x_forwarded_for_ip'
        key set when a fake IP is in use, and loses its 'live_chat'
        subtitles when the 'no-live-chat' compat option is set.

        Raises an ExtractorError (or the same subclass that was raised
        originally) enriched with video id, extractor name and traceback.
        UnsupportedError is re-raised as is.
        """
        try:
            # At most two attempts: the second one only happens after a geo
            # restriction error made us pick a fake X-Forwarded-For IP
            for _ in range(2):
                try:
                    self.initialize()
                    self.write_debug('Extracting URL: %s' % url)
                    ie_result = self._real_extract(url)
                    if ie_result is None:
                        return None
                    if self._x_forwarded_for_ip:
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    subtitles = ie_result.get('subtitles')
                    if (subtitles and 'live_chat' in subtitles
                            and 'no-live-chat' in self.get_param('compat_opts', [])):
                        del subtitles['live_chat']
                    return ie_result
                except GeoRestrictedError as e:
                    # Retry only if a fake IP was actually set up; otherwise
                    # let the error reach the handlers below
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except UnsupportedError:
            # Handled before ExtractorError so that it is not re-created
            # below (presumably UnsupportedError is an ExtractorError
            # subclass - confirm in utils)
            raise
        except ExtractorError as e:
            # Re-create the error with the same type and original message,
            # filling in the video id, extractor name and traceback
            kwargs = {
                'video_id': e.video_id or self.get_temp_id(url),
                'ie': self.IE_NAME,
                'tb': e.traceback or sys.exc_info()[2],
                'expected': e.expected,
                'cause': e.cause
            }
            if hasattr(e, 'countries'):
                kwargs['countries'] = e.countries
            raise type(e)(e.orig_msg, **kwargs)
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 664
 665     def __maybe_fake_ip_and_retry(self, countries):
 666         if (not self.get_param('geo_bypass_country', None)
 667                 and self._GEO_BYPASS
 668                 and self.get_param('geo_bypass', True)
 669                 and not self._x_forwarded_for_ip
 670                 and countries):
 671             country_code = random.choice(countries)
 672             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 673             if self._x_forwarded_for_ip:
 674                 self.report_warning(
 675                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 676                     % (self._x_forwarded_for_ip, country_code.upper()))
 677                 return True
 678         return False
 679
 680     def set_downloader(self, downloader):
 681         """Sets a YoutubeDL instance as the downloader for this IE."""
 682         self._downloader = downloader
 683
 684     def _initialize_pre_login(self):
 685         """ Intialization before login. Redefine in subclasses."""
 686         pass
 687
 688     def _perform_login(self, username, password):
 689         """ Login with username and password. Redefine in subclasses."""
 690         pass
 691
 692     def _real_initialize(self):
 693         """Real initialization process. Redefine in subclasses."""
 694         pass
 695
 696     def _real_extract(self, url):
 697         """Real extraction process. Redefine in subclasses."""
 698         raise NotImplementedError('This method must be implemented by subclasses')
 699
 700     @classmethod
 701     def ie_key(cls):
 702         """A string for getting the InfoExtractor with get_info_extractor"""
 703         return cls.__name__[:-2]
 704
 705     @property
 706     def IE_NAME(self):
 707         return compat_str(type(self).__name__[:-2])
 708
 709     @staticmethod
 710     def __can_accept_status_code(err, expected_status):
 711         assert isinstance(err, compat_urllib_error.HTTPError)
 712         if expected_status is None:
 713             return False
 714         elif callable(expected_status):
 715             return expected_status(err.code) is True
 716         else:
 717             return err.code in variadic(expected_status)
 718
 719     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
 720         """
 721         Return the response handle.
 722
 723         See _download_webpage docstring for arguments specification.
 724         """
 725         if not self._downloader._first_webpage_request:
 726             sleep_interval = self.get_param('sleep_interval_requests') or 0
 727             if sleep_interval > 0:
 728                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 729                 time.sleep(sleep_interval)
 730         else:
 731             self._downloader._first_webpage_request = False
 732
 733         if note is None:
 734             self.report_download_webpage(video_id)
 735         elif note is not False:
 736             if video_id is None:
 737                 self.to_screen(str(note))
 738             else:
 739                 self.to_screen(f'{video_id}: {note}')
 740
 741         # Some sites check X-Forwarded-For HTTP header in order to figure out
 742         # the origin of the client behind proxy. This allows bypassing geo
 743         # restriction by faking this header's value to IP that belongs to some
 744         # geo unrestricted country. We will do so once we encounter any
 745         # geo restriction error.
 746         if self._x_forwarded_for_ip:
 747             if 'X-Forwarded-For' not in headers:
 748                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 749
 750         if isinstance(url_or_request, compat_urllib_request.Request):
 751             url_or_request = update_Request(
 752                 url_or_request, data=data, headers=headers, query=query)
 753         else:
 754             if query:
 755                 url_or_request = update_url_query(url_or_request, query)
 756             if data is not None or headers:
 757                 url_or_request = sanitized_Request(url_or_request, data, headers)
 758         try:
 759             return self._downloader.urlopen(url_or_request)
 760         except network_exceptions as err:
 761             if isinstance(err, compat_urllib_error.HTTPError):
 762                 if self.__can_accept_status_code(err, expected_status):
 763                     # Retain reference to error to prevent file object from
 764                     # being closed before it can be read. Works around the
 765                     # effects of <https://bugs.python.org/issue15002>
 766                     # introduced in Python 3.4.1.
 767                     err.fp._error = err
 768                     return err.fp
 769
 770             if errnote is False:
 771                 return False
 772             if errnote is None:
 773                 errnote = 'Unable to download webpage'
 774
 775             errmsg = f'{errnote}: {error_to_compat_str(err)}'
 776             if fatal:
 777                 raise ExtractorError(errmsg, cause=err)
 778             else:
 779                 self.report_warning(errmsg)
 780                 return False
 781
 782     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
 783         """
 784         Return a tuple (page content as string, URL handle).
 785
 786         See _download_webpage docstring for arguments specification.
 787         """
 788         # Strip hashes from the URL (#1038)
 789         if isinstance(url_or_request, (compat_str, str)):
 790             url_or_request = url_or_request.partition('#')[0]
 791
 792         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 793         if urlh is False:
 794             assert not fatal
 795             return False
 796         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 797         return (content, urlh)
 798
 799     @staticmethod
 800     def _guess_encoding_from_content(content_type, webpage_bytes):
 801         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 802         if m:
 803             encoding = m.group(1)
 804         else:
 805             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 806                           webpage_bytes[:1024])
 807             if m:
 808                 encoding = m.group(1).decode('ascii')
 809             elif webpage_bytes.startswith(b'\xff\xfe'):
 810                 encoding = 'utf-16'
 811             else:
 812                 encoding = 'utf-8'
 813
 814         return encoding
 815
 816     def __check_blocked(self, content):
 817         first_block = content[:512]
 818         if ('<title>Access to this site is blocked</title>' in content
 819                 and 'Websense' in first_block):
 820             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 821             blocked_iframe = self._html_search_regex(
 822                 r'<iframe src="([^"]+)"', content,
 823                 'Websense information URL', default=None)
 824             if blocked_iframe:
 825                 msg += ' Visit %s for more details' % blocked_iframe
 826             raise ExtractorError(msg, expected=True)
 827         if '<title>The URL you requested has been blocked</title>' in first_block:
 828             msg = (
 829                 'Access to this webpage has been blocked by Indian censorship. '
 830                 'Use a VPN or proxy server (with --proxy) to route around it.')
 831             block_msg = self._html_search_regex(
 832                 r'</h1><p>(.*?)</p>',
 833                 content, 'block message', default=None)
 834             if block_msg:
 835                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 836             raise ExtractorError(msg, expected=True)
 837         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 838                 and 'blocklist.rkn.gov.ru' in content):
 839             raise ExtractorError(
 840                 'Access to this webpage has been blocked by decision of the Russian government. '
 841                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 842                 expected=True)
 843
 844     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 845         content_type = urlh.headers.get('Content-Type', '')
 846         webpage_bytes = urlh.read()
 847         if prefix is not None:
 848             webpage_bytes = prefix + webpage_bytes
 849         if not encoding:
 850             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 851         if self.get_param('dump_intermediate_pages', False):
 852             self.to_screen('Dumping request to ' + urlh.geturl())
 853             dump = base64.b64encode(webpage_bytes).decode('ascii')
 854             self._downloader.to_screen(dump)
 855         if self.get_param('write_pages', False):
 856             basen = f'{video_id}_{urlh.geturl()}'
 857             trim_length = self.get_param('trim_file_name') or 240
 858             if len(basen) > trim_length:
 859                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 860                 basen = basen[:trim_length - len(h)] + h
 861             raw_filename = basen + '.dump'
 862             filename = sanitize_filename(raw_filename, restricted=True)
 863             self.to_screen('Saving request to ' + filename)
 864             # Working around MAX_PATH limitation on Windows (see
 865             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 866             if compat_os_name == 'nt':
 867                 absfilepath = os.path.abspath(filename)
 868                 if len(absfilepath) > 259:
 869                     filename = '\\\\?\\' + absfilepath
 870             with open(filename, 'wb') as outf:
 871                 outf.write(webpage_bytes)
 872
 873         try:
 874             content = webpage_bytes.decode(encoding, 'replace')
 875         except LookupError:
 876             content = webpage_bytes.decode('utf-8', 'replace')
 877
 878         self.__check_blocked(content)
 879
 880         return content
 881
 882     def _download_webpage(
 883             self, url_or_request, video_id, note=None, errnote=None,
 884             fatal=True, tries=1, timeout=5, encoding=None, data=None,
 885             headers={}, query={}, expected_status=None):
 886         """
 887         Return the data of the page as a string.
 888
 889         Arguments:
 890         url_or_request -- plain text URL as a string or
 891             a compat_urllib_request.Requestobject
 892         video_id -- Video/playlist/item identifier (string)
 893
 894         Keyword arguments:
 895         note -- note printed before downloading (string)
 896         errnote -- note printed in case of an error (string)
 897         fatal -- flag denoting whether error should be considered fatal,
 898             i.e. whether it should cause ExtractionError to be raised,
 899             otherwise a warning will be reported and extraction continued
 900         tries -- number of tries
 901         timeout -- sleep interval between tries
 902         encoding -- encoding for a page content decoding, guessed automatically
 903             when not explicitly specified
 904         data -- POST data (bytes)
 905         headers -- HTTP headers (dict)
 906         query -- URL query (dict)
 907         expected_status -- allows to accept failed HTTP requests (non 2xx
 908             status code) by explicitly specifying a set of accepted status
 909             codes. Can be any of the following entities:
 910                 - an integer type specifying an exact failed status code to
 911                   accept
 912                 - a list or a tuple of integer types specifying a list of
 913                   failed status codes to accept
 914                 - a callable accepting an actual failed status code and
 915                   returning True if it should be accepted
 916             Note that this argument does not affect success status codes (2xx)
 917             which are always accepted.
 918         """
 919
 920         success = False
 921         try_count = 0
 922         while success is False:
 923             try:
 924                 res = self._download_webpage_handle(
 925                     url_or_request, video_id, note, errnote, fatal,
 926                     encoding=encoding, data=data, headers=headers, query=query,
 927                     expected_status=expected_status)
 928                 success = True
 929             except compat_http_client.IncompleteRead as e:
 930                 try_count += 1
 931                 if try_count >= tries:
 932                     raise e
 933                 self._sleep(timeout, video_id)
 934         if res is False:
 935             return res
 936         else:
 937             content, _ = res
 938             return content
 939
 940     def _download_xml_handle(
 941             self, url_or_request, video_id, note='Downloading XML',
 942             errnote='Unable to download XML', transform_source=None,
 943             fatal=True, encoding=None, data=None, headers={}, query={},
 944             expected_status=None):
 945         """
 946         Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle).
 947
 948         See _download_webpage docstring for arguments specification.
 949         """
 950         res = self._download_webpage_handle(
 951             url_or_request, video_id, note, errnote, fatal=fatal,
 952             encoding=encoding, data=data, headers=headers, query=query,
 953             expected_status=expected_status)
 954         if res is False:
 955             return res
 956         xml_string, urlh = res
 957         return self._parse_xml(
 958             xml_string, video_id, transform_source=transform_source,
 959             fatal=fatal), urlh
 960
 961     def _download_xml(
 962             self, url_or_request, video_id,
 963             note='Downloading XML', errnote='Unable to download XML',
 964             transform_source=None, fatal=True, encoding=None,
 965             data=None, headers={}, query={}, expected_status=None):
 966         """
 967         Return the xml as an xml.etree.ElementTree.Element.
 968
 969         See _download_webpage docstring for arguments specification.
 970         """
 971         res = self._download_xml_handle(
 972             url_or_request, video_id, note=note, errnote=errnote,
 973             transform_source=transform_source, fatal=fatal, encoding=encoding,
 974             data=data, headers=headers, query=query,
 975             expected_status=expected_status)
 976         return res if res is False else res[0]
 977
 978     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 979         if transform_source:
 980             xml_string = transform_source(xml_string)
 981         try:
 982             return compat_etree_fromstring(xml_string.encode('utf-8'))
 983         except xml.etree.ElementTree.ParseError as ve:
 984             errmsg = '%s: Failed to parse XML ' % video_id
 985             if fatal:
 986                 raise ExtractorError(errmsg, cause=ve)
 987             else:
 988                 self.report_warning(errmsg + str(ve))
 989
 990     def _download_json_handle(
 991             self, url_or_request, video_id, note='Downloading JSON metadata',
 992             errnote='Unable to download JSON metadata', transform_source=None,
 993             fatal=True, encoding=None, data=None, headers={}, query={},
 994             expected_status=None):
 995         """
 996         Return a tuple (JSON object, URL handle).
 997
 998         See _download_webpage docstring for arguments specification.
 999         """
1000         res = self._download_webpage_handle(
1001             url_or_request, video_id, note, errnote, fatal=fatal,
1002             encoding=encoding, data=data, headers=headers, query=query,
1003             expected_status=expected_status)
1004         if res is False:
1005             return res
1006         json_string, urlh = res
1007         return self._parse_json(
1008             json_string, video_id, transform_source=transform_source,
1009             fatal=fatal), urlh
1010
1011     def _download_json(
1012             self, url_or_request, video_id, note='Downloading JSON metadata',
1013             errnote='Unable to download JSON metadata', transform_source=None,
1014             fatal=True, encoding=None, data=None, headers={}, query={},
1015             expected_status=None):
1016         """
1017         Return the JSON object as a dict.
1018
1019         See _download_webpage docstring for arguments specification.
1020         """
1021         res = self._download_json_handle(
1022             url_or_request, video_id, note=note, errnote=errnote,
1023             transform_source=transform_source, fatal=fatal, encoding=encoding,
1024             data=data, headers=headers, query=query,
1025             expected_status=expected_status)
1026         return res if res is False else res[0]
1027
1028     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
1029         if transform_source:
1030             json_string = transform_source(json_string)
1031         try:
1032             return json.loads(json_string, strict=False)
1033         except ValueError as ve:
1034             errmsg = '%s: Failed to parse JSON ' % video_id
1035             if fatal:
1036                 raise ExtractorError(errmsg, cause=ve)
1037             else:
1038                 self.report_warning(errmsg + str(ve))
1039
1040     def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
1041         return self._parse_json(
1042             data[data.find('{'):data.rfind('}') + 1],
1043             video_id, transform_source, fatal)
1044
1045     def _download_socket_json_handle(
1046             self, url_or_request, video_id, note='Polling socket',
1047             errnote='Unable to poll socket', transform_source=None,
1048             fatal=True, encoding=None, data=None, headers={}, query={},
1049             expected_status=None):
1050         """
1051         Return a tuple (JSON object, URL handle).
1052
1053         See _download_webpage docstring for arguments specification.
1054         """
1055         res = self._download_webpage_handle(
1056             url_or_request, video_id, note, errnote, fatal=fatal,
1057             encoding=encoding, data=data, headers=headers, query=query,
1058             expected_status=expected_status)
1059         if res is False:
1060             return res
1061         webpage, urlh = res
1062         return self._parse_socket_response_as_json(
1063             webpage, video_id, transform_source=transform_source,
1064             fatal=fatal), urlh
1065
1066     def _download_socket_json(
1067             self, url_or_request, video_id, note='Polling socket',
1068             errnote='Unable to poll socket', transform_source=None,
1069             fatal=True, encoding=None, data=None, headers={}, query={},
1070             expected_status=None):
1071         """
1072         Return the JSON object as a dict.
1073
1074         See _download_webpage docstring for arguments specification.
1075         """
1076         res = self._download_socket_json_handle(
1077             url_or_request, video_id, note=note, errnote=errnote,
1078             transform_source=transform_source, fatal=fatal, encoding=encoding,
1079             data=data, headers=headers, query=query,
1080             expected_status=expected_status)
1081         return res if res is False else res[0]
1082
1083     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1084         idstr = format_field(video_id, template='%s: ')
1085         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1086         if only_once:
1087             if f'WARNING: {msg}' in self._printed_messages:
1088                 return
1089             self._printed_messages.add(f'WARNING: {msg}')
1090         self._downloader.report_warning(msg, *args, **kwargs)
1091
1092     def to_screen(self, msg, *args, **kwargs):
1093         """Print msg to screen, prefixing it with '[ie_name]'"""
1094         self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1095
1096     def write_debug(self, msg, *args, **kwargs):
1097         self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1098
1099     def get_param(self, name, default=None, *args, **kwargs):
1100         if self._downloader:
1101             return self._downloader.params.get(name, default, *args, **kwargs)
1102         return default
1103
1104     def report_drm(self, video_id, partial=False):
1105         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1106
1107     def report_extraction(self, id_or_name):
1108         """Report information extraction."""
1109         self.to_screen('%s: Extracting information' % id_or_name)
1110
1111     def report_download_webpage(self, video_id):
1112         """Report webpage download."""
1113         self.to_screen('%s: Downloading webpage' % video_id)
1114
1115     def report_age_confirmation(self):
1116         """Report attempt to confirm age."""
1117         self.to_screen('Confirming age')
1118
1119     def report_login(self):
1120         """Report attempt to log in."""
1121         self.to_screen('Logging in')
1122
1123     def raise_login_required(
1124             self, msg='This video is only available for registered users',
1125             metadata_available=False, method=NO_DEFAULT):
1126         if metadata_available and (
1127                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1128             self.report_warning(msg)
1129             return
1130         if method is NO_DEFAULT:
1131             method = 'any' if self.supports_login() else 'cookies'
1132         if method is not None:
1133             assert method in self._LOGIN_HINTS, 'Invalid login method'
1134             msg = f'{msg}. {self._LOGIN_HINTS[method]}'
1135         raise ExtractorError(msg, expected=True)
1136
1137     def raise_geo_restricted(
1138             self, msg='This video is not available from your location due to geo restriction',
1139             countries=None, metadata_available=False):
1140         if metadata_available and (
1141                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1142             self.report_warning(msg)
1143         else:
1144             raise GeoRestrictedError(msg, countries=countries)
1145
1146     def raise_no_formats(self, msg, expected=False, video_id=None):
1147         if expected and (
1148                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1149             self.report_warning(msg, video_id)
1150         elif isinstance(msg, ExtractorError):
1151             raise msg
1152         else:
1153             raise ExtractorError(msg, expected=expected, video_id=video_id)
1154
1155     # Methods for following #608
1156     @staticmethod
1157     def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1158         """Returns a URL that points to a page that should be processed"""
1159         if ie is not None:
1160             kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1161         if video_id is not None:
1162             kwargs['id'] = video_id
1163         if video_title is not None:
1164             kwargs['title'] = video_title
1165         return {
1166             **kwargs,
1167             '_type': 'url_transparent' if url_transparent else 'url',
1168             'url': url,
1169         }
1170
1171     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None, video_kwargs=None, **kwargs):
1172         urls = (self.url_result(self._proto_relative_url(m), ie, **(video_kwargs or {}))
1173                 for m in orderedSet(map(getter, matches) if getter else matches))
1174         return self.playlist_result(urls, playlist_id, playlist_title, **kwargs)
1175
1176     @staticmethod
1177     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1178         """Returns a playlist"""
1179         if playlist_id:
1180             kwargs['id'] = playlist_id
1181         if playlist_title:
1182             kwargs['title'] = playlist_title
1183         if playlist_description is not None:
1184             kwargs['description'] = playlist_description
1185         return {
1186             **kwargs,
1187             '_type': 'multi_video' if multi_video else 'playlist',
1188             'entries': entries,
1189         }
1190
1191     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1192         """
1193         Perform a regex search on the given string, using a single or a list of
1194         patterns returning the first matching group.
1195         In case of failure return a default value or raise a WARNING or a
1196         RegexNotFoundError, depending on fatal, specifying the field name.
1197         """
1198         if string is None:
1199             mobj = None
1200         elif isinstance(pattern, (str, re.Pattern)):
1201             mobj = re.search(pattern, string, flags)
1202         else:
1203             for p in pattern:
1204                 mobj = re.search(p, string, flags)
1205                 if mobj:
1206                     break
1207
1208         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1209
1210         if mobj:
1211             if group is None:
1212                 # return the first matching group
1213                 return next(g for g in mobj.groups() if g is not None)
1214             elif isinstance(group, (list, tuple)):
1215                 return tuple(mobj.group(g) for g in group)
1216             else:
1217                 return mobj.group(group)
1218         elif default is not NO_DEFAULT:
1219             return default
1220         elif fatal:
1221             raise RegexNotFoundError('Unable to extract %s' % _name)
1222         else:
1223             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1224             return None
1225
1226     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1227         """
1228         Like _search_regex, but strips HTML tags and unescapes entities.
1229         """
1230         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1231         if res:
1232             return clean_html(res).strip()
1233         else:
1234             return res
1235
1236     def _get_netrc_login_info(self, netrc_machine=None):
1237         username = None
1238         password = None
1239         netrc_machine = netrc_machine or self._NETRC_MACHINE
1240
1241         if self.get_param('usenetrc', False):
1242             try:
1243                 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1244                 if os.path.isdir(netrc_file):
1245                     netrc_file = os.path.join(netrc_file, '.netrc')
1246                 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1247                 if info is not None:
1248                     username = info[0]
1249                     password = info[2]
1250                 else:
1251                     raise netrc.NetrcParseError(
1252                         'No authenticators for %s' % netrc_machine)
1253             except (OSError, netrc.NetrcParseError) as err:
1254                 self.report_warning(
1255                     'parsing .netrc: %s' % error_to_compat_str(err))
1256
1257         return username, password
1258
1259     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1260         """
1261         Get the login info as (username, password)
1262         First look for the manually specified credentials using username_option
1263         and password_option as keys in params dictionary. If no such credentials
1264         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1265         value.
1266         If there's no info available, return (None, None)
1267         """
1268
1269         # Attempt to use provided username and password or .netrc data
1270         username = self.get_param(username_option)
1271         if username is not None:
1272             password = self.get_param(password_option)
1273         else:
1274             username, password = self._get_netrc_login_info(netrc_machine)
1275
1276         return username, password
1277
1278     def _get_tfa_info(self, note='two-factor verification code'):
1279         """
1280         Get the two-factor authentication info
1281         TODO - asking the user will be required for sms/phone verify
1282         currently just uses the command line option
1283         If there's no info available, return None
1284         """
1285
1286         tfa = self.get_param('twofactor')
1287         if tfa is not None:
1288             return tfa
1289
1290         return compat_getpass('Type %s and press [Return]: ' % note)
1291
1292     # Helper functions for extracting OpenGraph info
1293     @staticmethod
1294     def _og_regexes(prop):
1295         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1296         property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1297                        % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
1298         template = r'<meta[^>]+?%s[^>]+?%s'
1299         return [
1300             template % (property_re, content_re),
1301             template % (content_re, property_re),
1302         ]
1303
1304     @staticmethod
1305     def _meta_regex(prop):
1306         return r'''(?isx)<meta
1307                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1308                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1309
1310     def _og_search_property(self, prop, html, name=None, **kargs):
1311         prop = variadic(prop)
1312         if name is None:
1313             name = 'OpenGraph %s' % prop[0]
1314         og_regexes = []
1315         for p in prop:
1316             og_regexes.extend(self._og_regexes(p))
1317         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1318         if escaped is None:
1319             return None
1320         return unescapeHTML(escaped)
1321
1322     def _og_search_thumbnail(self, html, **kargs):
1323         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1324
1325     def _og_search_description(self, html, **kargs):
1326         return self._og_search_property('description', html, fatal=False, **kargs)
1327
1328     def _og_search_title(self, html, *, fatal=False, **kargs):
1329         return self._og_search_property('title', html, fatal=fatal, **kargs)
1330
1331     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1332         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1333         if secure:
1334             regexes = self._og_regexes('video:secure_url') + regexes
1335         return self._html_search_regex(regexes, html, name, **kargs)
1336
1337     def _og_search_url(self, html, **kargs):
1338         return self._og_search_property('url', html, **kargs)
1339
1340     def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1341         return self._html_search_regex(r'(?s)<title>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1342
1343     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1344         name = variadic(name)
1345         if display_name is None:
1346             display_name = name[0]
1347         return self._html_search_regex(
1348             [self._meta_regex(n) for n in name],
1349             html, display_name, fatal=fatal, group='content', **kwargs)
1350
1351     def _dc_search_uploader(self, html):
1352         return self._html_search_meta('dc.creator', html, 'uploader')
1353
1354     def _rta_search(self, html):
1355         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1356         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1357                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1358                      html):
1359             return 18
1360         return 0
1361
1362     def _media_rating_search(self, html):
1363         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1364         rating = self._html_search_meta('rating', html)
1365
1366         if not rating:
1367             return None
1368
1369         RATING_TABLE = {
1370             'safe for kids': 0,
1371             'general': 8,
1372             '14 years': 14,
1373             'mature': 17,
1374             'restricted': 19,
1375         }
1376         return RATING_TABLE.get(rating.lower())
1377
1378     def _family_friendly_search(self, html):
1379         # See http://schema.org/VideoObject
1380         family_friendly = self._html_search_meta(
1381             'isFamilyFriendly', html, default=None)
1382
1383         if not family_friendly:
1384             return None
1385
1386         RATING_TABLE = {
1387             '1': 0,
1388             'true': 0,
1389             '0': 18,
1390             'false': 18,
1391         }
1392         return RATING_TABLE.get(family_friendly.lower())
1393
1394     def _twitter_search_player(self, html):
1395         return self._html_search_meta('twitter:player', html,
1396                                       'twitter card player')
1397
1398     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1399         json_ld_list = list(re.finditer(JSON_LD_RE, html))
1400         default = kwargs.get('default', NO_DEFAULT)
1401         # JSON-LD may be malformed and thus `fatal` should be respected.
1402         # At the same time `default` may be passed that assumes `fatal=False`
1403         # for _search_regex. Let's simulate the same behavior here as well.
1404         fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
1405         json_ld = []
1406         for mobj in json_ld_list:
1407             json_ld_item = self._parse_json(
1408                 mobj.group('json_ld'), video_id, fatal=fatal)
1409             if not json_ld_item:
1410                 continue
1411             if isinstance(json_ld_item, dict):
1412                 json_ld.append(json_ld_item)
1413             elif isinstance(json_ld_item, (list, tuple)):
1414                 json_ld.extend(json_ld_item)
1415         if json_ld:
1416             json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1417         if json_ld:
1418             return json_ld
1419         if default is not NO_DEFAULT:
1420             return default
1421         elif fatal:
1422             raise RegexNotFoundError('Unable to extract JSON-LD')
1423         else:
1424             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1425             return {}
1426
1427     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1428         if isinstance(json_ld, compat_str):
1429             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1430         if not json_ld:
1431             return {}
1432         info = {}
1433         if not isinstance(json_ld, (list, tuple, dict)):
1434             return info
1435         if isinstance(json_ld, dict):
1436             json_ld = [json_ld]
1437
1438         INTERACTION_TYPE_MAP = {
1439             'CommentAction': 'comment',
1440             'AgreeAction': 'like',
1441             'DisagreeAction': 'dislike',
1442             'LikeAction': 'like',
1443             'DislikeAction': 'dislike',
1444             'ListenAction': 'view',
1445             'WatchAction': 'view',
1446             'ViewAction': 'view',
1447         }
1448
1449         def extract_interaction_type(e):
1450             interaction_type = e.get('interactionType')
1451             if isinstance(interaction_type, dict):
1452                 interaction_type = interaction_type.get('@type')
1453             return str_or_none(interaction_type)
1454
1455         def extract_interaction_statistic(e):
1456             interaction_statistic = e.get('interactionStatistic')
1457             if isinstance(interaction_statistic, dict):
1458                 interaction_statistic = [interaction_statistic]
1459             if not isinstance(interaction_statistic, list):
1460                 return
1461             for is_e in interaction_statistic:
1462                 if not isinstance(is_e, dict):
1463                     continue
1464                 if is_e.get('@type') != 'InteractionCounter':
1465                     continue
1466                 interaction_type = extract_interaction_type(is_e)
1467                 if not interaction_type:
1468                     continue
1469                 # For interaction count some sites provide string instead of
1470                 # an integer (as per spec) with non digit characters (e.g. ",")
1471                 # so extracting count with more relaxed str_to_int
1472                 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1473                 if interaction_count is None:
1474                     continue
1475                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1476                 if not count_kind:
1477                     continue
1478                 count_key = '%s_count' % count_kind
1479                 if info.get(count_key) is not None:
1480                     continue
1481                 info[count_key] = interaction_count
1482
1483         def extract_chapter_information(e):
1484             chapters = [{
1485                 'title': part.get('name'),
1486                 'start_time': part.get('startOffset'),
1487                 'end_time': part.get('endOffset'),
1488             } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
1489             for idx, (last_c, current_c, next_c) in enumerate(zip(
1490                     [{'end_time': 0}] + chapters, chapters, chapters[1:])):
1491                 current_c['end_time'] = current_c['end_time'] or next_c['start_time']
1492                 current_c['start_time'] = current_c['start_time'] or last_c['end_time']
1493                 if None in current_c.values():
1494                     self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
1495                     return
1496             if chapters:
1497                 chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
1498                 info['chapters'] = chapters
1499
1500         def extract_video_object(e):
1501             assert e['@type'] == 'VideoObject'
1502             author = e.get('author')
1503             info.update({
1504                 'url': url_or_none(e.get('contentUrl')),
1505                 'title': unescapeHTML(e.get('name')),
1506                 'description': unescapeHTML(e.get('description')),
1507                 'thumbnails': [{'url': url_or_none(url)}
1508                                for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))],
1509                 'duration': parse_duration(e.get('duration')),
1510                 'timestamp': unified_timestamp(e.get('uploadDate')),
1511                 # author can be an instance of 'Organization' or 'Person' types.
1512                 # both types can have 'name' property(inherited from 'Thing' type). [1]
1513                 # however some websites are using 'Text' type instead.
1514                 # 1. https://schema.org/VideoObject
1515                 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
1516                 'filesize': float_or_none(e.get('contentSize')),
1517                 'tbr': int_or_none(e.get('bitrate')),
1518                 'width': int_or_none(e.get('width')),
1519                 'height': int_or_none(e.get('height')),
1520                 'view_count': int_or_none(e.get('interactionCount')),
1521             })
1522             extract_interaction_statistic(e)
1523             extract_chapter_information(e)
1524
1525         def traverse_json_ld(json_ld, at_top_level=True):
1526             for e in json_ld:
1527                 if at_top_level and '@context' not in e:
1528                     continue
1529                 if at_top_level and set(e.keys()) == {'@context', '@graph'}:
1530                     traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
1531                     break
1532                 item_type = e.get('@type')
1533                 if expected_type is not None and expected_type != item_type:
1534                     continue
1535                 rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
1536                 if rating is not None:
1537                     info['average_rating'] = rating
1538                 if item_type in ('TVEpisode', 'Episode'):
1539                     episode_name = unescapeHTML(e.get('name'))
1540                     info.update({
1541                         'episode': episode_name,
1542                         'episode_number': int_or_none(e.get('episodeNumber')),
1543                         'description': unescapeHTML(e.get('description')),
1544                     })
1545                     if not info.get('title') and episode_name:
1546                         info['title'] = episode_name
1547                     part_of_season = e.get('partOfSeason')
1548                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1549                         info.update({
1550                             'season': unescapeHTML(part_of_season.get('name')),
1551                             'season_number': int_or_none(part_of_season.get('seasonNumber')),
1552                         })
1553                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1554                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1555                         info['series'] = unescapeHTML(part_of_series.get('name'))
1556                 elif item_type == 'Movie':
1557                     info.update({
1558                         'title': unescapeHTML(e.get('name')),
1559                         'description': unescapeHTML(e.get('description')),
1560                         'duration': parse_duration(e.get('duration')),
1561                         'timestamp': unified_timestamp(e.get('dateCreated')),
1562                     })
1563                 elif item_type in ('Article', 'NewsArticle'):
1564                     info.update({
1565                         'timestamp': parse_iso8601(e.get('datePublished')),
1566                         'title': unescapeHTML(e.get('headline')),
1567                         'description': unescapeHTML(e.get('articleBody') or e.get('description')),
1568                     })
1569                     if traverse_obj(e, ('video', 0, '@type')) == 'VideoObject':
1570                         extract_video_object(e['video'][0])
1571                 elif item_type == 'VideoObject':
1572                     extract_video_object(e)
1573                     if expected_type is None:
1574                         continue
1575                     else:
1576                         break
1577                 video = e.get('video')
1578                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1579                     extract_video_object(video)
1580                 if expected_type is None:
1581                     continue
1582                 else:
1583                     break
1584         traverse_json_ld(json_ld)
1585
1586         return filter_dict(info)
1587
1588     def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1589         return self._parse_json(
1590             self._search_regex(
1591                 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1592                 webpage, 'next.js data', fatal=fatal, **kw),
1593             video_id, transform_source=transform_source, fatal=fatal)
1594
1595     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__'):
1596         ''' Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function. '''
1597         # not all website do this, but it can be changed
1598         # https://stackoverflow.com/questions/67463109/how-to-change-or-hide-nuxt-and-nuxt-keyword-in-page-source
1599         rectx = re.escape(context_name)
1600         js, arg_keys, arg_vals = self._search_regex(
1601             (r'<script>window\.%s=\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.+?)\)\);?</script>' % rectx,
1602              r'%s\(.*?\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)' % rectx),
1603             webpage, context_name, group=['js', 'arg_keys', 'arg_vals'])
1604
1605         args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
1606
1607         for key, val in args.items():
1608             if val in ('undefined', 'void 0'):
1609                 args[key] = 'null'
1610
1611         return self._parse_json(js_to_json(js, args), video_id)['data'][0]
1612
1613     @staticmethod
1614     def _hidden_inputs(html):
1615         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1616         hidden_inputs = {}
1617         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1618             attrs = extract_attributes(input)
1619             if not input:
1620                 continue
1621             if attrs.get('type') not in ('hidden', 'submit'):
1622                 continue
1623             name = attrs.get('name') or attrs.get('id')
1624             value = attrs.get('value')
1625             if name and value is not None:
1626                 hidden_inputs[name] = value
1627         return hidden_inputs
1628
1629     def _form_hidden_inputs(self, form_id, html):
1630         form = self._search_regex(
1631             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1632             html, '%s form' % form_id, group='form')
1633         return self._hidden_inputs(form)
1634
1635     class FormatSort:
1636         regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
1637
1638         default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
1639                    'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
1640                    'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
1641         ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
1642                         'height', 'width', 'proto', 'vext', 'abr', 'aext',
1643                         'fps', 'fs_approx', 'source', 'id')
1644
1645         settings = {
1646             'vcodec': {'type': 'ordered', 'regex': True,
1647                        'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1648             'acodec': {'type': 'ordered', 'regex': True,
1649                        'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
1650             'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
1651                     'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
1652             'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1653                       'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
1654             'vext': {'type': 'ordered', 'field': 'video_ext',
1655                      'order': ('mp4', 'webm', 'flv', '', 'none'),
1656                      'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1657             'aext': {'type': 'ordered', 'field': 'audio_ext',
1658                      'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1659                      'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1660             'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1661             'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
1662                            'field': ('vcodec', 'acodec'),
1663                            'function': lambda it: int(any(v != 'none' for v in it))},
1664             'ie_pref': {'priority': True, 'type': 'extractor'},
1665             'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1666             'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1667             'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
1668             'quality': {'convert': 'float', 'default': -1},
1669             'filesize': {'convert': 'bytes'},
1670             'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1671             'id': {'convert': 'string', 'field': 'format_id'},
1672             'height': {'convert': 'float_none'},
1673             'width': {'convert': 'float_none'},
1674             'fps': {'convert': 'float_none'},
1675             'tbr': {'convert': 'float_none'},
1676             'vbr': {'convert': 'float_none'},
1677             'abr': {'convert': 'float_none'},
1678             'asr': {'convert': 'float_none'},
1679             'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
1680
1681             'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1682             'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1683             'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1684             'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1685             'res': {'type': 'multiple', 'field': ('height', 'width'),
1686                     'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
1687
1688             # For compatibility with youtube-dl
1689             'format_id': {'type': 'alias', 'field': 'id'},
1690             'preference': {'type': 'alias', 'field': 'ie_pref'},
1691             'language_preference': {'type': 'alias', 'field': 'lang'},
1692             'source_preference': {'type': 'alias', 'field': 'source'},
1693             'protocol': {'type': 'alias', 'field': 'proto'},
1694             'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1695
1696             # Deprecated
1697             'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
1698             'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
1699             'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
1700             'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
1701             'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
1702             'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
1703             'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
1704             'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
1705             'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
1706             'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
1707             'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
1708             'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
1709             'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
1710             'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
1711             'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
1712             'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
1713             'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
1714             'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
1715             'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
1716             'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
1717         }
1718
1719         def __init__(self, ie, field_preference):
1720             self._order = []
1721             self.ydl = ie._downloader
1722             self.evaluate_params(self.ydl.params, field_preference)
1723             if ie.get_param('verbose'):
1724                 self.print_verbose_info(self.ydl.write_debug)
1725
1726         def _get_field_setting(self, field, key):
1727             if field not in self.settings:
1728                 if key in ('forced', 'priority'):
1729                     return False
1730                 self.ydl.deprecation_warning(
1731                     f'Using arbitrary fields ({field}) for format sorting is deprecated '
1732                     'and may be removed in a future version')
1733                 self.settings[field] = {}
1734             propObj = self.settings[field]
1735             if key not in propObj:
1736                 type = propObj.get('type')
1737                 if key == 'field':
1738                     default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1739                 elif key == 'convert':
1740                     default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1741                 else:
1742                     default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1743                 propObj[key] = default
1744             return propObj[key]
1745
1746         def _resolve_field_value(self, field, value, convertNone=False):
1747             if value is None:
1748                 if not convertNone:
1749                     return None
1750             else:
1751                 value = value.lower()
1752             conversion = self._get_field_setting(field, 'convert')
1753             if conversion == 'ignore':
1754                 return None
1755             if conversion == 'string':
1756                 return value
1757             elif conversion == 'float_none':
1758                 return float_or_none(value)
1759             elif conversion == 'bytes':
1760                 return FileDownloader.parse_bytes(value)
1761             elif conversion == 'order':
1762                 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1763                 use_regex = self._get_field_setting(field, 'regex')
1764                 list_length = len(order_list)
1765                 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1766                 if use_regex and value is not None:
1767                     for i, regex in enumerate(order_list):
1768                         if regex and re.match(regex, value):
1769                             return list_length - i
1770                     return list_length - empty_pos  # not in list
1771                 else:  # not regex or  value = None
1772                     return list_length - (order_list.index(value) if value in order_list else empty_pos)
1773             else:
1774                 if value.isnumeric():
1775                     return float(value)
1776                 else:
1777                     self.settings[field]['convert'] = 'string'
1778                     return value
1779
1780         def evaluate_params(self, params, sort_extractor):
1781             self._use_free_order = params.get('prefer_free_formats', False)
1782             self._sort_user = params.get('format_sort', [])
1783             self._sort_extractor = sort_extractor
1784
1785             def add_item(field, reverse, closest, limit_text):
1786                 field = field.lower()
1787                 if field in self._order:
1788                     return
1789                 self._order.append(field)
1790                 limit = self._resolve_field_value(field, limit_text)
1791                 data = {
1792                     'reverse': reverse,
1793                     'closest': False if limit is None else closest,
1794                     'limit_text': limit_text,
1795                     'limit': limit}
1796                 if field in self.settings:
1797                     self.settings[field].update(data)
1798                 else:
1799                     self.settings[field] = data
1800
1801             sort_list = (
1802                 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1803                 + (tuple() if params.get('format_sort_force', False)
1804                    else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1805                 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1806
1807             for item in sort_list:
1808                 match = re.match(self.regex, item)
1809                 if match is None:
1810                     raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1811                 field = match.group('field')
1812                 if field is None:
1813                     continue
1814                 if self._get_field_setting(field, 'type') == 'alias':
1815                     alias, field = field, self._get_field_setting(field, 'field')
1816                     if self._get_field_setting(alias, 'deprecated'):
1817                         self.ydl.deprecation_warning(
1818                             f'Format sorting alias {alias} is deprecated '
1819                             f'and may be removed in a future version. Please use {field} instead')
1820                 reverse = match.group('reverse') is not None
1821                 closest = match.group('separator') == '~'
1822                 limit_text = match.group('limit')
1823
1824                 has_limit = limit_text is not None
1825                 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1826                 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1827
1828                 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1829                 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
1830                 limit_count = len(limits)
1831                 for (i, f) in enumerate(fields):
1832                     add_item(f, reverse, closest,
1833                              limits[i] if i < limit_count
1834                              else limits[0] if has_limit and not has_multiple_limits
1835                              else None)
1836
1837         def print_verbose_info(self, write_debug):
1838             if self._sort_user:
1839                 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1840             if self._sort_extractor:
1841                 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1842             write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1843                 '+' if self._get_field_setting(field, 'reverse') else '', field,
1844                 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1845                               self._get_field_setting(field, 'limit_text'),
1846                               self._get_field_setting(field, 'limit'))
1847                 if self._get_field_setting(field, 'limit_text') is not None else '')
1848                 for field in self._order if self._get_field_setting(field, 'visible')]))
1849
1850         def _calculate_field_preference_from_value(self, format, field, type, value):
1851             reverse = self._get_field_setting(field, 'reverse')
1852             closest = self._get_field_setting(field, 'closest')
1853             limit = self._get_field_setting(field, 'limit')
1854
1855             if type == 'extractor':
1856                 maximum = self._get_field_setting(field, 'max')
1857                 if value is None or (maximum is not None and value >= maximum):
1858                     value = -1
1859             elif type == 'boolean':
1860                 in_list = self._get_field_setting(field, 'in_list')
1861                 not_in_list = self._get_field_setting(field, 'not_in_list')
1862                 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1863             elif type == 'ordered':
1864                 value = self._resolve_field_value(field, value, True)
1865
1866             # try to convert to number
1867             val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1868             is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1869             if is_num:
1870                 value = val_num
1871
1872             return ((-10, 0) if value is None
1873                     else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
1874                     else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1875                     else (0, value, 0) if not reverse and (limit is None or value <= limit)
1876                     else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1877                     else (-1, value, 0))
1878
1879         def _calculate_field_preference(self, format, field):
1880             type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
1881             get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1882             if type == 'multiple':
1883                 type = 'field'  # Only 'field' is allowed in multiple for now
1884                 actual_fields = self._get_field_setting(field, 'field')
1885
1886                 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1887             else:
1888                 value = get_value(field)
1889             return self._calculate_field_preference_from_value(format, field, type, value)
1890
1891         def calculate_preference(self, format):
1892             # Determine missing protocol
1893             if not format.get('protocol'):
1894                 format['protocol'] = determine_protocol(format)
1895
1896             # Determine missing ext
1897             if not format.get('ext') and 'url' in format:
1898                 format['ext'] = determine_ext(format['url'])
1899             if format.get('vcodec') == 'none':
1900                 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1901                 format['video_ext'] = 'none'
1902             else:
1903                 format['video_ext'] = format['ext']
1904                 format['audio_ext'] = 'none'
1905             # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
1906             #    format['preference'] = -1000
1907
1908             # Determine missing bitrates
1909             if format.get('tbr') is None:
1910                 if format.get('vbr') is not None and format.get('abr') is not None:
1911                     format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1912             else:
1913                 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1914                     format['vbr'] = format.get('tbr') - format.get('abr', 0)
1915                 if format.get('acodec') != 'none' and format.get('abr') is None:
1916                     format['abr'] = format.get('tbr') - format.get('vbr', 0)
1917
1918             return tuple(self._calculate_field_preference(format, field) for field in self._order)
1919
1920     def _sort_formats(self, formats, field_preference=[]):
1921         if not formats:
1922             return
1923         format_sort = self.FormatSort(self, field_preference)
1924         formats.sort(key=lambda f: format_sort.calculate_preference(f))
1925
1926     def _check_formats(self, formats, video_id):
1927         if formats:
1928             formats[:] = filter(
1929                 lambda f: self._is_valid_url(
1930                     f['url'], video_id,
1931                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1932                 formats)
1933
1934     @staticmethod
1935     def _remove_duplicate_formats(formats):
1936         format_urls = set()
1937         unique_formats = []
1938         for f in formats:
1939             if f['url'] not in format_urls:
1940                 format_urls.add(f['url'])
1941                 unique_formats.append(f)
1942         formats[:] = unique_formats
1943
1944     def _is_valid_url(self, url, video_id, item='video', headers={}):
1945         url = self._proto_relative_url(url, scheme='http:')
1946         # For now assume non HTTP(S) URLs always valid
1947         if not (url.startswith('http://') or url.startswith('https://')):
1948             return True
1949         try:
1950             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1951             return True
1952         except ExtractorError as e:
1953             self.to_screen(
1954                 '%s: %s URL is invalid, skipping: %s'
1955                 % (video_id, item, error_to_compat_str(e.cause)))
1956             return False
1957
1958     def http_scheme(self):
1959         """ Either "http:" or "https:", depending on the user's preferences """
1960         return (
1961             'http:'
1962             if self.get_param('prefer_insecure', False)
1963             else 'https:')
1964
1965     def _proto_relative_url(self, url, scheme=None):
1966         if url is None:
1967             return url
1968         if url.startswith('//'):
1969             if scheme is None:
1970                 scheme = self.http_scheme()
1971             return scheme + url
1972         else:
1973             return url
1974
1975     def _sleep(self, timeout, video_id, msg_template=None):
1976         if msg_template is None:
1977             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1978         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1979         self.to_screen(msg)
1980         time.sleep(timeout)
1981
1982     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1983                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1984                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1985         manifest = self._download_xml(
1986             manifest_url, video_id, 'Downloading f4m manifest',
1987             'Unable to download f4m manifest',
1988             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1989             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1990             transform_source=transform_source,
1991             fatal=fatal, data=data, headers=headers, query=query)
1992
1993         if manifest is False:
1994             return []
1995
1996         return self._parse_f4m_formats(
1997             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1998             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1999
2000     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
2001                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
2002                            fatal=True, m3u8_id=None):
2003         if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
2004             return []
2005
2006         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
2007         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
2008         if akamai_pv is not None and ';' in akamai_pv.text:
2009             playerVerificationChallenge = akamai_pv.text.split(';')[0]
2010             if playerVerificationChallenge.strip() != '':
2011                 return []
2012
2013         formats = []
2014         manifest_version = '1.0'
2015         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
2016         if not media_nodes:
2017             manifest_version = '2.0'
2018             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
2019         # Remove unsupported DRM protected media from final formats
2020         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
2021         media_nodes = remove_encrypted_media(media_nodes)
2022         if not media_nodes:
2023             return formats
2024
2025         manifest_base_url = get_base_url(manifest)
2026
2027         bootstrap_info = xpath_element(
2028             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
2029             'bootstrap info', default=None)
2030
2031         vcodec = None
2032         mime_type = xpath_text(
2033             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
2034             'base URL', default=None)
2035         if mime_type and mime_type.startswith('audio/'):
2036             vcodec = 'none'
2037
2038         for i, media_el in enumerate(media_nodes):
2039             tbr = int_or_none(media_el.attrib.get('bitrate'))
2040             width = int_or_none(media_el.attrib.get('width'))
2041             height = int_or_none(media_el.attrib.get('height'))
2042             format_id = join_nonempty(f4m_id, tbr or i)
2043             # If <bootstrapInfo> is present, the specified f4m is a
2044             # stream-level manifest, and only set-level manifests may refer to
2045             # external resources.  See section 11.4 and section 4 of F4M spec
2046             if bootstrap_info is None:
2047                 media_url = None
2048                 # @href is introduced in 2.0, see section 11.6 of F4M spec
2049                 if manifest_version == '2.0':
2050                     media_url = media_el.attrib.get('href')
2051                 if media_url is None:
2052                     media_url = media_el.attrib.get('url')
2053                 if not media_url:
2054                     continue
2055                 manifest_url = (
2056                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
2057                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
2058                 # If media_url is itself a f4m manifest do the recursive extraction
2059                 # since bitrates in parent manifest (this one) and media_url manifest
2060                 # may differ leading to inability to resolve the format by requested
2061                 # bitrate in f4m downloader
2062                 ext = determine_ext(manifest_url)
2063                 if ext == 'f4m':
2064                     f4m_formats = self._extract_f4m_formats(
2065                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2066                         transform_source=transform_source, fatal=fatal)
2067                     # Sometimes stream-level manifest contains single media entry that
2068                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
2069                     # At the same time parent's media entry in set-level manifest may
2070                     # contain it. We will copy it from parent in such cases.
2071                     if len(f4m_formats) == 1:
2072                         f = f4m_formats[0]
2073                         f.update({
2074                             'tbr': f.get('tbr') or tbr,
2075                             'width': f.get('width') or width,
2076                             'height': f.get('height') or height,
2077                             'format_id': f.get('format_id') if not tbr else format_id,
2078                             'vcodec': vcodec,
2079                         })
2080                     formats.extend(f4m_formats)
2081                     continue
2082                 elif ext == 'm3u8':
2083                     formats.extend(self._extract_m3u8_formats(
2084                         manifest_url, video_id, 'mp4', preference=preference,
2085                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
2086                     continue
2087             formats.append({
2088                 'format_id': format_id,
2089                 'url': manifest_url,
2090                 'manifest_url': manifest_url,
2091                 'ext': 'flv' if bootstrap_info is not None else None,
2092                 'protocol': 'f4m',
2093                 'tbr': tbr,
2094                 'width': width,
2095                 'height': height,
2096                 'vcodec': vcodec,
2097                 'preference': preference,
2098                 'quality': quality,
2099             })
2100         return formats
2101
2102     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
2103         return {
2104             'format_id': join_nonempty(m3u8_id, 'meta'),
2105             'url': m3u8_url,
2106             'ext': ext,
2107             'protocol': 'm3u8',
2108             'preference': preference - 100 if preference else -100,
2109             'quality': quality,
2110             'resolution': 'multiple',
2111             'format_note': 'Quality selection URL',
2112         }
2113
2114     def _report_ignoring_subs(self, name):
2115         self.report_warning(bug_reports_message(
2116             f'Ignoring subtitle tracks found in the {name} manifest; '
2117             'if any subtitle tracks are missing,'
2118         ), only_once=True)
2119
2120     def _extract_m3u8_formats(self, *args, **kwargs):
2121         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2122         if subs:
2123             self._report_ignoring_subs('HLS')
2124         return fmts
2125
2126     def _extract_m3u8_formats_and_subtitles(
2127             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2128             preference=None, quality=None, m3u8_id=None, note=None,
2129             errnote=None, fatal=True, live=False, data=None, headers={},
2130             query={}):
2131
2132         res = self._download_webpage_handle(
2133             m3u8_url, video_id,
2134             note='Downloading m3u8 information' if note is None else note,
2135             errnote='Failed to download m3u8 information' if errnote is None else errnote,
2136             fatal=fatal, data=data, headers=headers, query=query)
2137
2138         if res is False:
2139             return [], {}
2140
2141         m3u8_doc, urlh = res
2142         m3u8_url = urlh.geturl()
2143
2144         return self._parse_m3u8_formats_and_subtitles(
2145             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2146             preference=preference, quality=quality, m3u8_id=m3u8_id,
2147             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2148             headers=headers, query=query, video_id=video_id)
2149
2150     def _parse_m3u8_formats_and_subtitles(
2151             self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
2152             preference=None, quality=None, m3u8_id=None, live=False, note=None,
2153             errnote=None, fatal=True, data=None, headers={}, query={},
2154             video_id=None):
2155         formats, subtitles = [], {}
2156
2157         has_drm = re.search('|'.join([
2158             r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
2159             r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
2160         ]), m3u8_doc)
2161
2162         def format_url(url):
2163             return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
2164
2165         if self.get_param('hls_split_discontinuity', False):
2166             def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2167                 if not m3u8_doc:
2168                     if not manifest_url:
2169                         return []
2170                     m3u8_doc = self._download_webpage(
2171                         manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2172                         note=False, errnote='Failed to download m3u8 playlist information')
2173                     if m3u8_doc is False:
2174                         return []
2175                 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2176
2177         else:
2178             def _extract_m3u8_playlist_indices(*args, **kwargs):
2179                 return [None]
2180
2181         # References:
2182         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2183         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2184         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2185
2186         # We should try extracting formats only from master playlists [1, 4.3.4],
2187         # i.e. playlists that describe available qualities. On the other hand
2188         # media playlists [1, 4.3.3] should be returned as is since they contain
2189         # just the media without qualities renditions.
2190         # Fortunately, master playlist can be easily distinguished from media
2191         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
2192         # master playlist tags MUST NOT appear in a media playlist and vice versa.
2193         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2194         # media playlist and MUST NOT appear in master playlist thus we can
2195         # clearly detect media playlist with this criterion.
2196
2197         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
2198             formats = [{
2199                 'format_id': join_nonempty(m3u8_id, idx),
2200                 'format_index': idx,
2201                 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
2202                 'ext': ext,
2203                 'protocol': entry_protocol,
2204                 'preference': preference,
2205                 'quality': quality,
2206                 'has_drm': has_drm,
2207             } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2208
2209             return formats, subtitles
2210
2211         groups = {}
2212         last_stream_inf = {}
2213
2214         def extract_media(x_media_line):
2215             media = parse_m3u8_attributes(x_media_line)
2216             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2217             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2218             if not (media_type and group_id and name):
2219                 return
2220             groups.setdefault(group_id, []).append(media)
2221             # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2222             if media_type == 'SUBTITLES':
2223                 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2224                 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2225                 # However, lack of URI has been spotted in the wild.
2226                 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2227                 if not media.get('URI'):
2228                     return
2229                 url = format_url(media['URI'])
2230                 sub_info = {
2231                     'url': url,
2232                     'ext': determine_ext(url),
2233                 }
2234                 if sub_info['ext'] == 'm3u8':
2235                     # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2236                     # files may contain is WebVTT:
2237                     # <https://tools.ietf.org/html/rfc8216#section-3.1>
2238                     sub_info['ext'] = 'vtt'
2239                     sub_info['protocol'] = 'm3u8_native'
2240                 lang = media.get('LANGUAGE') or 'und'
2241                 subtitles.setdefault(lang, []).append(sub_info)
2242             if media_type not in ('VIDEO', 'AUDIO'):
2243                 return
2244             media_url = media.get('URI')
2245             if media_url:
2246                 manifest_url = format_url(media_url)
2247                 formats.extend({
2248                     'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2249                     'format_note': name,
2250                     'format_index': idx,
2251                     'url': manifest_url,
2252                     'manifest_url': m3u8_url,
2253                     'language': media.get('LANGUAGE'),
2254                     'ext': ext,
2255                     'protocol': entry_protocol,
2256                     'preference': preference,
2257                     'quality': quality,
2258                     'vcodec': 'none' if media_type == 'AUDIO' else None,
2259                 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2260
2261         def build_stream_name():
2262             # Despite specification does not mention NAME attribute for
2263             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2264             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2265             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2266             stream_name = last_stream_inf.get('NAME')
2267             if stream_name:
2268                 return stream_name
2269             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2270             # from corresponding rendition group
2271             stream_group_id = last_stream_inf.get('VIDEO')
2272             if not stream_group_id:
2273                 return
2274             stream_group = groups.get(stream_group_id)
2275             if not stream_group:
2276                 return stream_group_id
2277             rendition = stream_group[0]
2278             return rendition.get('NAME') or stream_group_id
2279
2280         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2281         # chance to detect video only formats when EXT-X-STREAM-INF tags
2282         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2283         for line in m3u8_doc.splitlines():
2284             if line.startswith('#EXT-X-MEDIA:'):
2285                 extract_media(line)
2286
2287         for line in m3u8_doc.splitlines():
2288             if line.startswith('#EXT-X-STREAM-INF:'):
2289                 last_stream_inf = parse_m3u8_attributes(line)
2290             elif line.startswith('#') or not line.strip():
2291                 continue
2292             else:
2293                 tbr = float_or_none(
2294                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2295                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2296                 manifest_url = format_url(line.strip())
2297
2298                 for idx in _extract_m3u8_playlist_indices(manifest_url):
2299                     format_id = [m3u8_id, None, idx]
2300                     # Bandwidth of live streams may differ over time thus making
2301                     # format_id unpredictable. So it's better to keep provided
2302                     # format_id intact.
2303                     if not live:
2304                         stream_name = build_stream_name()
2305                         format_id[1] = stream_name or '%d' % (tbr or len(formats))
2306                     f = {
2307                         'format_id': join_nonempty(*format_id),
2308                         'format_index': idx,
2309                         'url': manifest_url,
2310                         'manifest_url': m3u8_url,
2311                         'tbr': tbr,
2312                         'ext': ext,
2313                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2314                         'protocol': entry_protocol,
2315                         'preference': preference,
2316                         'quality': quality,
2317                     }
2318                     resolution = last_stream_inf.get('RESOLUTION')
2319                     if resolution:
2320                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2321                         if mobj:
2322                             f['width'] = int(mobj.group('width'))
2323                             f['height'] = int(mobj.group('height'))
2324                     # Unified Streaming Platform
2325                     mobj = re.search(
2326                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2327                     if mobj:
2328                         abr, vbr = mobj.groups()
2329                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2330                         f.update({
2331                             'vbr': vbr,
2332                             'abr': abr,
2333                         })
2334                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2335                     f.update(codecs)
2336                     audio_group_id = last_stream_inf.get('AUDIO')
2337                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2338                     # references a rendition group MUST have a CODECS attribute.
2339                     # However, this is not always respected, for example, [2]
2340                     # contains EXT-X-STREAM-INF tag which references AUDIO
2341                     # rendition group but does not have CODECS and despite
2342                     # referencing an audio group it represents a complete
2343                     # (with audio and video) format. So, for such cases we will
2344                     # ignore references to rendition groups and treat them
2345                     # as complete formats.
2346                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2347                         audio_group = groups.get(audio_group_id)
2348                         if audio_group and audio_group[0].get('URI'):
2349                             # TODO: update acodec for audio only formats with
2350                             # the same GROUP-ID
2351                             f['acodec'] = 'none'
2352                     if not f.get('ext'):
2353                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2354                     formats.append(f)
2355
2356                     # for DailyMotion
2357                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2358                     if progressive_uri:
2359                         http_f = f.copy()
2360                         del http_f['manifest_url']
2361                         http_f.update({
2362                             'format_id': f['format_id'].replace('hls-', 'http-'),
2363                             'protocol': 'http',
2364                             'url': progressive_uri,
2365                         })
2366                         formats.append(http_f)
2367
2368                 last_stream_inf = {}
2369         return formats, subtitles
2370
2371     def _extract_m3u8_vod_duration(
2372             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2373
2374         m3u8_vod = self._download_webpage(
2375             m3u8_vod_url, video_id,
2376             note='Downloading m3u8 VOD manifest' if note is None else note,
2377             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2378             fatal=False, data=data, headers=headers, query=query)
2379
2380         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2381
2382     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2383         if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2384             return None
2385
2386         return int(sum(
2387             float(line[len('#EXTINF:'):].split(',')[0])
2388             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2389
2390     @staticmethod
2391     def _xpath_ns(path, namespace=None):
2392         if not namespace:
2393             return path
2394         out = []
2395         for c in path.split('/'):
2396             if not c or c == '.':
2397                 out.append(c)
2398             else:
2399                 out.append('{%s}%s' % (namespace, c))
2400         return '/'.join(out)
2401
2402     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2403         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2404
2405         if smil is False:
2406             assert not fatal
2407             return [], {}
2408
2409         namespace = self._parse_smil_namespace(smil)
2410
2411         fmts = self._parse_smil_formats(
2412             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2413         subs = self._parse_smil_subtitles(
2414             smil, namespace=namespace)
2415
2416         return fmts, subs
2417
2418     def _extract_smil_formats(self, *args, **kwargs):
2419         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2420         if subs:
2421             self._report_ignoring_subs('SMIL')
2422         return fmts
2423
2424     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2425         smil = self._download_smil(smil_url, video_id, fatal=fatal)
2426         if smil is False:
2427             return {}
2428         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2429
2430     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2431         return self._download_xml(
2432             smil_url, video_id, 'Downloading SMIL file',
2433             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2434
2435     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2436         namespace = self._parse_smil_namespace(smil)
2437
2438         formats = self._parse_smil_formats(
2439             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2440         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2441
2442         video_id = os.path.splitext(url_basename(smil_url))[0]
2443         title = None
2444         description = None
2445         upload_date = None
2446         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2447             name = meta.attrib.get('name')
2448             content = meta.attrib.get('content')
2449             if not name or not content:
2450                 continue
2451             if not title and name == 'title':
2452                 title = content
2453             elif not description and name in ('description', 'abstract'):
2454                 description = content
2455             elif not upload_date and name == 'date':
2456                 upload_date = unified_strdate(content)
2457
2458         thumbnails = [{
2459             'id': image.get('type'),
2460             'url': image.get('src'),
2461             'width': int_or_none(image.get('width')),
2462             'height': int_or_none(image.get('height')),
2463         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2464
2465         return {
2466             'id': video_id,
2467             'title': title or video_id,
2468             'description': description,
2469             'upload_date': upload_date,
2470             'thumbnails': thumbnails,
2471             'formats': formats,
2472             'subtitles': subtitles,
2473         }
2474
2475     def _parse_smil_namespace(self, smil):
2476         return self._search_regex(
2477             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2478
2479     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2480         base = smil_url
2481         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2482             b = meta.get('base') or meta.get('httpBase')
2483             if b:
2484                 base = b
2485                 break
2486
2487         formats = []
2488         rtmp_count = 0
2489         http_count = 0
2490         m3u8_count = 0
2491         imgs_count = 0
2492
2493         srcs = set()
2494         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2495         for medium in media:
2496             src = medium.get('src')
2497             if not src or src in srcs:
2498                 continue
2499             srcs.add(src)
2500
2501             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2502             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2503             width = int_or_none(medium.get('width'))
2504             height = int_or_none(medium.get('height'))
2505             proto = medium.get('proto')
2506             ext = medium.get('ext')
2507             src_ext = determine_ext(src)
2508             streamer = medium.get('streamer') or base
2509
2510             if proto == 'rtmp' or streamer.startswith('rtmp'):
2511                 rtmp_count += 1
2512                 formats.append({
2513                     'url': streamer,
2514                     'play_path': src,
2515                     'ext': 'flv',
2516                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2517                     'tbr': bitrate,
2518                     'filesize': filesize,
2519                     'width': width,
2520                     'height': height,
2521                 })
2522                 if transform_rtmp_url:
2523                     streamer, src = transform_rtmp_url(streamer, src)
2524                     formats[-1].update({
2525                         'url': streamer,
2526                         'play_path': src,
2527                     })
2528                 continue
2529
2530             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2531             src_url = src_url.strip()
2532
2533             if proto == 'm3u8' or src_ext == 'm3u8':
2534                 m3u8_formats = self._extract_m3u8_formats(
2535                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2536                 if len(m3u8_formats) == 1:
2537                     m3u8_count += 1
2538                     m3u8_formats[0].update({
2539                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2540                         'tbr': bitrate,
2541                         'width': width,
2542                         'height': height,
2543                     })
2544                 formats.extend(m3u8_formats)
2545             elif src_ext == 'f4m':
2546                 f4m_url = src_url
2547                 if not f4m_params:
2548                     f4m_params = {
2549                         'hdcore': '3.2.0',
2550                         'plugin': 'flowplayer-3.2.0.1',
2551                     }
2552                 f4m_url += '&' if '?' in f4m_url else '?'
2553                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2554                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2555             elif src_ext == 'mpd':
2556                 formats.extend(self._extract_mpd_formats(
2557                     src_url, video_id, mpd_id='dash', fatal=False))
2558             elif re.search(r'\.ism/[Mm]anifest', src_url):
2559                 formats.extend(self._extract_ism_formats(
2560                     src_url, video_id, ism_id='mss', fatal=False))
2561             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2562                 http_count += 1
2563                 formats.append({
2564                     'url': src_url,
2565                     'ext': ext or src_ext or 'flv',
2566                     'format_id': 'http-%d' % (bitrate or http_count),
2567                     'tbr': bitrate,
2568                     'filesize': filesize,
2569                     'width': width,
2570                     'height': height,
2571                 })
2572
2573         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2574             src = medium.get('src')
2575             if not src or src in srcs:
2576                 continue
2577             srcs.add(src)
2578
2579             imgs_count += 1
2580             formats.append({
2581                 'format_id': 'imagestream-%d' % (imgs_count),
2582                 'url': src,
2583                 'ext': mimetype2ext(medium.get('type')),
2584                 'acodec': 'none',
2585                 'vcodec': 'none',
2586                 'width': int_or_none(medium.get('width')),
2587                 'height': int_or_none(medium.get('height')),
2588                 'format_note': 'SMIL storyboards',
2589             })
2590
2591         return formats
2592
2593     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2594         urls = []
2595         subtitles = {}
2596         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2597             src = textstream.get('src')
2598             if not src or src in urls:
2599                 continue
2600             urls.append(src)
2601             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2602             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2603             subtitles.setdefault(lang, []).append({
2604                 'url': src,
2605                 'ext': ext,
2606             })
2607         return subtitles
2608
2609     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2610         xspf = self._download_xml(
2611             xspf_url, playlist_id, 'Downloading xpsf playlist',
2612             'Unable to download xspf manifest', fatal=fatal)
2613         if xspf is False:
2614             return []
2615         return self._parse_xspf(
2616             xspf, playlist_id, xspf_url=xspf_url,
2617             xspf_base_url=base_url(xspf_url))
2618
2619     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2620         NS_MAP = {
2621             'xspf': 'http://xspf.org/ns/0/',
2622             's1': 'http://static.streamone.nl/player/ns/0',
2623         }
2624
2625         entries = []
2626         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2627             title = xpath_text(
2628                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2629             description = xpath_text(
2630                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2631             thumbnail = xpath_text(
2632                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2633             duration = float_or_none(
2634                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2635
2636             formats = []
2637             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2638                 format_url = urljoin(xspf_base_url, location.text)
2639                 if not format_url:
2640                     continue
2641                 formats.append({
2642                     'url': format_url,
2643                     'manifest_url': xspf_url,
2644                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2645                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2646                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2647                 })
2648             self._sort_formats(formats)
2649
2650             entries.append({
2651                 'id': playlist_id,
2652                 'title': title,
2653                 'description': description,
2654                 'thumbnail': thumbnail,
2655                 'duration': duration,
2656                 'formats': formats,
2657             })
2658         return entries
2659
2660     def _extract_mpd_formats(self, *args, **kwargs):
2661         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2662         if subs:
2663             self._report_ignoring_subs('DASH')
2664         return fmts
2665
2666     def _extract_mpd_formats_and_subtitles(
2667             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2668             fatal=True, data=None, headers={}, query={}):
2669         res = self._download_xml_handle(
2670             mpd_url, video_id,
2671             note='Downloading MPD manifest' if note is None else note,
2672             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2673             fatal=fatal, data=data, headers=headers, query=query)
2674         if res is False:
2675             return [], {}
2676         mpd_doc, urlh = res
2677         if mpd_doc is None:
2678             return [], {}
2679         mpd_base_url = base_url(urlh.geturl())
2680
2681         return self._parse_mpd_formats_and_subtitles(
2682             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2683
2684     def _parse_mpd_formats(self, *args, **kwargs):
2685         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2686         if subs:
2687             self._report_ignoring_subs('DASH')
2688         return fmts
2689
2690     def _parse_mpd_formats_and_subtitles(
2691             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2692         """
2693         Parse formats from MPD manifest.
2694         References:
2695          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2696             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2697          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2698         """
2699         if not self.get_param('dynamic_mpd', True):
2700             if mpd_doc.get('type') == 'dynamic':
2701                 return [], {}
2702
2703         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2704
2705         def _add_ns(path):
2706             return self._xpath_ns(path, namespace)
2707
2708         def is_drm_protected(element):
2709             return element.find(_add_ns('ContentProtection')) is not None
2710
2711         def extract_multisegment_info(element, ms_parent_info):
2712             ms_info = ms_parent_info.copy()
2713
2714             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2715             # common attributes and elements.  We will only extract relevant
2716             # for us.
2717             def extract_common(source):
2718                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2719                 if segment_timeline is not None:
2720                     s_e = segment_timeline.findall(_add_ns('S'))
2721                     if s_e:
2722                         ms_info['total_number'] = 0
2723                         ms_info['s'] = []
2724                         for s in s_e:
2725                             r = int(s.get('r', 0))
2726                             ms_info['total_number'] += 1 + r
2727                             ms_info['s'].append({
2728                                 't': int(s.get('t', 0)),
2729                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2730                                 'd': int(s.attrib['d']),
2731                                 'r': r,
2732                             })
2733                 start_number = source.get('startNumber')
2734                 if start_number:
2735                     ms_info['start_number'] = int(start_number)
2736                 timescale = source.get('timescale')
2737                 if timescale:
2738                     ms_info['timescale'] = int(timescale)
2739                 segment_duration = source.get('duration')
2740                 if segment_duration:
2741                     ms_info['segment_duration'] = float(segment_duration)
2742
2743             def extract_Initialization(source):
2744                 initialization = source.find(_add_ns('Initialization'))
2745                 if initialization is not None:
2746                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2747
2748             segment_list = element.find(_add_ns('SegmentList'))
2749             if segment_list is not None:
2750                 extract_common(segment_list)
2751                 extract_Initialization(segment_list)
2752                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2753                 if segment_urls_e:
2754                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2755             else:
2756                 segment_template = element.find(_add_ns('SegmentTemplate'))
2757                 if segment_template is not None:
2758                     extract_common(segment_template)
2759                     media = segment_template.get('media')
2760                     if media:
2761                         ms_info['media'] = media
2762                     initialization = segment_template.get('initialization')
2763                     if initialization:
2764                         ms_info['initialization'] = initialization
2765                     else:
2766                         extract_Initialization(segment_template)
2767             return ms_info
2768
2769         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2770         formats, subtitles = [], {}
2771         stream_numbers = collections.defaultdict(int)
2772         for period in mpd_doc.findall(_add_ns('Period')):
2773             period_duration = parse_duration(period.get('duration')) or mpd_duration
2774             period_ms_info = extract_multisegment_info(period, {
2775                 'start_number': 1,
2776                 'timescale': 1,
2777             })
2778             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2779                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2780                 for representation in adaptation_set.findall(_add_ns('Representation')):
2781                     representation_attrib = adaptation_set.attrib.copy()
2782                     representation_attrib.update(representation.attrib)
2783                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2784                     mime_type = representation_attrib['mimeType']
2785                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2786
2787                     codecs = parse_codecs(representation_attrib.get('codecs', ''))
2788                     if content_type not in ('video', 'audio', 'text'):
2789                         if mime_type == 'image/jpeg':
2790                             content_type = mime_type
2791                         elif codecs['vcodec'] != 'none':
2792                             content_type = 'video'
2793                         elif codecs['acodec'] != 'none':
2794                             content_type = 'audio'
2795                         elif codecs.get('tcodec', 'none') != 'none':
2796                             content_type = 'text'
2797                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2798                             content_type = 'text'
2799                         else:
2800                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2801                             continue
2802
2803                     base_url = ''
2804                     for element in (representation, adaptation_set, period, mpd_doc):
2805                         base_url_e = element.find(_add_ns('BaseURL'))
2806                         if base_url_e is not None:
2807                             base_url = base_url_e.text + base_url
2808                             if re.match(r'^https?://', base_url):
2809                                 break
2810                     if mpd_base_url and base_url.startswith('/'):
2811                         base_url = compat_urlparse.urljoin(mpd_base_url, base_url)
2812                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2813                         if not mpd_base_url.endswith('/'):
2814                             mpd_base_url += '/'
2815                         base_url = mpd_base_url + base_url
2816                     representation_id = representation_attrib.get('id')
2817                     lang = representation_attrib.get('lang')
2818                     url_el = representation.find(_add_ns('BaseURL'))
2819                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2820                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2821                     if representation_id is not None:
2822                         format_id = representation_id
2823                     else:
2824                         format_id = content_type
2825                     if mpd_id:
2826                         format_id = mpd_id + '-' + format_id
2827                     if content_type in ('video', 'audio'):
2828                         f = {
2829                             'format_id': format_id,
2830                             'manifest_url': mpd_url,
2831                             'ext': mimetype2ext(mime_type),
2832                             'width': int_or_none(representation_attrib.get('width')),
2833                             'height': int_or_none(representation_attrib.get('height')),
2834                             'tbr': float_or_none(bandwidth, 1000),
2835                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2836                             'fps': int_or_none(representation_attrib.get('frameRate')),
2837                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2838                             'format_note': 'DASH %s' % content_type,
2839                             'filesize': filesize,
2840                             'container': mimetype2ext(mime_type) + '_dash',
2841                             **codecs
2842                         }
2843                     elif content_type == 'text':
2844                         f = {
2845                             'ext': mimetype2ext(mime_type),
2846                             'manifest_url': mpd_url,
2847                             'filesize': filesize,
2848                         }
2849                     elif content_type == 'image/jpeg':
2850                         # See test case in VikiIE
2851                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2852                         f = {
2853                             'format_id': format_id,
2854                             'ext': 'mhtml',
2855                             'manifest_url': mpd_url,
2856                             'format_note': 'DASH storyboards (jpeg)',
2857                             'acodec': 'none',
2858                             'vcodec': 'none',
2859                         }
2860                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2861                         f['has_drm'] = True
2862                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2863
2864                     def prepare_template(template_name, identifiers):
2865                         tmpl = representation_ms_info[template_name]
2866                         # First of, % characters outside $...$ templates
2867                         # must be escaped by doubling for proper processing
2868                         # by % operator string formatting used further (see
2869                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2870                         t = ''
2871                         in_template = False
2872                         for c in tmpl:
2873                             t += c
2874                             if c == '$':
2875                                 in_template = not in_template
2876                             elif c == '%' and not in_template:
2877                                 t += c
2878                         # Next, $...$ templates are translated to their
2879                         # %(...) counterparts to be used with % operator
2880                         if representation_id is not None:
2881                             t = t.replace('$RepresentationID$', representation_id)
2882                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2883                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2884                         t.replace('$$', '$')
2885                         return t
2886
2887                     # @initialization is a regular template like @media one
2888                     # so it should be handled just the same way (see
2889                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2890                     if 'initialization' in representation_ms_info:
2891                         initialization_template = prepare_template(
2892                             'initialization',
2893                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2894                             # $Time$ shall not be included for @initialization thus
2895                             # only $Bandwidth$ remains
2896                             ('Bandwidth', ))
2897                         representation_ms_info['initialization_url'] = initialization_template % {
2898                             'Bandwidth': bandwidth,
2899                         }
2900
2901                     def location_key(location):
2902                         return 'url' if re.match(r'^https?://', location) else 'path'
2903
2904                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2905
2906                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2907                         media_location_key = location_key(media_template)
2908
2909                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2910                         # can't be used at the same time
2911                         if '%(Number' in media_template and 's' not in representation_ms_info:
2912                             segment_duration = None
2913                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2914                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2915                                 representation_ms_info['total_number'] = int(math.ceil(
2916                                     float_or_none(period_duration, segment_duration, default=0)))
2917                             representation_ms_info['fragments'] = [{
2918                                 media_location_key: media_template % {
2919                                     'Number': segment_number,
2920                                     'Bandwidth': bandwidth,
2921                                 },
2922                                 'duration': segment_duration,
2923                             } for segment_number in range(
2924                                 representation_ms_info['start_number'],
2925                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2926                         else:
2927                             # $Number*$ or $Time$ in media template with S list available
2928                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2929                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2930                             representation_ms_info['fragments'] = []
2931                             segment_time = 0
2932                             segment_d = None
2933                             segment_number = representation_ms_info['start_number']
2934
2935                             def add_segment_url():
2936                                 segment_url = media_template % {
2937                                     'Time': segment_time,
2938                                     'Bandwidth': bandwidth,
2939                                     'Number': segment_number,
2940                                 }
2941                                 representation_ms_info['fragments'].append({
2942                                     media_location_key: segment_url,
2943                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2944                                 })
2945
2946                             for num, s in enumerate(representation_ms_info['s']):
2947                                 segment_time = s.get('t') or segment_time
2948                                 segment_d = s['d']
2949                                 add_segment_url()
2950                                 segment_number += 1
2951                                 for r in range(s.get('r', 0)):
2952                                     segment_time += segment_d
2953                                     add_segment_url()
2954                                     segment_number += 1
2955                                 segment_time += segment_d
2956                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2957                         # No media template
2958                         # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2959                         # or any YouTube dashsegments video
2960                         fragments = []
2961                         segment_index = 0
2962                         timescale = representation_ms_info['timescale']
2963                         for s in representation_ms_info['s']:
2964                             duration = float_or_none(s['d'], timescale)
2965                             for r in range(s.get('r', 0) + 1):
2966                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
2967                                 fragments.append({
2968                                     location_key(segment_uri): segment_uri,
2969                                     'duration': duration,
2970                                 })
2971                                 segment_index += 1
2972                         representation_ms_info['fragments'] = fragments
2973                     elif 'segment_urls' in representation_ms_info:
2974                         # Segment URLs with no SegmentTimeline
2975                         # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2976                         # https://github.com/ytdl-org/youtube-dl/pull/14844
2977                         fragments = []
2978                         segment_duration = float_or_none(
2979                             representation_ms_info['segment_duration'],
2980                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2981                         for segment_url in representation_ms_info['segment_urls']:
2982                             fragment = {
2983                                 location_key(segment_url): segment_url,
2984                             }
2985                             if segment_duration:
2986                                 fragment['duration'] = segment_duration
2987                             fragments.append(fragment)
2988                         representation_ms_info['fragments'] = fragments
2989                     # If there is a fragments key available then we correctly recognized fragmented media.
2990                     # Otherwise we will assume unfragmented media with direct access. Technically, such
2991                     # assumption is not necessarily correct since we may simply have no support for
2992                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2993                     if 'fragments' in representation_ms_info:
2994                         f.update({
2995                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
2996                             'url': mpd_url or base_url,
2997                             'fragment_base_url': base_url,
2998                             'fragments': [],
2999                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
3000                         })
3001                         if 'initialization_url' in representation_ms_info:
3002                             initialization_url = representation_ms_info['initialization_url']
3003                             if not f.get('url'):
3004                                 f['url'] = initialization_url
3005                             f['fragments'].append({location_key(initialization_url): initialization_url})
3006                         f['fragments'].extend(representation_ms_info['fragments'])
3007                         if not period_duration:
3008                             period_duration = try_get(
3009                                 representation_ms_info,
3010                                 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
3011                     else:
3012                         # Assuming direct URL to unfragmented media.
3013                         f['url'] = base_url
3014                     if content_type in ('video', 'audio', 'image/jpeg'):
3015                         f['manifest_stream_number'] = stream_numbers[f['url']]
3016                         stream_numbers[f['url']] += 1
3017                         formats.append(f)
3018                     elif content_type == 'text':
3019                         subtitles.setdefault(lang or 'und', []).append(f)
3020
3021         return formats, subtitles
3022
3023     def _extract_ism_formats(self, *args, **kwargs):
3024         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
3025         if subs:
3026             self._report_ignoring_subs('ISM')
3027         return fmts
3028
3029     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
3030         res = self._download_xml_handle(
3031             ism_url, video_id,
3032             note='Downloading ISM manifest' if note is None else note,
3033             errnote='Failed to download ISM manifest' if errnote is None else errnote,
3034             fatal=fatal, data=data, headers=headers, query=query)
3035         if res is False:
3036             return [], {}
3037         ism_doc, urlh = res
3038         if ism_doc is None:
3039             return [], {}
3040
3041         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
3042
3043     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
3044         """
3045         Parse formats from ISM manifest.
3046         References:
3047          1. [MS-SSTR]: Smooth Streaming Protocol,
3048             https://msdn.microsoft.com/en-us/library/ff469518.aspx
3049         """
3050         if ism_doc.get('IsLive') == 'TRUE':
3051             return [], {}
3052
3053         duration = int(ism_doc.attrib['Duration'])
3054         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
3055
3056         formats = []
3057         subtitles = {}
3058         for stream in ism_doc.findall('StreamIndex'):
3059             stream_type = stream.get('Type')
3060             if stream_type not in ('video', 'audio', 'text'):
3061                 continue
3062             url_pattern = stream.attrib['Url']
3063             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
3064             stream_name = stream.get('Name')
3065             stream_language = stream.get('Language', 'und')
3066             for track in stream.findall('QualityLevel'):
3067                 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
3068                 # TODO: add support for WVC1 and WMAP
3069                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
3070                     self.report_warning('%s is not a supported codec' % fourcc)
3071                     continue
3072                 tbr = int(track.attrib['Bitrate']) // 1000
3073                 # [1] does not mention Width and Height attributes. However,
3074                 # they're often present while MaxWidth and MaxHeight are
3075                 # missing, so should be used as fallbacks
3076                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3077                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3078                 sampling_rate = int_or_none(track.get('SamplingRate'))
3079
3080                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3081                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
3082
3083                 fragments = []
3084                 fragment_ctx = {
3085                     'time': 0,
3086                 }
3087                 stream_fragments = stream.findall('c')
3088                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3089                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3090                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3091                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3092                     if not fragment_ctx['duration']:
3093                         try:
3094                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
3095                         except IndexError:
3096                             next_fragment_time = duration
3097                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3098                     for _ in range(fragment_repeat):
3099                         fragments.append({
3100                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
3101                             'duration': fragment_ctx['duration'] / stream_timescale,
3102                         })
3103                         fragment_ctx['time'] += fragment_ctx['duration']
3104
3105                 if stream_type == 'text':
3106                     subtitles.setdefault(stream_language, []).append({
3107                         'ext': 'ismt',
3108                         'protocol': 'ism',
3109                         'url': ism_url,
3110                         'manifest_url': ism_url,
3111                         'fragments': fragments,
3112                         '_download_params': {
3113                             'stream_type': stream_type,
3114                             'duration': duration,
3115                             'timescale': stream_timescale,
3116                             'fourcc': fourcc,
3117                             'language': stream_language,
3118                             'codec_private_data': track.get('CodecPrivateData'),
3119                         }
3120                     })
3121                 elif stream_type in ('video', 'audio'):
3122                     formats.append({
3123                         'format_id': join_nonempty(ism_id, stream_name, tbr),
3124                         'url': ism_url,
3125                         'manifest_url': ism_url,
3126                         'ext': 'ismv' if stream_type == 'video' else 'isma',
3127                         'width': width,
3128                         'height': height,
3129                         'tbr': tbr,
3130                         'asr': sampling_rate,
3131                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
3132                         'acodec': 'none' if stream_type == 'video' else fourcc,
3133                         'protocol': 'ism',
3134                         'fragments': fragments,
3135                         'has_drm': ism_doc.find('Protection') is not None,
3136                         '_download_params': {
3137                             'stream_type': stream_type,
3138                             'duration': duration,
3139                             'timescale': stream_timescale,
3140                             'width': width or 0,
3141                             'height': height or 0,
3142                             'fourcc': fourcc,
3143                             'language': stream_language,
3144                             'codec_private_data': track.get('CodecPrivateData'),
3145                             'sampling_rate': sampling_rate,
3146                             'channels': int_or_none(track.get('Channels', 2)),
3147                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3148                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3149                         },
3150                     })
3151         return formats, subtitles
3152
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
        """
        Collect media entries from HTML5-style <video>/<audio> tags in webpage
        (including the amp-* and dl8-* variants matched below).

        base_url is used to resolve relative URLs and is set as the Referer
        header of every format. m3u8_id, m3u8_entry_protocol, preference and
        quality are forwarded to _extract_m3u8_formats; mpd_id is forwarded
        to _extract_mpd_formats.

        Returns a list of dicts with 'formats', 'subtitles' and 'thumbnail'
        keys; tags yielding neither formats nor subtitles are left out.
        """
        def absolute_url(item_url):
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            # Turn a <source type="..."> value into a partial format dict:
            # codec fields from parse_codecs plus 'ext' from the mimetype
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type, type_info={}):
            # Returns (is_plain_url, formats). Manifests (m3u8/mpd) are
            # expanded into their formats; anything else is treated as a
            # single direct URL
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, quality=quality, fatal=False)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                }]
            return is_plain_url, formats

        entries = []
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we will include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
        # Self-closing tags first; they have no inner content, hence the ''
        media_tags = [(media_tag, media_tag_name, media_type, '')
                      for media_tag, media_tag_name, media_type
                      in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
        for media_tag, _, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            # A src attribute on the media tag itself
            src = strip_or_none(media_attributes.get('src'))
            if src:
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            if media_content:
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    s_attr = extract_attributes(source_tag)
                    # data-video-src and data-src are non standard but seen
                    # several times in the wild
                    src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
                    if not src:
                        continue
                    f = parse_content_type(s_attr.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                    if is_plain_url:
                        # width, height, res, label and title attributes are
                        # all not standard but seen several times in the wild
                        labels = [
                            s_attr.get(lbl)
                            for lbl in ('label', 'title')
                            if str_or_none(s_attr.get(lbl))
                        ]
                        width = int_or_none(s_attr.get('width'))
                        height = (int_or_none(s_attr.get('height'))
                                  or int_or_none(s_attr.get('res')))
                        if not width or not height:
                            # Fill the missing dimension(s) from the labels
                            for lbl in labels:
                                resolution = parse_resolution(lbl)
                                if not resolution:
                                    continue
                                width = width or resolution.get('width')
                                height = height or resolution.get('height')
                        # The first label that parses as a bitrate wins;
                        # the else branch runs when none did
                        for lbl in labels:
                            tbr = parse_bitrate(lbl)
                            if tbr:
                                break
                        else:
                            tbr = None
                        f.update({
                            'width': width,
                            'height': height,
                            'tbr': tbr,
                            'format_id': s_attr.get('label') or s_attr.get('title'),
                        })
                        # Applied last, so 'url' and 'vcodec' of the plain
                        # format overwrite same-named keys collected so far
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    # Tracks without a kind are treated as subtitles too
                    if not kind or kind in ('subtitles', 'captions'):
                        src = strip_or_none(track_attributes.get('src'))
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            for f in media_info['formats']:
                f.setdefault('http_headers', {})['Referer'] = base_url
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
3275
3276     def _extract_akamai_formats(self, *args, **kwargs):
3277         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3278         if subs:
3279             self._report_ignoring_subs('akamai')
3280         return fmts
3281
3282     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3283         signed = 'hdnea=' in manifest_url
3284         if not signed:
3285             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3286             manifest_url = re.sub(
3287                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3288                 '', manifest_url).strip('?')
3289
3290         formats = []
3291         subtitles = {}
3292
3293         hdcore_sign = 'hdcore=3.7.0'
3294         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3295         hds_host = hosts.get('hds')
3296         if hds_host:
3297             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3298         if 'hdcore=' not in f4m_url:
3299             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3300         f4m_formats = self._extract_f4m_formats(
3301             f4m_url, video_id, f4m_id='hds', fatal=False)
3302         for entry in f4m_formats:
3303             entry.update({'extra_param_to_segment_url': hdcore_sign})
3304         formats.extend(f4m_formats)
3305
3306         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3307         hls_host = hosts.get('hls')
3308         if hls_host:
3309             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3310         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3311             m3u8_url, video_id, 'mp4', 'm3u8_native',
3312             m3u8_id='hls', fatal=False)
3313         formats.extend(m3u8_formats)
3314         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3315
3316         http_host = hosts.get('http')
3317         if http_host and m3u8_formats and not signed:
3318             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3319             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3320             qualities_length = len(qualities)
3321             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3322                 i = 0
3323                 for f in m3u8_formats:
3324                     if f['vcodec'] != 'none':
3325                         for protocol in ('http', 'https'):
3326                             http_f = f.copy()
3327                             del http_f['manifest_url']
3328                             http_url = re.sub(
3329                                 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3330                             http_f.update({
3331                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3332                                 'url': http_url,
3333                                 'protocol': protocol,
3334                             })
3335                             formats.append(http_f)
3336                         i += 1
3337
3338         return formats, subtitles
3339
3340     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3341         query = compat_urlparse.urlparse(url).query
3342         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3343         mobj = re.search(
3344             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3345         url_base = mobj.group('url')
3346         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3347         formats = []
3348
3349         def manifest_url(manifest):
3350             m_url = f'{http_base_url}/{manifest}'
3351             if query:
3352                 m_url += '?%s' % query
3353             return m_url
3354
3355         if 'm3u8' not in skip_protocols:
3356             formats.extend(self._extract_m3u8_formats(
3357                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3358                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3359         if 'f4m' not in skip_protocols:
3360             formats.extend(self._extract_f4m_formats(
3361                 manifest_url('manifest.f4m'),
3362                 video_id, f4m_id='hds', fatal=False))
3363         if 'dash' not in skip_protocols:
3364             formats.extend(self._extract_mpd_formats(
3365                 manifest_url('manifest.mpd'),
3366                 video_id, mpd_id='dash', fatal=False))
3367         if re.search(r'(?:/smil:|\.smil)', url_base):
3368             if 'smil' not in skip_protocols:
3369                 rtmp_formats = self._extract_smil_formats(
3370                     manifest_url('jwplayer.smil'),
3371                     video_id, fatal=False)
3372                 for rtmp_format in rtmp_formats:
3373                     rtsp_format = rtmp_format.copy()
3374                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3375                     del rtsp_format['play_path']
3376                     del rtsp_format['ext']
3377                     rtsp_format.update({
3378                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3379                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3380                         'protocol': 'rtsp',
3381                     })
3382                     formats.extend([rtmp_format, rtsp_format])
3383         else:
3384             for protocol in ('rtmp', 'rtsp'):
3385                 if protocol not in skip_protocols:
3386                     formats.append({
3387                         'url': f'{protocol}:{url_base}',
3388                         'format_id': protocol,
3389                         'protocol': protocol,
3390                     })
3391         return formats
3392
3393     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3394         mobj = re.search(
3395             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3396             webpage)
3397         if mobj:
3398             try:
3399                 jwplayer_data = self._parse_json(mobj.group('options'),
3400                                                  video_id=video_id,
3401                                                  transform_source=transform_source)
3402             except ExtractorError:
3403                 pass
3404             else:
3405                 if isinstance(jwplayer_data, dict):
3406                     return jwplayer_data
3407
3408     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3409         jwplayer_data = self._find_jwplayer_data(
3410             webpage, video_id, transform_source=js_to_json)
3411         return self._parse_jwplayer_data(
3412             jwplayer_data, video_id, *args, **kwargs)
3413
3414     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3415                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3416         # JWPlayer backward compatibility: flattened playlists
3417         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3418         if 'playlist' not in jwplayer_data:
3419             jwplayer_data = {'playlist': [jwplayer_data]}
3420
3421         entries = []
3422
3423         # JWPlayer backward compatibility: single playlist item
3424         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3425         if not isinstance(jwplayer_data['playlist'], list):
3426             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3427
3428         for video_data in jwplayer_data['playlist']:
3429             # JWPlayer backward compatibility: flattened sources
3430             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3431             if 'sources' not in video_data:
3432                 video_data['sources'] = [video_data]
3433
3434             this_video_id = video_id or video_data['mediaid']
3435
3436             formats = self._parse_jwplayer_formats(
3437                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3438                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3439
3440             subtitles = {}
3441             tracks = video_data.get('tracks')
3442             if tracks and isinstance(tracks, list):
3443                 for track in tracks:
3444                     if not isinstance(track, dict):
3445                         continue
3446                     track_kind = track.get('kind')
3447                     if not track_kind or not isinstance(track_kind, compat_str):
3448                         continue
3449                     if track_kind.lower() not in ('captions', 'subtitles'):
3450                         continue
3451                     track_url = urljoin(base_url, track.get('file'))
3452                     if not track_url:
3453                         continue
3454                     subtitles.setdefault(track.get('label') or 'en', []).append({
3455                         'url': self._proto_relative_url(track_url)
3456                     })
3457
3458             entry = {
3459                 'id': this_video_id,
3460                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3461                 'description': clean_html(video_data.get('description')),
3462                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3463                 'timestamp': int_or_none(video_data.get('pubdate')),
3464                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3465                 'subtitles': subtitles,
3466             }
3467             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3468             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3469                 entry.update({
3470                     '_type': 'url_transparent',
3471                     'url': formats[0]['url'],
3472                 })
3473             else:
3474                 self._sort_formats(formats)
3475                 entry['formats'] = formats
3476             entries.append(entry)
3477         if len(entries) == 1:
3478             return entries[0]
3479         else:
3480             return self.playlist_result(entries)
3481
3482     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3483                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3484         urls = []
3485         formats = []
3486         for source in jwplayer_sources_data:
3487             if not isinstance(source, dict):
3488                 continue
3489             source_url = urljoin(
3490                 base_url, self._proto_relative_url(source.get('file')))
3491             if not source_url or source_url in urls:
3492                 continue
3493             urls.append(source_url)
3494             source_type = source.get('type') or ''
3495             ext = mimetype2ext(source_type) or determine_ext(source_url)
3496             if source_type == 'hls' or ext == 'm3u8':
3497                 formats.extend(self._extract_m3u8_formats(
3498                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3499                     m3u8_id=m3u8_id, fatal=False))
3500             elif source_type == 'dash' or ext == 'mpd':
3501                 formats.extend(self._extract_mpd_formats(
3502                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3503             elif ext == 'smil':
3504                 formats.extend(self._extract_smil_formats(
3505                     source_url, video_id, fatal=False))
3506             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3507             elif source_type.startswith('audio') or ext in (
3508                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3509                 formats.append({
3510                     'url': source_url,
3511                     'vcodec': 'none',
3512                     'ext': ext,
3513                 })
3514             else:
3515                 height = int_or_none(source.get('height'))
3516                 if height is None:
3517                     # Often no height is provided but there is a label in
3518                     # format like "1080p", "720p SD", or 1080.
3519                     height = int_or_none(self._search_regex(
3520                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3521                         'height', default=None))
3522                 a_format = {
3523                     'url': source_url,
3524                     'width': int_or_none(source.get('width')),
3525                     'height': height,
3526                     'tbr': int_or_none(source.get('bitrate')),
3527                     'ext': ext,
3528                 }
3529                 if source_url.startswith('rtmp'):
3530                     a_format['ext'] = 'flv'
3531                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3532                     # of jwplayer.flash.swf
3533                     rtmp_url_parts = re.split(
3534                         r'((?:mp4|mp3|flv):)', source_url, 1)
3535                     if len(rtmp_url_parts) == 3:
3536                         rtmp_url, prefix, play_path = rtmp_url_parts
3537                         a_format.update({
3538                             'url': rtmp_url,
3539                             'play_path': prefix + play_path,
3540                         })
3541                     if rtmp_params:
3542                         a_format.update(rtmp_params)
3543                 formats.append(a_format)
3544         return formats
3545
3546     def _live_title(self, name):
3547         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3548         return name
3549
3550     def _int(self, v, name, fatal=False, **kwargs):
3551         res = int_or_none(v, **kwargs)
3552         if res is None:
3553             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3554             if fatal:
3555                 raise ExtractorError(msg)
3556             else:
3557                 self.report_warning(msg)
3558         return res
3559
3560     def _float(self, v, name, fatal=False, **kwargs):
3561         res = float_or_none(v, **kwargs)
3562         if res is None:
3563             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3564             if fatal:
3565                 raise ExtractorError(msg)
3566             else:
3567                 self.report_warning(msg)
3568         return res
3569
3570     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3571                     path='/', secure=False, discard=False, rest={}, **kwargs):
3572         cookie = compat_cookiejar_Cookie(
3573             0, name, value, port, port is not None, domain, True,
3574             domain.startswith('.'), path, True, secure, expire_time,
3575             discard, None, None, rest)
3576         self._downloader.cookiejar.set_cookie(cookie)
3577
3578     def _get_cookies(self, url):
3579         """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3580         req = sanitized_Request(url)
3581         self._downloader.cookiejar.add_cookie_header(req)
3582         return compat_cookies_SimpleCookie(req.get_header('Cookie'))
3583
3584     def _apply_first_set_cookie_header(self, url_handle, cookie):
3585         """
3586         Apply first Set-Cookie header instead of the last. Experimental.
3587
3588         Some sites (e.g. [1-3]) may serve two cookies under the same name
3589         in Set-Cookie header and expect the first (old) one to be set rather
3590         than second (new). However, as of RFC6265 the newer one cookie
3591         should be set into cookie store what actually happens.
3592         We will workaround this issue by resetting the cookie to
3593         the first one manually.
3594         1. https://new.vk.com/
3595         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3596         3. https://learning.oreilly.com/
3597         """
3598         for header, cookies in url_handle.headers.items():
3599             if header.lower() != 'set-cookie':
3600                 continue
3601             cookies = cookies.encode('iso-8859-1').decode('utf-8')
3602             cookie_value = re.search(
3603                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3604             if cookie_value:
3605                 value, domain = cookie_value.groups()
3606                 self._set_cookie(domain, cookie, value)
3607                 break
3608
3609     def get_testcases(self, include_onlymatching=False):
3610         t = getattr(self, '_TEST', None)
3611         if t:
3612             assert not hasattr(self, '_TESTS'), \
3613                 '%s has _TEST and _TESTS' % type(self).__name__
3614             tests = [t]
3615         else:
3616             tests = getattr(self, '_TESTS', [])
3617         for t in tests:
3618             if not include_onlymatching and t.get('only_matching', False):
3619                 continue
3620             t['name'] = type(self).__name__[:-len('IE')]
3621             yield t
3622
3623     def is_suitable(self, age_limit):
3624         """ Test whether the extractor is generally suitable for the given
3625         age limit (i.e. pornographic sites are not, all others usually are) """
3626
3627         any_restricted = False
3628         for tc in self.get_testcases(include_onlymatching=False):
3629             if tc.get('playlist', []):
3630                 tc = tc['playlist'][0]
3631             is_restricted = age_restricted(
3632                 tc.get('info_dict', {}).get('age_limit'), age_limit)
3633             if not is_restricted:
3634                 return True
3635             any_restricted = any_restricted or is_restricted
3636         return not any_restricted
3637
3638     def extract_subtitles(self, *args, **kwargs):
3639         if (self.get_param('writesubtitles', False)
3640                 or self.get_param('listsubtitles')):
3641             return self._get_subtitles(*args, **kwargs)
3642         return {}
3643
3644     def _get_subtitles(self, *args, **kwargs):
3645         raise NotImplementedError('This method must be implemented by subclasses')
3646
3647     def extract_comments(self, *args, **kwargs):
3648         if not self.get_param('getcomments'):
3649             return None
3650         generator = self._get_comments(*args, **kwargs)
3651
3652         def extractor():
3653             comments = []
3654             interrupted = True
3655             try:
3656                 while True:
3657                     comments.append(next(generator))
3658             except StopIteration:
3659                 interrupted = False
3660             except KeyboardInterrupt:
3661                 self.to_screen('Interrupted by user')
3662             except Exception as e:
3663                 if self.get_param('ignoreerrors') is not True:
3664                     raise
3665                 self._downloader.report_error(e)
3666             comment_count = len(comments)
3667             self.to_screen(f'Extracted {comment_count} comments')
3668             return {
3669                 'comments': comments,
3670                 'comment_count': None if interrupted else comment_count
3671             }
3672         return extractor
3673
3674     def _get_comments(self, *args, **kwargs):
3675         raise NotImplementedError('This method must be implemented by subclasses')
3676
3677     @staticmethod
3678     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3679         """ Merge subtitle items for one language. Items with duplicated URLs/data
3680         will be dropped. """
3681         list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3682         ret = list(subtitle_list1)
3683         ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3684         return ret
3685
3686     @classmethod
3687     def _merge_subtitles(cls, *dicts, target=None):
3688         """ Merge subtitle dictionaries, language by language. """
3689         if target is None:
3690             target = {}
3691         for d in dicts:
3692             for lang, subs in d.items():
3693                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3694         return target
3695
3696     def extract_automatic_captions(self, *args, **kwargs):
3697         if (self.get_param('writeautomaticsub', False)
3698                 or self.get_param('listsubtitles')):
3699             return self._get_automatic_captions(*args, **kwargs)
3700         return {}
3701
3702     def _get_automatic_captions(self, *args, **kwargs):
3703         raise NotImplementedError('This method must be implemented by subclasses')
3704
3705     def mark_watched(self, *args, **kwargs):
3706         if not self.get_param('mark_watched', False):
3707             return
3708         if (self.supports_login() and self._get_login_info()[0] is not None
3709                 or self.get_param('cookiefile') or self.get_param('cookiesfrombrowser')):
3710             self._mark_watched(*args, **kwargs)
3711
3712     def _mark_watched(self, *args, **kwargs):
3713         raise NotImplementedError('This method must be implemented by subclasses')
3714
3715     def geo_verification_headers(self):
3716         headers = {}
3717         geo_verification_proxy = self.get_param('geo_verification_proxy')
3718         if geo_verification_proxy:
3719             headers['Ytdl-request-proxy'] = geo_verification_proxy
3720         return headers
3721
3722     def _generic_id(self, url):
3723         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3724
3725     def _generic_title(self, url):
3726         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3727
3728     @staticmethod
3729     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3730         all_known = all(map(
3731             lambda x: x is not None,
3732             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3733         return (
3734             'private' if is_private
3735             else 'premium_only' if needs_premium
3736             else 'subscriber_only' if needs_subscription
3737             else 'needs_auth' if needs_auth
3738             else 'unlisted' if is_unlisted
3739             else 'public' if all_known
3740             else None)
3741
3742     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3743         '''
3744         @returns            A list of values for the extractor argument given by "key"
3745                             or "default" if no such key is present
3746         @param default      The default value to return when the key is not present (default: [])
3747         @param casesense    When false, the values are converted to lower case
3748         '''
3749         val = traverse_obj(
3750             self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key))
3751         if val is None:
3752             return [] if default is NO_DEFAULT else default
3753         return list(val) if casesense else [x.lower() for x in val]
3754
3755     def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3756         if not playlist_id or not video_id:
3757             return not video_id
3758
3759         no_playlist = (smuggled_data or {}).get('force_noplaylist')
3760         if no_playlist is not None:
3761             return not no_playlist
3762
3763         video_id = '' if video_id is True else f' {video_id}'
3764         playlist_id = '' if playlist_id is True else f' {playlist_id}'
3765         if self.get_param('noplaylist'):
3766             self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3767             return False
3768         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3769         return True
3770
3771
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for extractors of paged search queries.

    Queries have the form "<_SEARCH_KEY><prefix>:<query>", where the prefix
    is empty (only the first result), a positive number (that many results)
    or "all" (as many as _MAX_RESULTS allows).
    Subclasses should define _SEARCH_KEY and optionally _MAX_RESULTS.
    """

    _MAX_RESULTS = float('inf')

    @classmethod
    def _make_valid_url(cls):
        """Build the regex, with "prefix" and "query" groups, that matches this extractor's queries"""
        search_key = cls._SEARCH_KEY
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % search_key

    def _real_extract(self, query):
        """Split the query into prefix and search terms and fetch the number of results the prefix asks for"""
        prefix, query = self._match_valid_url(query).group('prefix', 'query')
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        if prefix == '':
            return self._get_n_results(query, 1)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError(f'invalid download number {n} for query "{query}"')
        if n > self._MAX_RESULTS:
            # Clamp the request to what the site can deliver
            self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        results = self._search_results(query)
        # An infinite count means "no limit", which islice expresses as None
        limit = None if n == float('inf') else n
        return self.playlist_result(itertools.islice(results, 0, limit), query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        """Read-only access to the subclass-defined _SEARCH_KEY"""
        return self._SEARCH_KEY