yt_dlp/extractor/common.py

   1 import base64
   2 import collections
   3 import hashlib
   4 import itertools
   5 import json
   6 import math
   7 import netrc
   8 import os
   9 import random
  10 import sys
  11 import time
  12 import xml.etree.ElementTree
  13
  14 from ..compat import (
  15     compat_cookiejar_Cookie,
  16     compat_cookies_SimpleCookie,
  17     compat_etree_fromstring,
  18     compat_expanduser,
  19     compat_getpass,
  20     compat_http_client,
  21     compat_os_name,
  22     compat_str,
  23     compat_urllib_error,
  24     compat_urllib_parse_unquote,
  25     compat_urllib_parse_urlencode,
  26     compat_urllib_request,
  27     compat_urlparse,
  28     re,
  29 )
  30 from ..downloader import FileDownloader
  31 from ..downloader.f4m import get_base_url, remove_encrypted_media
  32 from ..utils import (
  33     JSON_LD_RE,
  34     NO_DEFAULT,
  35     ExtractorError,
  36     GeoRestrictedError,
  37     GeoUtils,
  38     RegexNotFoundError,
  39     UnsupportedError,
  40     age_restricted,
  41     base_url,
  42     bug_reports_message,
  43     clean_html,
  44     determine_ext,
  45     determine_protocol,
  46     dict_get,
  47     encode_data_uri,
  48     error_to_compat_str,
  49     extract_attributes,
  50     filter_dict,
  51     fix_xml_ampersands,
  52     float_or_none,
  53     format_field,
  54     int_or_none,
  55     join_nonempty,
  56     js_to_json,
  57     mimetype2ext,
  58     network_exceptions,
  59     orderedSet,
  60     parse_bitrate,
  61     parse_codecs,
  62     parse_duration,
  63     parse_iso8601,
  64     parse_m3u8_attributes,
  65     parse_resolution,
  66     sanitize_filename,
  67     sanitized_Request,
  68     str_or_none,
  69     str_to_int,
  70     strip_or_none,
  71     traverse_obj,
  72     try_get,
  73     unescapeHTML,
  74     unified_strdate,
  75     unified_timestamp,
  76     update_Request,
  77     update_url_query,
  78     url_basename,
  79     url_or_none,
  80     urljoin,
  81     variadic,
  82     xpath_element,
  83     xpath_text,
  84     xpath_with_ns,
  85 )
  86
  87
  88 class InfoExtractor:
  89     """Information Extractor class.
  90
  91     Information extractors are the classes that, given a URL, extract
  92     information about the video (or videos) the URL refers to. This
  93     information includes the real video URL, the video title, author and
  94     others. The information is stored in a dictionary which is then
  95     passed to the YoutubeDL. The YoutubeDL processes this
  96     information possibly downloading the video to the file system, among
  97     other possible outcomes.
  98
  99     The type field determines the type of the result.
 100     By far the most common value (and the default if _type is missing) is
 101     "video", which indicates a single video.
 102
 103     For a video, the dictionaries must include the following fields:
 104
 105     id:             Video identifier.
 106     title:          Video title, unescaped.
 107
 108     Additionally, it must contain either a formats entry or a url one:
 109
 110     formats:        A list of dictionaries for each format available, ordered
 111                     from worst to best quality.
 112
 113                     Potential fields:
 114                     * url        The mandatory URL representing the media:
 115                                    for plain file media - HTTP URL of this file,
 116                                    for RTMP - RTMP URL,
 117                                    for HLS - URL of the M3U8 media playlist,
 118                                    for HDS - URL of the F4M manifest,
 119                                    for DASH
 120                                      - HTTP URL to plain file media (in case of
 121                                        unfragmented media)
 122                                      - URL of the MPD manifest or base URL
 123                                        representing the media if MPD manifest
 124                                        is parsed from a string (in case of
 125                                        fragmented media)
 126                                    for MSS - URL of the ISM manifest.
 127                     * manifest_url
 128                                  The URL of the manifest file in case of
 129                                  fragmented media:
 130                                    for HLS - URL of the M3U8 master playlist,
 131                                    for HDS - URL of the F4M manifest,
 132                                    for DASH - URL of the MPD manifest,
 133                                    for MSS - URL of the ISM manifest.
 134                     * manifest_stream_number  (For internal use only)
 135                                  The index of the stream in the manifest file
 136                     * ext        Will be calculated from URL if missing
 137                     * format     A human-readable description of the format
 138                                  ("mp4 container with h264/opus").
 139                                  Calculated from the format_id, width, height.
 140                                  and format_note fields if missing.
 141                     * format_id  A short description of the format
 142                                  ("mp4_h264_opus" or "19").
 143                                 Technically optional, but strongly recommended.
 144                     * format_note Additional info about the format
 145                                  ("3D" or "DASH video")
 146                     * width      Width of the video, if known
 147                     * height     Height of the video, if known
 148                     * resolution Textual description of width and height
 149                     * dynamic_range The dynamic range of the video. One of:
 150                                  "SDR" (None), "HDR10", "HDR10+, "HDR12", "HLG, "DV"
 151                     * tbr        Average bitrate of audio and video in KBit/s
 152                     * abr        Average audio bitrate in KBit/s
 153                     * acodec     Name of the audio codec in use
 154                     * asr        Audio sampling rate in Hertz
 155                     * vbr        Average video bitrate in KBit/s
 156                     * fps        Frame rate
 157                     * vcodec     Name of the video codec in use
 158                     * container  Name of the container format
 159                     * filesize   The number of bytes, if known in advance
 160                     * filesize_approx  An estimate for the number of bytes
 161                     * player_url SWF Player URL (used for rtmpdump).
 162                     * protocol   The protocol that will be used for the actual
 163                                  download, lower-case. One of "http", "https" or
 164                                  one of the protocols defined in downloader.PROTOCOL_MAP
 165                     * fragment_base_url
 166                                  Base URL for fragments. Each fragment's path
 167                                  value (if present) will be relative to
 168                                  this URL.
 169                     * fragments  A list of fragments of a fragmented media.
 170                                  Each fragment entry must contain either an url
 171                                  or a path. If an url is present it should be
 172                                  considered by a client. Otherwise both path and
 173                                  fragment_base_url must be present. Here is
 174                                  the list of all potential fields:
 175                                  * "url" - fragment's URL
 176                                  * "path" - fragment's path relative to
 177                                             fragment_base_url
 178                                  * "duration" (optional, int or float)
 179                                  * "filesize" (optional, int)
 180                     * is_from_start  Is a live format that can be downloaded
 181                                 from the start. Boolean
 182                     * preference Order number of this format. If this field is
 183                                  present and not None, the formats get sorted
 184                                  by this field, regardless of all other values.
 185                                  -1 for default (order by other properties),
 186                                  -2 or smaller for less than default.
 187                                  < -1000 to hide the format (if there is
 188                                     another one which is strictly better)
 189                     * language   Language code, e.g. "de" or "en-US".
 190                     * language_preference  Is this in the language mentioned in
 191                                  the URL?
 192                                  10 if it's what the URL is about,
 193                                  -1 for default (don't know),
 194                                  -10 otherwise, other values reserved for now.
 195                     * quality    Order number of the video quality of this
 196                                  format, irrespective of the file format.
 197                                  -1 for default (order by other properties),
 198                                  -2 or smaller for less than default.
 199                     * source_preference  Order number for this video source
 200                                   (quality takes higher priority)
 201                                  -1 for default (order by other properties),
 202                                  -2 or smaller for less than default.
 203                     * http_headers  A dictionary of additional HTTP headers
 204                                  to add to the request.
 205                     * stretched_ratio  If given and not 1, indicates that the
 206                                  video's pixels are not square.
 207                                  width : height ratio as float.
 208                     * no_resume  The server does not support resuming the
 209                                  (HTTP or RTMP) download. Boolean.
 210                     * has_drm    The format has DRM and cannot be downloaded. Boolean
 211                     * downloader_options  A dictionary of downloader options
 212                                  (For internal use only)
 213                                  * http_chunk_size Chunk size for HTTP downloads
 214                                  * ffmpeg_args     Extra arguments for ffmpeg downloader
 215                     RTMP formats can also have the additional fields: page_url,
 216                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 217                     rtmp_protocol, rtmp_real_time
 218
 219     url:            Final video URL.
 220     ext:            Video filename extension.
 221     format:         The video format, defaults to ext (used for --get-format)
 222     player_url:     SWF Player URL (used for rtmpdump).
 223
 224     The following fields are optional:
 225
 226     direct:         True if a direct video file was given (must only be set by GenericIE)
 227     alt_title:      A secondary title of the video.
 228     display_id      An alternative identifier for the video, not necessarily
 229                     unique, but available before title. Typically, id is
 230                     something like "4234987", title "Dancing naked mole rats",
 231                     and display_id "dancing-naked-mole-rats"
 232     thumbnails:     A list of dictionaries, with the following entries:
 233                         * "id" (optional, string) - Thumbnail format ID
 234                         * "url"
 235                         * "preference" (optional, int) - quality of the image
 236                         * "width" (optional, int)
 237                         * "height" (optional, int)
 238                         * "resolution" (optional, string "{width}x{height}",
 239                                         deprecated)
 240                         * "filesize" (optional, int)
 241                         * "http_headers" (dict) - HTTP headers for the request
 242     thumbnail:      Full URL to a video thumbnail image.
 243     description:    Full video description.
 244     uploader:       Full name of the video uploader.
 245     license:        License name the video is licensed under.
 246     creator:        The creator of the video.
 247     timestamp:      UNIX timestamp of the moment the video was uploaded
 248     upload_date:    Video upload date in UTC (YYYYMMDD).
 249                     If not explicitly set, calculated from timestamp
 250     release_timestamp: UNIX timestamp of the moment the video was released.
 251                     If it is not clear whether to use timestamp or this, use the former
 252     release_date:   The date (YYYYMMDD) when the video was released in UTC.
 253                     If not explicitly set, calculated from release_timestamp
 254     modified_timestamp: UNIX timestamp of the moment the video was last modified.
 255     modified_date:   The date (YYYYMMDD) when the video was last modified in UTC.
 256                     If not explicitly set, calculated from modified_timestamp
 257     uploader_id:    Nickname or id of the video uploader.
 258     uploader_url:   Full URL to a personal webpage of the video uploader.
 259     channel:        Full name of the channel the video is uploaded on.
 260                     Note that channel fields may or may not repeat uploader
 261                     fields. This depends on a particular extractor.
 262     channel_id:     Id of the channel.
 263     channel_url:    Full URL to a channel webpage.
 264     channel_follower_count: Number of followers of the channel.
 265     location:       Physical location where the video was filmed.
 266     subtitles:      The available subtitles as a dictionary in the format
 267                     {tag: subformats}. "tag" is usually a language code, and
 268                     "subformats" is a list sorted from lower to higher
 269                     preference, each element is a dictionary with the "ext"
 270                     entry and one of:
 271                         * "data": The subtitles file contents
 272                         * "url": A URL pointing to the subtitles file
 273                     It can optionally also have:
 274                         * "name": Name or description of the subtitles
 275                         * "http_headers": A dictionary of additional HTTP headers
 276                                   to add to the request.
 277                     "ext" will be calculated from URL if missing
 278     automatic_captions: Like 'subtitles'; contains automatically generated
 279                     captions instead of normal subtitles
 280     duration:       Length of the video in seconds, as an integer or float.
 281     view_count:     How many users have watched the video on the platform.
 282     like_count:     Number of positive ratings of the video
 283     dislike_count:  Number of negative ratings of the video
 284     repost_count:   Number of reposts of the video
 285     average_rating: Average rating give by users, the scale used depends on the webpage
 286     comment_count:  Number of comments on the video
 287     comments:       A list of comments, each with one or more of the following
 288                     properties (all but one of text or html optional):
 289                         * "author" - human-readable name of the comment author
 290                         * "author_id" - user ID of the comment author
 291                         * "author_thumbnail" - The thumbnail of the comment author
 292                         * "id" - Comment ID
 293                         * "html" - Comment as HTML
 294                         * "text" - Plain text of the comment
 295                         * "timestamp" - UNIX timestamp of comment
 296                         * "parent" - ID of the comment this one is replying to.
 297                                      Set to "root" to indicate that this is a
 298                                      comment to the original video.
 299                         * "like_count" - Number of positive ratings of the comment
 300                         * "dislike_count" - Number of negative ratings of the comment
 301                         * "is_favorited" - Whether the comment is marked as
 302                                            favorite by the video uploader
 303                         * "author_is_uploader" - Whether the comment is made by
 304                                                  the video uploader
 305     age_limit:      Age restriction for the video, as an integer (years)
 306     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 307                     should allow to get the same result again. (It will be set
 308                     by YoutubeDL if it's missing)
 309     categories:     A list of categories that the video falls in, for example
 310                     ["Sports", "Berlin"]
 311     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 312     cast:           A list of the video cast
 313     is_live:        True, False, or None (=unknown). Whether this video is a
 314                     live stream that goes on instead of a fixed-length video.
 315     was_live:       True, False, or None (=unknown). Whether this video was
 316                     originally a live stream.
 317     live_status:    'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
 318                     If absent, automatically set from is_live, was_live
 319     start_time:     Time in seconds where the reproduction should start, as
 320                     specified in the URL.
 321     end_time:       Time in seconds where the reproduction should end, as
 322                     specified in the URL.
 323     chapters:       A list of dictionaries, with the following entries:
 324                         * "start_time" - The start time of the chapter in seconds
 325                         * "end_time" - The end time of the chapter in seconds
 326                         * "title" (optional, string)
 327     playable_in_embed: Whether this video is allowed to play in embedded
 328                     players on other sites. Can be True (=always allowed),
 329                     False (=never allowed), None (=unknown), or a string
 330                     specifying the criteria for embedability (Eg: 'whitelist')
 331     availability:   Under what condition the video is available. One of
 332                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 333                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 334                     to set it
 335     __post_extractor: A function to be called just before the metadata is
 336                     written to either disk, logger or console. The function
 337                     must return a dict which will be added to the info_dict.
 338                     This is usefull for additional information that is
 339                     time-consuming to extract. Note that the fields thus
 340                     extracted will not be available to output template and
 341                     match_filter. So, only "comments" and "comment_count" are
 342                     currently allowed to be extracted via this method.
 343
 344     The following fields should only be used when the video belongs to some logical
 345     chapter or section:
 346
 347     chapter:        Name or title of the chapter the video belongs to.
 348     chapter_number: Number of the chapter the video belongs to, as an integer.
 349     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 350
 351     The following fields should only be used when the video is an episode of some
 352     series, programme or podcast:
 353
 354     series:         Title of the series or programme the video episode belongs to.
 355     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 356     season:         Title of the season the video episode belongs to.
 357     season_number:  Number of the season the video episode belongs to, as an integer.
 358     season_id:      Id of the season the video episode belongs to, as a unicode string.
 359     episode:        Title of the video episode. Unlike mandatory video title field,
 360                     this field should denote the exact title of the video episode
 361                     without any kind of decoration.
 362     episode_number: Number of the video episode within a season, as an integer.
 363     episode_id:     Id of the video episode, as a unicode string.
 364
 365     The following fields should only be used when the media is a track or a part of
 366     a music album:
 367
 368     track:          Title of the track.
 369     track_number:   Number of the track within an album or a disc, as an integer.
 370     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 371                     as a unicode string.
 372     artist:         Artist(s) of the track.
 373     genre:          Genre(s) of the track.
 374     album:          Title of the album the track belongs to.
 375     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 376     album_artist:   List of all artists appeared on the album (e.g.
 377                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 378                     and compilations).
 379     disc_number:    Number of the disc or other physical medium the track belongs to,
 380                     as an integer.
 381     release_year:   Year (YYYY) when the album was released.
 382     composer:       Composer of the piece
 383
 384     Unless mentioned otherwise, the fields should be Unicode strings.
 385
 386     Unless mentioned otherwise, None is equivalent to absence of information.
 387
 388
 389     _type "playlist" indicates multiple videos.
 390     There must be a key "entries", which is a list, an iterable, or a PagedList
 391     object, each element of which is a valid dictionary by this specification.
 392
 393     Additionally, playlists can have "id", "title", and any other relevent
 394     attributes with the same semantics as videos (see above).
 395
 396     It can also have the following optional fields:
 397
 398     playlist_count: The total number of videos in a playlist. If not given,
 399                     YoutubeDL tries to calculate it from "entries"
 400
 401
 402     _type "multi_video" indicates that there are multiple videos that
 403     form a single show, for examples multiple acts of an opera or TV episode.
 404     It must have an entries key like a playlist and contain all the keys
 405     required for a video at the same time.
 406
 407
 408     _type "url" indicates that the video must be extracted from another
 409     location, possibly by a different extractor. Its only required key is:
 410     "url" - the next URL to extract.
 411     The key "ie_key" can be set to the class name (minus the trailing "IE",
 412     e.g. "Youtube") if the extractor class is known in advance.
 413     Additionally, the dictionary may have any properties of the resolved entity
 414     known in advance, for example "title" if the title of the referred video is
 415     known ahead of time.
 416
 417
 418     _type "url_transparent" entities have the same specification as "url", but
 419     indicate that the given additional information is more precise than the one
 420     associated with the resolved URL.
 421     This is useful when a site employs a video service that hosts the video and
 422     its technical metadata, but that video service does not embed a useful
 423     title, description etc.
 424
 425
 426     Subclasses of this should define a _VALID_URL regexp and, re-define the
 427     _real_extract() and (optionally) _real_initialize() methods.
 428     Probably, they should also be added to the list of extractors.
 429
 430     Subclasses may also override suitable() if necessary, but ensure the function
 431     signature is preserved and that this function imports everything it needs
 432     (except other extractors), so that lazy_extractors works correctly.
 433
 434     To support username + password (or netrc) login, the extractor must define a
 435     _NETRC_MACHINE and re-define _perform_login(username, password) and
 436     (optionally) _initialize_pre_login() methods. The _perform_login method will
 437     be called between _initialize_pre_login and _real_initialize if credentials
 438     are passed by the user. In cases where it is necessary to have the login
 439     process as part of the extraction rather than initialization, _perform_login
 440     can be left undefined.
 441
 442     _GEO_BYPASS attribute may be set to False in order to disable
 443     geo restriction bypass mechanisms for a particular extractor.
 444     Though it won't disable explicit geo restriction bypass based on
 445     country code provided with geo_bypass_country.
 446
 447     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 448     countries for this extractor. One of these countries will be used by
 449     geo restriction bypass mechanism right away in order to bypass
 450     geo restriction, of course, if the mechanism is not disabled.
 451
 452     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 453     IP blocks in CIDR notation for this extractor. One of these IP blocks
 454     will be used by geo restriction bypass mechanism similarly
 455     to _GEO_COUNTRIES.
 456
 457     The _WORKING attribute should be set to False for broken IEs
 458     in order to warn the users and skip the tests.
 459     """
 460
 461     _ready = False
 462     _downloader = None
 463     _x_forwarded_for_ip = None
 464     _GEO_BYPASS = True
 465     _GEO_COUNTRIES = None
 466     _GEO_IP_BLOCKS = None
 467     _WORKING = True
 468     _NETRC_MACHINE = None
 469     IE_DESC = None
 470
 471     _LOGIN_HINTS = {
 472         'any': 'Use --cookies, --cookies-from-browser, --username and --password, or --netrc to provide account credentials',
 473         'cookies': (
 474             'Use --cookies-from-browser or --cookies for the authentication. '
 475             'See  https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl  for how to manually pass cookies'),
 476         'password': 'Use --username and --password, or --netrc to provide account credentials',
 477     }
 478
 479     def __init__(self, downloader=None):
 480         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 481         If a downloader is not passed during initialization,
 482         it must be set using "set_downloader()" before "extract()" is called"""
 483         self._ready = False
 484         self._x_forwarded_for_ip = None
 485         self._printed_messages = set()
 486         self.set_downloader(downloader)
 487
 488     @classmethod
 489     def _match_valid_url(cls, url):
 490         # This does not use has/getattr intentionally - we want to know whether
 491         # we have cached the regexp for *this* class, whereas getattr would also
 492         # match the superclass
 493         if '_VALID_URL_RE' not in cls.__dict__:
 494             if '_VALID_URL' not in cls.__dict__:
 495                 cls._VALID_URL = cls._make_valid_url()
 496             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 497         return cls._VALID_URL_RE.match(url)
 498
 499     @classmethod
 500     def suitable(cls, url):
 501         """Receives a URL and returns True if suitable for this IE."""
 502         # This function must import everything it needs (except other extractors),
 503         # so that lazy_extractors works correctly
 504         return cls._match_valid_url(url) is not None
 505
 506     @classmethod
 507     def _match_id(cls, url):
 508         return cls._match_valid_url(url).group('id')
 509
 510     @classmethod
 511     def get_temp_id(cls, url):
 512         try:
 513             return cls._match_id(url)
 514         except (IndexError, AttributeError):
 515             return None
 516
 517     @classmethod
 518     def working(cls):
 519         """Getter method for _WORKING."""
 520         return cls._WORKING
 521
 522     @classmethod
 523     def supports_login(cls):
 524         return bool(cls._NETRC_MACHINE)
 525
 526     def initialize(self):
 527         """Initializes an instance (authentication, etc)."""
 528         self._printed_messages = set()
 529         self._initialize_geo_bypass({
 530             'countries': self._GEO_COUNTRIES,
 531             'ip_blocks': self._GEO_IP_BLOCKS,
 532         })
 533         if not self._ready:
 534             self._initialize_pre_login()
 535             if self.supports_login():
 536                 username, password = self._get_login_info()
 537                 if username:
 538                     self._perform_login(username, password)
 539             elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
 540                 self.report_warning(f'Login with password is not supported for this website. {self._LOGIN_HINTS["cookies"]}')
 541             self._real_initialize()
 542             self._ready = True
 543
 544     def _initialize_geo_bypass(self, geo_bypass_context):
 545         """
 546         Initialize geo restriction bypass mechanism.
 547
 548         This method is used to initialize geo bypass mechanism based on faking
 549         X-Forwarded-For HTTP header. A random country from provided country list
 550         is selected and a random IP belonging to this country is generated. This
 551         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 552         HTTP requests.
 553
 554         This method will be used for initial geo bypass mechanism initialization
 555         during the instance initialization with _GEO_COUNTRIES and
 556         _GEO_IP_BLOCKS.
 557
 558         You may also manually call it from extractor's code if geo bypass
 559         information is not available beforehand (e.g. obtained during
 560         extraction) or due to some other reason. In this case you should pass
 561         this information in geo bypass context passed as first argument. It may
 562         contain following fields:
 563
 564         countries:  List of geo unrestricted countries (similar
 565                     to _GEO_COUNTRIES)
 566         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 567                     (similar to _GEO_IP_BLOCKS)
 568
 569         """
 570         if not self._x_forwarded_for_ip:
 571
 572             # Geo bypass mechanism is explicitly disabled by user
 573             if not self.get_param('geo_bypass', True):
 574                 return
 575
 576             if not geo_bypass_context:
 577                 geo_bypass_context = {}
 578
 579             # Backward compatibility: previously _initialize_geo_bypass
 580             # expected a list of countries, some 3rd party code may still use
 581             # it this way
 582             if isinstance(geo_bypass_context, (list, tuple)):
 583                 geo_bypass_context = {
 584                     'countries': geo_bypass_context,
 585                 }
 586
 587             # The whole point of geo bypass mechanism is to fake IP
 588             # as X-Forwarded-For HTTP header based on some IP block or
 589             # country code.
 590
 591             # Path 1: bypassing based on IP block in CIDR notation
 592
 593             # Explicit IP block specified by user, use it right away
 594             # regardless of whether extractor is geo bypassable or not
 595             ip_block = self.get_param('geo_bypass_ip_block', None)
 596
 597             # Otherwise use random IP block from geo bypass context but only
 598             # if extractor is known as geo bypassable
 599             if not ip_block:
 600                 ip_blocks = geo_bypass_context.get('ip_blocks')
 601                 if self._GEO_BYPASS and ip_blocks:
 602                     ip_block = random.choice(ip_blocks)
 603
 604             if ip_block:
 605                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 606                 self._downloader.write_debug(
 607                     '[debug] Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
 608                 return
 609
 610             # Path 2: bypassing based on country code
 611
 612             # Explicit country code specified by user, use it right away
 613             # regardless of whether extractor is geo bypassable or not
 614             country = self.get_param('geo_bypass_country', None)
 615
 616             # Otherwise use random country code from geo bypass context but
 617             # only if extractor is known as geo bypassable
 618             if not country:
 619                 countries = geo_bypass_context.get('countries')
 620                 if self._GEO_BYPASS and countries:
 621                     country = random.choice(countries)
 622
 623             if country:
 624                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 625                 self._downloader.write_debug(
 626                     f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
 627
 628     def extract(self, url):
 629         """Extracts URL information and returns it in list of dicts."""
 630         try:
 631             for _ in range(2):
 632                 try:
 633                     self.initialize()
 634                     self.write_debug('Extracting URL: %s' % url)
 635                     ie_result = self._real_extract(url)
 636                     if ie_result is None:
 637                         return None
 638                     if self._x_forwarded_for_ip:
 639                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 640                     subtitles = ie_result.get('subtitles')
 641                     if (subtitles and 'live_chat' in subtitles
 642                             and 'no-live-chat' in self.get_param('compat_opts', [])):
 643                         del subtitles['live_chat']
 644                     return ie_result
 645                 except GeoRestrictedError as e:
 646                     if self.__maybe_fake_ip_and_retry(e.countries):
 647                         continue
 648                     raise
 649         except UnsupportedError:
 650             raise
 651         except ExtractorError as e:
 652             kwargs = {
 653                 'video_id': e.video_id or self.get_temp_id(url),
 654                 'ie': self.IE_NAME,
 655                 'tb': e.traceback or sys.exc_info()[2],
 656                 'expected': e.expected,
 657                 'cause': e.cause
 658             }
 659             if hasattr(e, 'countries'):
 660                 kwargs['countries'] = e.countries
 661             raise type(e)(e.orig_msg, **kwargs)
 662         except compat_http_client.IncompleteRead as e:
 663             raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
 664         except (KeyError, StopIteration) as e:
 665             raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 666
 667     def __maybe_fake_ip_and_retry(self, countries):
 668         if (not self.get_param('geo_bypass_country', None)
 669                 and self._GEO_BYPASS
 670                 and self.get_param('geo_bypass', True)
 671                 and not self._x_forwarded_for_ip
 672                 and countries):
 673             country_code = random.choice(countries)
 674             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 675             if self._x_forwarded_for_ip:
 676                 self.report_warning(
 677                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 678                     % (self._x_forwarded_for_ip, country_code.upper()))
 679                 return True
 680         return False
 681
 682     def set_downloader(self, downloader):
 683         """Sets a YoutubeDL instance as the downloader for this IE."""
 684         self._downloader = downloader
 685
 686     def _initialize_pre_login(self):
 687         """ Intialization before login. Redefine in subclasses."""
 688         pass
 689
 690     def _perform_login(self, username, password):
 691         """ Login with username and password. Redefine in subclasses."""
 692         pass
 693
 694     def _real_initialize(self):
 695         """Real initialization process. Redefine in subclasses."""
 696         pass
 697
 698     def _real_extract(self, url):
 699         """Real extraction process. Redefine in subclasses."""
 700         raise NotImplementedError('This method must be implemented by subclasses')
 701
 702     @classmethod
 703     def ie_key(cls):
 704         """A string for getting the InfoExtractor with get_info_extractor"""
 705         return cls.__name__[:-2]
 706
 707     @property
 708     def IE_NAME(self):
 709         return compat_str(type(self).__name__[:-2])
 710
 711     @staticmethod
 712     def __can_accept_status_code(err, expected_status):
 713         assert isinstance(err, compat_urllib_error.HTTPError)
 714         if expected_status is None:
 715             return False
 716         elif callable(expected_status):
 717             return expected_status(err.code) is True
 718         else:
 719             return err.code in variadic(expected_status)
 720
 721     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
 722         """
 723         Return the response handle.
 724
 725         See _download_webpage docstring for arguments specification.
 726         """
 727         if not self._downloader._first_webpage_request:
 728             sleep_interval = self.get_param('sleep_interval_requests') or 0
 729             if sleep_interval > 0:
 730                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 731                 time.sleep(sleep_interval)
 732         else:
 733             self._downloader._first_webpage_request = False
 734
 735         if note is None:
 736             self.report_download_webpage(video_id)
 737         elif note is not False:
 738             if video_id is None:
 739                 self.to_screen(str(note))
 740             else:
 741                 self.to_screen(f'{video_id}: {note}')
 742
 743         # Some sites check X-Forwarded-For HTTP header in order to figure out
 744         # the origin of the client behind proxy. This allows bypassing geo
 745         # restriction by faking this header's value to IP that belongs to some
 746         # geo unrestricted country. We will do so once we encounter any
 747         # geo restriction error.
 748         if self._x_forwarded_for_ip:
 749             if 'X-Forwarded-For' not in headers:
 750                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 751
 752         if isinstance(url_or_request, compat_urllib_request.Request):
 753             url_or_request = update_Request(
 754                 url_or_request, data=data, headers=headers, query=query)
 755         else:
 756             if query:
 757                 url_or_request = update_url_query(url_or_request, query)
 758             if data is not None or headers:
 759                 url_or_request = sanitized_Request(url_or_request, data, headers)
 760         try:
 761             return self._downloader.urlopen(url_or_request)
 762         except network_exceptions as err:
 763             if isinstance(err, compat_urllib_error.HTTPError):
 764                 if self.__can_accept_status_code(err, expected_status):
 765                     # Retain reference to error to prevent file object from
 766                     # being closed before it can be read. Works around the
 767                     # effects of <https://bugs.python.org/issue15002>
 768                     # introduced in Python 3.4.1.
 769                     err.fp._error = err
 770                     return err.fp
 771
 772             if errnote is False:
 773                 return False
 774             if errnote is None:
 775                 errnote = 'Unable to download webpage'
 776
 777             errmsg = f'{errnote}: {error_to_compat_str(err)}'
 778             if fatal:
 779                 raise ExtractorError(errmsg, cause=err)
 780             else:
 781                 self.report_warning(errmsg)
 782                 return False
 783
 784     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
 785         """
 786         Return a tuple (page content as string, URL handle).
 787
 788         See _download_webpage docstring for arguments specification.
 789         """
 790         # Strip hashes from the URL (#1038)
 791         if isinstance(url_or_request, (compat_str, str)):
 792             url_or_request = url_or_request.partition('#')[0]
 793
 794         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 795         if urlh is False:
 796             assert not fatal
 797             return False
 798         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 799         return (content, urlh)
 800
 801     @staticmethod
 802     def _guess_encoding_from_content(content_type, webpage_bytes):
 803         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 804         if m:
 805             encoding = m.group(1)
 806         else:
 807             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 808                           webpage_bytes[:1024])
 809             if m:
 810                 encoding = m.group(1).decode('ascii')
 811             elif webpage_bytes.startswith(b'\xff\xfe'):
 812                 encoding = 'utf-16'
 813             else:
 814                 encoding = 'utf-8'
 815
 816         return encoding
 817
 818     def __check_blocked(self, content):
 819         first_block = content[:512]
 820         if ('<title>Access to this site is blocked</title>' in content
 821                 and 'Websense' in first_block):
 822             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 823             blocked_iframe = self._html_search_regex(
 824                 r'<iframe src="([^"]+)"', content,
 825                 'Websense information URL', default=None)
 826             if blocked_iframe:
 827                 msg += ' Visit %s for more details' % blocked_iframe
 828             raise ExtractorError(msg, expected=True)
 829         if '<title>The URL you requested has been blocked</title>' in first_block:
 830             msg = (
 831                 'Access to this webpage has been blocked by Indian censorship. '
 832                 'Use a VPN or proxy server (with --proxy) to route around it.')
 833             block_msg = self._html_search_regex(
 834                 r'</h1><p>(.*?)</p>',
 835                 content, 'block message', default=None)
 836             if block_msg:
 837                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 838             raise ExtractorError(msg, expected=True)
 839         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 840                 and 'blocklist.rkn.gov.ru' in content):
 841             raise ExtractorError(
 842                 'Access to this webpage has been blocked by decision of the Russian government. '
 843                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 844                 expected=True)
 845
 846     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 847         content_type = urlh.headers.get('Content-Type', '')
 848         webpage_bytes = urlh.read()
 849         if prefix is not None:
 850             webpage_bytes = prefix + webpage_bytes
 851         if not encoding:
 852             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 853         if self.get_param('dump_intermediate_pages', False):
 854             self.to_screen('Dumping request to ' + urlh.geturl())
 855             dump = base64.b64encode(webpage_bytes).decode('ascii')
 856             self._downloader.to_screen(dump)
 857         if self.get_param('write_pages', False):
 858             basen = f'{video_id}_{urlh.geturl()}'
 859             trim_length = self.get_param('trim_file_name') or 240
 860             if len(basen) > trim_length:
 861                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 862                 basen = basen[:trim_length - len(h)] + h
 863             raw_filename = basen + '.dump'
 864             filename = sanitize_filename(raw_filename, restricted=True)
 865             self.to_screen('Saving request to ' + filename)
 866             # Working around MAX_PATH limitation on Windows (see
 867             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 868             if compat_os_name == 'nt':
 869                 absfilepath = os.path.abspath(filename)
 870                 if len(absfilepath) > 259:
 871                     filename = '\\\\?\\' + absfilepath
 872             with open(filename, 'wb') as outf:
 873                 outf.write(webpage_bytes)
 874
 875         try:
 876             content = webpage_bytes.decode(encoding, 'replace')
 877         except LookupError:
 878             content = webpage_bytes.decode('utf-8', 'replace')
 879
 880         self.__check_blocked(content)
 881
 882         return content
 883
 884     def _download_webpage(
 885             self, url_or_request, video_id, note=None, errnote=None,
 886             fatal=True, tries=1, timeout=5, encoding=None, data=None,
 887             headers={}, query={}, expected_status=None):
 888         """
 889         Return the data of the page as a string.
 890
 891         Arguments:
 892         url_or_request -- plain text URL as a string or
 893             a compat_urllib_request.Requestobject
 894         video_id -- Video/playlist/item identifier (string)
 895
 896         Keyword arguments:
 897         note -- note printed before downloading (string)
 898         errnote -- note printed in case of an error (string)
 899         fatal -- flag denoting whether error should be considered fatal,
 900             i.e. whether it should cause ExtractionError to be raised,
 901             otherwise a warning will be reported and extraction continued
 902         tries -- number of tries
 903         timeout -- sleep interval between tries
 904         encoding -- encoding for a page content decoding, guessed automatically
 905             when not explicitly specified
 906         data -- POST data (bytes)
 907         headers -- HTTP headers (dict)
 908         query -- URL query (dict)
 909         expected_status -- allows to accept failed HTTP requests (non 2xx
 910             status code) by explicitly specifying a set of accepted status
 911             codes. Can be any of the following entities:
 912                 - an integer type specifying an exact failed status code to
 913                   accept
 914                 - a list or a tuple of integer types specifying a list of
 915                   failed status codes to accept
 916                 - a callable accepting an actual failed status code and
 917                   returning True if it should be accepted
 918             Note that this argument does not affect success status codes (2xx)
 919             which are always accepted.
 920         """
 921
 922         success = False
 923         try_count = 0
 924         while success is False:
 925             try:
 926                 res = self._download_webpage_handle(
 927                     url_or_request, video_id, note, errnote, fatal,
 928                     encoding=encoding, data=data, headers=headers, query=query,
 929                     expected_status=expected_status)
 930                 success = True
 931             except compat_http_client.IncompleteRead as e:
 932                 try_count += 1
 933                 if try_count >= tries:
 934                     raise e
 935                 self._sleep(timeout, video_id)
 936         if res is False:
 937             return res
 938         else:
 939             content, _ = res
 940             return content
 941
 942     def _download_xml_handle(
 943             self, url_or_request, video_id, note='Downloading XML',
 944             errnote='Unable to download XML', transform_source=None,
 945             fatal=True, encoding=None, data=None, headers={}, query={},
 946             expected_status=None):
 947         """
 948         Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle).
 949
 950         See _download_webpage docstring for arguments specification.
 951         """
 952         res = self._download_webpage_handle(
 953             url_or_request, video_id, note, errnote, fatal=fatal,
 954             encoding=encoding, data=data, headers=headers, query=query,
 955             expected_status=expected_status)
 956         if res is False:
 957             return res
 958         xml_string, urlh = res
 959         return self._parse_xml(
 960             xml_string, video_id, transform_source=transform_source,
 961             fatal=fatal), urlh
 962
 963     def _download_xml(
 964             self, url_or_request, video_id,
 965             note='Downloading XML', errnote='Unable to download XML',
 966             transform_source=None, fatal=True, encoding=None,
 967             data=None, headers={}, query={}, expected_status=None):
 968         """
 969         Return the xml as an xml.etree.ElementTree.Element.
 970
 971         See _download_webpage docstring for arguments specification.
 972         """
 973         res = self._download_xml_handle(
 974             url_or_request, video_id, note=note, errnote=errnote,
 975             transform_source=transform_source, fatal=fatal, encoding=encoding,
 976             data=data, headers=headers, query=query,
 977             expected_status=expected_status)
 978         return res if res is False else res[0]
 979
 980     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 981         if transform_source:
 982             xml_string = transform_source(xml_string)
 983         try:
 984             return compat_etree_fromstring(xml_string.encode('utf-8'))
 985         except xml.etree.ElementTree.ParseError as ve:
 986             errmsg = '%s: Failed to parse XML ' % video_id
 987             if fatal:
 988                 raise ExtractorError(errmsg, cause=ve)
 989             else:
 990                 self.report_warning(errmsg + str(ve))
 991
 992     def _download_json_handle(
 993             self, url_or_request, video_id, note='Downloading JSON metadata',
 994             errnote='Unable to download JSON metadata', transform_source=None,
 995             fatal=True, encoding=None, data=None, headers={}, query={},
 996             expected_status=None):
 997         """
 998         Return a tuple (JSON object, URL handle).
 999
1000         See _download_webpage docstring for arguments specification.
1001         """
1002         res = self._download_webpage_handle(
1003             url_or_request, video_id, note, errnote, fatal=fatal,
1004             encoding=encoding, data=data, headers=headers, query=query,
1005             expected_status=expected_status)
1006         if res is False:
1007             return res
1008         json_string, urlh = res
1009         return self._parse_json(
1010             json_string, video_id, transform_source=transform_source,
1011             fatal=fatal), urlh
1012
1013     def _download_json(
1014             self, url_or_request, video_id, note='Downloading JSON metadata',
1015             errnote='Unable to download JSON metadata', transform_source=None,
1016             fatal=True, encoding=None, data=None, headers={}, query={},
1017             expected_status=None):
1018         """
1019         Return the JSON object as a dict.
1020
1021         See _download_webpage docstring for arguments specification.
1022         """
1023         res = self._download_json_handle(
1024             url_or_request, video_id, note=note, errnote=errnote,
1025             transform_source=transform_source, fatal=fatal, encoding=encoding,
1026             data=data, headers=headers, query=query,
1027             expected_status=expected_status)
1028         return res if res is False else res[0]
1029
1030     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
1031         if transform_source:
1032             json_string = transform_source(json_string)
1033         try:
1034             return json.loads(json_string, strict=False)
1035         except ValueError as ve:
1036             errmsg = '%s: Failed to parse JSON ' % video_id
1037             if fatal:
1038                 raise ExtractorError(errmsg, cause=ve)
1039             else:
1040                 self.report_warning(errmsg + str(ve))
1041
1042     def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
1043         return self._parse_json(
1044             data[data.find('{'):data.rfind('}') + 1],
1045             video_id, transform_source, fatal)
1046
1047     def _download_socket_json_handle(
1048             self, url_or_request, video_id, note='Polling socket',
1049             errnote='Unable to poll socket', transform_source=None,
1050             fatal=True, encoding=None, data=None, headers={}, query={},
1051             expected_status=None):
1052         """
1053         Return a tuple (JSON object, URL handle).
1054
1055         See _download_webpage docstring for arguments specification.
1056         """
1057         res = self._download_webpage_handle(
1058             url_or_request, video_id, note, errnote, fatal=fatal,
1059             encoding=encoding, data=data, headers=headers, query=query,
1060             expected_status=expected_status)
1061         if res is False:
1062             return res
1063         webpage, urlh = res
1064         return self._parse_socket_response_as_json(
1065             webpage, video_id, transform_source=transform_source,
1066             fatal=fatal), urlh
1067
1068     def _download_socket_json(
1069             self, url_or_request, video_id, note='Polling socket',
1070             errnote='Unable to poll socket', transform_source=None,
1071             fatal=True, encoding=None, data=None, headers={}, query={},
1072             expected_status=None):
1073         """
1074         Return the JSON object as a dict.
1075
1076         See _download_webpage docstring for arguments specification.
1077         """
1078         res = self._download_socket_json_handle(
1079             url_or_request, video_id, note=note, errnote=errnote,
1080             transform_source=transform_source, fatal=fatal, encoding=encoding,
1081             data=data, headers=headers, query=query,
1082             expected_status=expected_status)
1083         return res if res is False else res[0]
1084
1085     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1086         idstr = format_field(video_id, template='%s: ')
1087         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1088         if only_once:
1089             if f'WARNING: {msg}' in self._printed_messages:
1090                 return
1091             self._printed_messages.add(f'WARNING: {msg}')
1092         self._downloader.report_warning(msg, *args, **kwargs)
1093
1094     def to_screen(self, msg, *args, **kwargs):
1095         """Print msg to screen, prefixing it with '[ie_name]'"""
1096         self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1097
1098     def write_debug(self, msg, *args, **kwargs):
1099         self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1100
1101     def get_param(self, name, default=None, *args, **kwargs):
1102         if self._downloader:
1103             return self._downloader.params.get(name, default, *args, **kwargs)
1104         return default
1105
1106     def report_drm(self, video_id, partial=False):
1107         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1108
1109     def report_extraction(self, id_or_name):
1110         """Report information extraction."""
1111         self.to_screen('%s: Extracting information' % id_or_name)
1112
1113     def report_download_webpage(self, video_id):
1114         """Report webpage download."""
1115         self.to_screen('%s: Downloading webpage' % video_id)
1116
1117     def report_age_confirmation(self):
1118         """Report attempt to confirm age."""
1119         self.to_screen('Confirming age')
1120
1121     def report_login(self):
1122         """Report attempt to log in."""
1123         self.to_screen('Logging in')
1124
1125     def raise_login_required(
1126             self, msg='This video is only available for registered users',
1127             metadata_available=False, method=NO_DEFAULT):
1128         if metadata_available and (
1129                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1130             self.report_warning(msg)
1131             return
1132         if method is NO_DEFAULT:
1133             method = 'any' if self.supports_login() else 'cookies'
1134         if method is not None:
1135             assert method in self._LOGIN_HINTS, 'Invalid login method'
1136             msg = f'{msg}. {self._LOGIN_HINTS[method]}'
1137         raise ExtractorError(msg, expected=True)
1138
1139     def raise_geo_restricted(
1140             self, msg='This video is not available from your location due to geo restriction',
1141             countries=None, metadata_available=False):
1142         if metadata_available and (
1143                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1144             self.report_warning(msg)
1145         else:
1146             raise GeoRestrictedError(msg, countries=countries)
1147
1148     def raise_no_formats(self, msg, expected=False, video_id=None):
1149         if expected and (
1150                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1151             self.report_warning(msg, video_id)
1152         elif isinstance(msg, ExtractorError):
1153             raise msg
1154         else:
1155             raise ExtractorError(msg, expected=expected, video_id=video_id)
1156
1157     # Methods for following #608
1158     @staticmethod
1159     def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1160         """Returns a URL that points to a page that should be processed"""
1161         if ie is not None:
1162             kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1163         if video_id is not None:
1164             kwargs['id'] = video_id
1165         if video_title is not None:
1166             kwargs['title'] = video_title
1167         return {
1168             **kwargs,
1169             '_type': 'url_transparent' if url_transparent else 'url',
1170             'url': url,
1171         }
1172
1173     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None, video_kwargs=None, **kwargs):
1174         urls = (self.url_result(self._proto_relative_url(m), ie, **(video_kwargs or {}))
1175                 for m in orderedSet(map(getter, matches) if getter else matches))
1176         return self.playlist_result(urls, playlist_id, playlist_title, **kwargs)
1177
1178     @staticmethod
1179     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1180         """Returns a playlist"""
1181         if playlist_id:
1182             kwargs['id'] = playlist_id
1183         if playlist_title:
1184             kwargs['title'] = playlist_title
1185         if playlist_description is not None:
1186             kwargs['description'] = playlist_description
1187         return {
1188             **kwargs,
1189             '_type': 'multi_video' if multi_video else 'playlist',
1190             'entries': entries,
1191         }
1192
1193     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1194         """
1195         Perform a regex search on the given string, using a single or a list of
1196         patterns returning the first matching group.
1197         In case of failure return a default value or raise a WARNING or a
1198         RegexNotFoundError, depending on fatal, specifying the field name.
1199         """
1200         if string is None:
1201             mobj = None
1202         elif isinstance(pattern, (str, re.Pattern)):
1203             mobj = re.search(pattern, string, flags)
1204         else:
1205             for p in pattern:
1206                 mobj = re.search(p, string, flags)
1207                 if mobj:
1208                     break
1209
1210         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1211
1212         if mobj:
1213             if group is None:
1214                 # return the first matching group
1215                 return next(g for g in mobj.groups() if g is not None)
1216             elif isinstance(group, (list, tuple)):
1217                 return tuple(mobj.group(g) for g in group)
1218             else:
1219                 return mobj.group(group)
1220         elif default is not NO_DEFAULT:
1221             return default
1222         elif fatal:
1223             raise RegexNotFoundError('Unable to extract %s' % _name)
1224         else:
1225             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1226             return None
1227
1228     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1229         """
1230         Like _search_regex, but strips HTML tags and unescapes entities.
1231         """
1232         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1233         if res:
1234             return clean_html(res).strip()
1235         else:
1236             return res
1237
1238     def _get_netrc_login_info(self, netrc_machine=None):
1239         username = None
1240         password = None
1241         netrc_machine = netrc_machine or self._NETRC_MACHINE
1242
1243         if self.get_param('usenetrc', False):
1244             try:
1245                 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1246                 if os.path.isdir(netrc_file):
1247                     netrc_file = os.path.join(netrc_file, '.netrc')
1248                 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1249                 if info is not None:
1250                     username = info[0]
1251                     password = info[2]
1252                 else:
1253                     raise netrc.NetrcParseError(
1254                         'No authenticators for %s' % netrc_machine)
1255             except (OSError, netrc.NetrcParseError) as err:
1256                 self.report_warning(
1257                     'parsing .netrc: %s' % error_to_compat_str(err))
1258
1259         return username, password
1260
1261     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1262         """
1263         Get the login info as (username, password)
1264         First look for the manually specified credentials using username_option
1265         and password_option as keys in params dictionary. If no such credentials
1266         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1267         value.
1268         If there's no info available, return (None, None)
1269         """
1270
1271         # Attempt to use provided username and password or .netrc data
1272         username = self.get_param(username_option)
1273         if username is not None:
1274             password = self.get_param(password_option)
1275         else:
1276             username, password = self._get_netrc_login_info(netrc_machine)
1277
1278         return username, password
1279
1280     def _get_tfa_info(self, note='two-factor verification code'):
1281         """
1282         Get the two-factor authentication info
1283         TODO - asking the user will be required for sms/phone verify
1284         currently just uses the command line option
1285         If there's no info available, return None
1286         """
1287
1288         tfa = self.get_param('twofactor')
1289         if tfa is not None:
1290             return tfa
1291
1292         return compat_getpass('Type %s and press [Return]: ' % note)
1293
1294     # Helper functions for extracting OpenGraph info
1295     @staticmethod
1296     def _og_regexes(prop):
1297         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1298         property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1299                        % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
1300         template = r'<meta[^>]+?%s[^>]+?%s'
1301         return [
1302             template % (property_re, content_re),
1303             template % (content_re, property_re),
1304         ]
1305
1306     @staticmethod
1307     def _meta_regex(prop):
1308         return r'''(?isx)<meta
1309                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1310                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1311
1312     def _og_search_property(self, prop, html, name=None, **kargs):
1313         prop = variadic(prop)
1314         if name is None:
1315             name = 'OpenGraph %s' % prop[0]
1316         og_regexes = []
1317         for p in prop:
1318             og_regexes.extend(self._og_regexes(p))
1319         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1320         if escaped is None:
1321             return None
1322         return unescapeHTML(escaped)
1323
1324     def _og_search_thumbnail(self, html, **kargs):
1325         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1326
1327     def _og_search_description(self, html, **kargs):
1328         return self._og_search_property('description', html, fatal=False, **kargs)
1329
1330     def _og_search_title(self, html, *, fatal=False, **kargs):
1331         return self._og_search_property('title', html, fatal=fatal, **kargs)
1332
1333     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1334         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1335         if secure:
1336             regexes = self._og_regexes('video:secure_url') + regexes
1337         return self._html_search_regex(regexes, html, name, **kargs)
1338
1339     def _og_search_url(self, html, **kargs):
1340         return self._og_search_property('url', html, **kargs)
1341
1342     def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1343         return self._html_search_regex(r'(?s)<title>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1344
1345     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1346         name = variadic(name)
1347         if display_name is None:
1348             display_name = name[0]
1349         return self._html_search_regex(
1350             [self._meta_regex(n) for n in name],
1351             html, display_name, fatal=fatal, group='content', **kwargs)
1352
1353     def _dc_search_uploader(self, html):
1354         return self._html_search_meta('dc.creator', html, 'uploader')
1355
1356     def _rta_search(self, html):
1357         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1358         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1359                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1360                      html):
1361             return 18
1362         return 0
1363
1364     def _media_rating_search(self, html):
1365         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1366         rating = self._html_search_meta('rating', html)
1367
1368         if not rating:
1369             return None
1370
1371         RATING_TABLE = {
1372             'safe for kids': 0,
1373             'general': 8,
1374             '14 years': 14,
1375             'mature': 17,
1376             'restricted': 19,
1377         }
1378         return RATING_TABLE.get(rating.lower())
1379
1380     def _family_friendly_search(self, html):
1381         # See http://schema.org/VideoObject
1382         family_friendly = self._html_search_meta(
1383             'isFamilyFriendly', html, default=None)
1384
1385         if not family_friendly:
1386             return None
1387
1388         RATING_TABLE = {
1389             '1': 0,
1390             'true': 0,
1391             '0': 18,
1392             'false': 18,
1393         }
1394         return RATING_TABLE.get(family_friendly.lower())
1395
1396     def _twitter_search_player(self, html):
1397         return self._html_search_meta('twitter:player', html,
1398                                       'twitter card player')
1399
1400     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1401         json_ld_list = list(re.finditer(JSON_LD_RE, html))
1402         default = kwargs.get('default', NO_DEFAULT)
1403         # JSON-LD may be malformed and thus `fatal` should be respected.
1404         # At the same time `default` may be passed that assumes `fatal=False`
1405         # for _search_regex. Let's simulate the same behavior here as well.
1406         fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
1407         json_ld = []
1408         for mobj in json_ld_list:
1409             json_ld_item = self._parse_json(
1410                 mobj.group('json_ld'), video_id, fatal=fatal)
1411             if not json_ld_item:
1412                 continue
1413             if isinstance(json_ld_item, dict):
1414                 json_ld.append(json_ld_item)
1415             elif isinstance(json_ld_item, (list, tuple)):
1416                 json_ld.extend(json_ld_item)
1417         if json_ld:
1418             json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1419         if json_ld:
1420             return json_ld
1421         if default is not NO_DEFAULT:
1422             return default
1423         elif fatal:
1424             raise RegexNotFoundError('Unable to extract JSON-LD')
1425         else:
1426             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1427             return {}
1428
1429     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1430         if isinstance(json_ld, compat_str):
1431             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1432         if not json_ld:
1433             return {}
1434         info = {}
1435         if not isinstance(json_ld, (list, tuple, dict)):
1436             return info
1437         if isinstance(json_ld, dict):
1438             json_ld = [json_ld]
1439
1440         INTERACTION_TYPE_MAP = {
1441             'CommentAction': 'comment',
1442             'AgreeAction': 'like',
1443             'DisagreeAction': 'dislike',
1444             'LikeAction': 'like',
1445             'DislikeAction': 'dislike',
1446             'ListenAction': 'view',
1447             'WatchAction': 'view',
1448             'ViewAction': 'view',
1449         }
1450
1451         def extract_interaction_type(e):
1452             interaction_type = e.get('interactionType')
1453             if isinstance(interaction_type, dict):
1454                 interaction_type = interaction_type.get('@type')
1455             return str_or_none(interaction_type)
1456
1457         def extract_interaction_statistic(e):
1458             interaction_statistic = e.get('interactionStatistic')
1459             if isinstance(interaction_statistic, dict):
1460                 interaction_statistic = [interaction_statistic]
1461             if not isinstance(interaction_statistic, list):
1462                 return
1463             for is_e in interaction_statistic:
1464                 if not isinstance(is_e, dict):
1465                     continue
1466                 if is_e.get('@type') != 'InteractionCounter':
1467                     continue
1468                 interaction_type = extract_interaction_type(is_e)
1469                 if not interaction_type:
1470                     continue
1471                 # For interaction count some sites provide string instead of
1472                 # an integer (as per spec) with non digit characters (e.g. ",")
1473                 # so extracting count with more relaxed str_to_int
1474                 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1475                 if interaction_count is None:
1476                     continue
1477                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1478                 if not count_kind:
1479                     continue
1480                 count_key = '%s_count' % count_kind
1481                 if info.get(count_key) is not None:
1482                     continue
1483                 info[count_key] = interaction_count
1484
1485         def extract_chapter_information(e):
1486             chapters = [{
1487                 'title': part.get('name'),
1488                 'start_time': part.get('startOffset'),
1489                 'end_time': part.get('endOffset'),
1490             } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
1491             for idx, (last_c, current_c, next_c) in enumerate(zip(
1492                     [{'end_time': 0}] + chapters, chapters, chapters[1:])):
1493                 current_c['end_time'] = current_c['end_time'] or next_c['start_time']
1494                 current_c['start_time'] = current_c['start_time'] or last_c['end_time']
1495                 if None in current_c.values():
1496                     self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
1497                     return
1498             if chapters:
1499                 chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
1500                 info['chapters'] = chapters
1501
1502         def extract_video_object(e):
1503             assert e['@type'] == 'VideoObject'
1504             author = e.get('author')
1505             info.update({
1506                 'url': url_or_none(e.get('contentUrl')),
1507                 'title': unescapeHTML(e.get('name')),
1508                 'description': unescapeHTML(e.get('description')),
1509                 'thumbnails': [{'url': url_or_none(url)}
1510                                for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))],
1511                 'duration': parse_duration(e.get('duration')),
1512                 'timestamp': unified_timestamp(e.get('uploadDate')),
1513                 # author can be an instance of 'Organization' or 'Person' types.
1514                 # both types can have 'name' property(inherited from 'Thing' type). [1]
1515                 # however some websites are using 'Text' type instead.
1516                 # 1. https://schema.org/VideoObject
1517                 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
1518                 'filesize': float_or_none(e.get('contentSize')),
1519                 'tbr': int_or_none(e.get('bitrate')),
1520                 'width': int_or_none(e.get('width')),
1521                 'height': int_or_none(e.get('height')),
1522                 'view_count': int_or_none(e.get('interactionCount')),
1523             })
1524             extract_interaction_statistic(e)
1525             extract_chapter_information(e)
1526
1527         def traverse_json_ld(json_ld, at_top_level=True):
1528             for e in json_ld:
1529                 if at_top_level and '@context' not in e:
1530                     continue
1531                 if at_top_level and set(e.keys()) == {'@context', '@graph'}:
1532                     traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
1533                     break
1534                 item_type = e.get('@type')
1535                 if expected_type is not None and expected_type != item_type:
1536                     continue
1537                 rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
1538                 if rating is not None:
1539                     info['average_rating'] = rating
1540                 if item_type in ('TVEpisode', 'Episode'):
1541                     episode_name = unescapeHTML(e.get('name'))
1542                     info.update({
1543                         'episode': episode_name,
1544                         'episode_number': int_or_none(e.get('episodeNumber')),
1545                         'description': unescapeHTML(e.get('description')),
1546                     })
1547                     if not info.get('title') and episode_name:
1548                         info['title'] = episode_name
1549                     part_of_season = e.get('partOfSeason')
1550                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1551                         info.update({
1552                             'season': unescapeHTML(part_of_season.get('name')),
1553                             'season_number': int_or_none(part_of_season.get('seasonNumber')),
1554                         })
1555                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1556                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1557                         info['series'] = unescapeHTML(part_of_series.get('name'))
1558                 elif item_type == 'Movie':
1559                     info.update({
1560                         'title': unescapeHTML(e.get('name')),
1561                         'description': unescapeHTML(e.get('description')),
1562                         'duration': parse_duration(e.get('duration')),
1563                         'timestamp': unified_timestamp(e.get('dateCreated')),
1564                     })
1565                 elif item_type in ('Article', 'NewsArticle'):
1566                     info.update({
1567                         'timestamp': parse_iso8601(e.get('datePublished')),
1568                         'title': unescapeHTML(e.get('headline')),
1569                         'description': unescapeHTML(e.get('articleBody') or e.get('description')),
1570                     })
1571                     if traverse_obj(e, ('video', 0, '@type')) == 'VideoObject':
1572                         extract_video_object(e['video'][0])
1573                 elif item_type == 'VideoObject':
1574                     extract_video_object(e)
1575                     if expected_type is None:
1576                         continue
1577                     else:
1578                         break
1579                 video = e.get('video')
1580                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1581                     extract_video_object(video)
1582                 if expected_type is None:
1583                     continue
1584                 else:
1585                     break
1586         traverse_json_ld(json_ld)
1587
1588         return filter_dict(info)
1589
1590     def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1591         return self._parse_json(
1592             self._search_regex(
1593                 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1594                 webpage, 'next.js data', fatal=fatal, **kw),
1595             video_id, transform_source=transform_source, fatal=fatal)
1596
1597     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__'):
1598         ''' Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function. '''
1599         # not all website do this, but it can be changed
1600         # https://stackoverflow.com/questions/67463109/how-to-change-or-hide-nuxt-and-nuxt-keyword-in-page-source
1601         rectx = re.escape(context_name)
1602         js, arg_keys, arg_vals = self._search_regex(
1603             (r'<script>window\.%s=\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.+?)\)\);?</script>' % rectx,
1604              r'%s\(.*?\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)' % rectx),
1605             webpage, context_name, group=['js', 'arg_keys', 'arg_vals'])
1606
1607         args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
1608
1609         for key, val in args.items():
1610             if val in ('undefined', 'void 0'):
1611                 args[key] = 'null'
1612
1613         return self._parse_json(js_to_json(js, args), video_id)['data'][0]
1614
1615     @staticmethod
1616     def _hidden_inputs(html):
1617         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1618         hidden_inputs = {}
1619         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1620             attrs = extract_attributes(input)
1621             if not input:
1622                 continue
1623             if attrs.get('type') not in ('hidden', 'submit'):
1624                 continue
1625             name = attrs.get('name') or attrs.get('id')
1626             value = attrs.get('value')
1627             if name and value is not None:
1628                 hidden_inputs[name] = value
1629         return hidden_inputs
1630
1631     def _form_hidden_inputs(self, form_id, html):
1632         form = self._search_regex(
1633             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1634             html, '%s form' % form_id, group='form')
1635         return self._hidden_inputs(form)
1636
1637     class FormatSort:
1638         regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
1639
1640         default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
1641                    'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
1642                    'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
1643         ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
1644                         'height', 'width', 'proto', 'vext', 'abr', 'aext',
1645                         'fps', 'fs_approx', 'source', 'id')
1646
1647         settings = {
1648             'vcodec': {'type': 'ordered', 'regex': True,
1649                        'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1650             'acodec': {'type': 'ordered', 'regex': True,
1651                        'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
1652             'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
1653                     'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
1654             'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1655                       'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
1656             'vext': {'type': 'ordered', 'field': 'video_ext',
1657                      'order': ('mp4', 'webm', 'flv', '', 'none'),
1658                      'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1659             'aext': {'type': 'ordered', 'field': 'audio_ext',
1660                      'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1661                      'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1662             'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1663             'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
1664                            'field': ('vcodec', 'acodec'),
1665                            'function': lambda it: int(any(v != 'none' for v in it))},
1666             'ie_pref': {'priority': True, 'type': 'extractor'},
1667             'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1668             'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1669             'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
1670             'quality': {'convert': 'float', 'default': -1},
1671             'filesize': {'convert': 'bytes'},
1672             'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1673             'id': {'convert': 'string', 'field': 'format_id'},
1674             'height': {'convert': 'float_none'},
1675             'width': {'convert': 'float_none'},
1676             'fps': {'convert': 'float_none'},
1677             'tbr': {'convert': 'float_none'},
1678             'vbr': {'convert': 'float_none'},
1679             'abr': {'convert': 'float_none'},
1680             'asr': {'convert': 'float_none'},
1681             'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
1682
1683             'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1684             'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1685             'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1686             'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1687             'res': {'type': 'multiple', 'field': ('height', 'width'),
1688                     'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
1689
1690             # For compatibility with youtube-dl
1691             'format_id': {'type': 'alias', 'field': 'id'},
1692             'preference': {'type': 'alias', 'field': 'ie_pref'},
1693             'language_preference': {'type': 'alias', 'field': 'lang'},
1694             'source_preference': {'type': 'alias', 'field': 'source'},
1695             'protocol': {'type': 'alias', 'field': 'proto'},
1696             'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1697
1698             # Deprecated
1699             'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
1700             'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
1701             'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
1702             'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
1703             'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
1704             'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
1705             'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
1706             'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
1707             'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
1708             'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
1709             'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
1710             'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
1711             'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
1712             'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
1713             'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
1714             'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
1715             'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
1716             'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
1717             'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
1718             'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
1719         }
1720
1721         def __init__(self, ie, field_preference):
1722             self._order = []
1723             self.ydl = ie._downloader
1724             self.evaluate_params(self.ydl.params, field_preference)
1725             if ie.get_param('verbose'):
1726                 self.print_verbose_info(self.ydl.write_debug)
1727
1728         def _get_field_setting(self, field, key):
1729             if field not in self.settings:
1730                 if key in ('forced', 'priority'):
1731                     return False
1732                 self.ydl.deprecation_warning(
1733                     f'Using arbitrary fields ({field}) for format sorting is deprecated '
1734                     'and may be removed in a future version')
1735                 self.settings[field] = {}
1736             propObj = self.settings[field]
1737             if key not in propObj:
1738                 type = propObj.get('type')
1739                 if key == 'field':
1740                     default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1741                 elif key == 'convert':
1742                     default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1743                 else:
1744                     default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1745                 propObj[key] = default
1746             return propObj[key]
1747
1748         def _resolve_field_value(self, field, value, convertNone=False):
1749             if value is None:
1750                 if not convertNone:
1751                     return None
1752             else:
1753                 value = value.lower()
1754             conversion = self._get_field_setting(field, 'convert')
1755             if conversion == 'ignore':
1756                 return None
1757             if conversion == 'string':
1758                 return value
1759             elif conversion == 'float_none':
1760                 return float_or_none(value)
1761             elif conversion == 'bytes':
1762                 return FileDownloader.parse_bytes(value)
1763             elif conversion == 'order':
1764                 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1765                 use_regex = self._get_field_setting(field, 'regex')
1766                 list_length = len(order_list)
1767                 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1768                 if use_regex and value is not None:
1769                     for i, regex in enumerate(order_list):
1770                         if regex and re.match(regex, value):
1771                             return list_length - i
1772                     return list_length - empty_pos  # not in list
1773                 else:  # not regex or  value = None
1774                     return list_length - (order_list.index(value) if value in order_list else empty_pos)
1775             else:
1776                 if value.isnumeric():
1777                     return float(value)
1778                 else:
1779                     self.settings[field]['convert'] = 'string'
1780                     return value
1781
1782         def evaluate_params(self, params, sort_extractor):
1783             self._use_free_order = params.get('prefer_free_formats', False)
1784             self._sort_user = params.get('format_sort', [])
1785             self._sort_extractor = sort_extractor
1786
1787             def add_item(field, reverse, closest, limit_text):
1788                 field = field.lower()
1789                 if field in self._order:
1790                     return
1791                 self._order.append(field)
1792                 limit = self._resolve_field_value(field, limit_text)
1793                 data = {
1794                     'reverse': reverse,
1795                     'closest': False if limit is None else closest,
1796                     'limit_text': limit_text,
1797                     'limit': limit}
1798                 if field in self.settings:
1799                     self.settings[field].update(data)
1800                 else:
1801                     self.settings[field] = data
1802
1803             sort_list = (
1804                 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1805                 + (tuple() if params.get('format_sort_force', False)
1806                    else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1807                 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1808
1809             for item in sort_list:
1810                 match = re.match(self.regex, item)
1811                 if match is None:
1812                     raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1813                 field = match.group('field')
1814                 if field is None:
1815                     continue
1816                 if self._get_field_setting(field, 'type') == 'alias':
1817                     alias, field = field, self._get_field_setting(field, 'field')
1818                     if self._get_field_setting(alias, 'deprecated'):
1819                         self.ydl.deprecation_warning(
1820                             f'Format sorting alias {alias} is deprecated '
1821                             f'and may be removed in a future version. Please use {field} instead')
1822                 reverse = match.group('reverse') is not None
1823                 closest = match.group('separator') == '~'
1824                 limit_text = match.group('limit')
1825
1826                 has_limit = limit_text is not None
1827                 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1828                 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1829
1830                 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1831                 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
1832                 limit_count = len(limits)
1833                 for (i, f) in enumerate(fields):
1834                     add_item(f, reverse, closest,
1835                              limits[i] if i < limit_count
1836                              else limits[0] if has_limit and not has_multiple_limits
1837                              else None)
1838
1839         def print_verbose_info(self, write_debug):
1840             if self._sort_user:
1841                 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1842             if self._sort_extractor:
1843                 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1844             write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1845                 '+' if self._get_field_setting(field, 'reverse') else '', field,
1846                 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1847                               self._get_field_setting(field, 'limit_text'),
1848                               self._get_field_setting(field, 'limit'))
1849                 if self._get_field_setting(field, 'limit_text') is not None else '')
1850                 for field in self._order if self._get_field_setting(field, 'visible')]))
1851
1852         def _calculate_field_preference_from_value(self, format, field, type, value):
1853             reverse = self._get_field_setting(field, 'reverse')
1854             closest = self._get_field_setting(field, 'closest')
1855             limit = self._get_field_setting(field, 'limit')
1856
1857             if type == 'extractor':
1858                 maximum = self._get_field_setting(field, 'max')
1859                 if value is None or (maximum is not None and value >= maximum):
1860                     value = -1
1861             elif type == 'boolean':
1862                 in_list = self._get_field_setting(field, 'in_list')
1863                 not_in_list = self._get_field_setting(field, 'not_in_list')
1864                 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1865             elif type == 'ordered':
1866                 value = self._resolve_field_value(field, value, True)
1867
1868             # try to convert to number
1869             val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1870             is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1871             if is_num:
1872                 value = val_num
1873
1874             return ((-10, 0) if value is None
1875                     else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
1876                     else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1877                     else (0, value, 0) if not reverse and (limit is None or value <= limit)
1878                     else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1879                     else (-1, value, 0))
1880
1881         def _calculate_field_preference(self, format, field):
1882             type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
1883             get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1884             if type == 'multiple':
1885                 type = 'field'  # Only 'field' is allowed in multiple for now
1886                 actual_fields = self._get_field_setting(field, 'field')
1887
1888                 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1889             else:
1890                 value = get_value(field)
1891             return self._calculate_field_preference_from_value(format, field, type, value)
1892
1893         def calculate_preference(self, format):
1894             # Determine missing protocol
1895             if not format.get('protocol'):
1896                 format['protocol'] = determine_protocol(format)
1897
1898             # Determine missing ext
1899             if not format.get('ext') and 'url' in format:
1900                 format['ext'] = determine_ext(format['url'])
1901             if format.get('vcodec') == 'none':
1902                 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1903                 format['video_ext'] = 'none'
1904             else:
1905                 format['video_ext'] = format['ext']
1906                 format['audio_ext'] = 'none'
1907             # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
1908             #    format['preference'] = -1000
1909
1910             # Determine missing bitrates
1911             if format.get('tbr') is None:
1912                 if format.get('vbr') is not None and format.get('abr') is not None:
1913                     format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1914             else:
1915                 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1916                     format['vbr'] = format.get('tbr') - format.get('abr', 0)
1917                 if format.get('acodec') != 'none' and format.get('abr') is None:
1918                     format['abr'] = format.get('tbr') - format.get('vbr', 0)
1919
1920             return tuple(self._calculate_field_preference(format, field) for field in self._order)
1921
1922     def _sort_formats(self, formats, field_preference=[]):
1923         if not formats:
1924             return
1925         formats.sort(key=self.FormatSort(self, field_preference).calculate_preference)
1926
1927     def _check_formats(self, formats, video_id):
1928         if formats:
1929             formats[:] = filter(
1930                 lambda f: self._is_valid_url(
1931                     f['url'], video_id,
1932                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1933                 formats)
1934
1935     @staticmethod
1936     def _remove_duplicate_formats(formats):
1937         format_urls = set()
1938         unique_formats = []
1939         for f in formats:
1940             if f['url'] not in format_urls:
1941                 format_urls.add(f['url'])
1942                 unique_formats.append(f)
1943         formats[:] = unique_formats
1944
1945     def _is_valid_url(self, url, video_id, item='video', headers={}):
1946         url = self._proto_relative_url(url, scheme='http:')
1947         # For now assume non HTTP(S) URLs always valid
1948         if not (url.startswith('http://') or url.startswith('https://')):
1949             return True
1950         try:
1951             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1952             return True
1953         except ExtractorError as e:
1954             self.to_screen(
1955                 '%s: %s URL is invalid, skipping: %s'
1956                 % (video_id, item, error_to_compat_str(e.cause)))
1957             return False
1958
1959     def http_scheme(self):
1960         """ Either "http:" or "https:", depending on the user's preferences """
1961         return (
1962             'http:'
1963             if self.get_param('prefer_insecure', False)
1964             else 'https:')
1965
1966     def _proto_relative_url(self, url, scheme=None):
1967         if url is None:
1968             return url
1969         if url.startswith('//'):
1970             if scheme is None:
1971                 scheme = self.http_scheme()
1972             return scheme + url
1973         else:
1974             return url
1975
1976     def _sleep(self, timeout, video_id, msg_template=None):
1977         if msg_template is None:
1978             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1979         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1980         self.to_screen(msg)
1981         time.sleep(timeout)
1982
1983     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1984                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1985                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1986         res = self._download_xml_handle(
1987             manifest_url, video_id, 'Downloading f4m manifest',
1988             'Unable to download f4m manifest',
1989             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1990             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1991             transform_source=transform_source,
1992             fatal=fatal, data=data, headers=headers, query=query)
1993         if res is False:
1994             return []
1995
1996         manifest, urlh = res
1997         manifest_url = urlh.geturl()
1998
1999         return self._parse_f4m_formats(
2000             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2001             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
2002
2003     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
2004                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
2005                            fatal=True, m3u8_id=None):
2006         if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
2007             return []
2008
2009         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
2010         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
2011         if akamai_pv is not None and ';' in akamai_pv.text:
2012             playerVerificationChallenge = akamai_pv.text.split(';')[0]
2013             if playerVerificationChallenge.strip() != '':
2014                 return []
2015
2016         formats = []
2017         manifest_version = '1.0'
2018         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
2019         if not media_nodes:
2020             manifest_version = '2.0'
2021             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
2022         # Remove unsupported DRM protected media from final formats
2023         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
2024         media_nodes = remove_encrypted_media(media_nodes)
2025         if not media_nodes:
2026             return formats
2027
2028         manifest_base_url = get_base_url(manifest)
2029
2030         bootstrap_info = xpath_element(
2031             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
2032             'bootstrap info', default=None)
2033
2034         vcodec = None
2035         mime_type = xpath_text(
2036             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
2037             'base URL', default=None)
2038         if mime_type and mime_type.startswith('audio/'):
2039             vcodec = 'none'
2040
2041         for i, media_el in enumerate(media_nodes):
2042             tbr = int_or_none(media_el.attrib.get('bitrate'))
2043             width = int_or_none(media_el.attrib.get('width'))
2044             height = int_or_none(media_el.attrib.get('height'))
2045             format_id = join_nonempty(f4m_id, tbr or i)
2046             # If <bootstrapInfo> is present, the specified f4m is a
2047             # stream-level manifest, and only set-level manifests may refer to
2048             # external resources.  See section 11.4 and section 4 of F4M spec
2049             if bootstrap_info is None:
2050                 media_url = None
2051                 # @href is introduced in 2.0, see section 11.6 of F4M spec
2052                 if manifest_version == '2.0':
2053                     media_url = media_el.attrib.get('href')
2054                 if media_url is None:
2055                     media_url = media_el.attrib.get('url')
2056                 if not media_url:
2057                     continue
2058                 manifest_url = (
2059                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
2060                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
2061                 # If media_url is itself a f4m manifest do the recursive extraction
2062                 # since bitrates in parent manifest (this one) and media_url manifest
2063                 # may differ leading to inability to resolve the format by requested
2064                 # bitrate in f4m downloader
2065                 ext = determine_ext(manifest_url)
2066                 if ext == 'f4m':
2067                     f4m_formats = self._extract_f4m_formats(
2068                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2069                         transform_source=transform_source, fatal=fatal)
2070                     # Sometimes stream-level manifest contains single media entry that
2071                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
2072                     # At the same time parent's media entry in set-level manifest may
2073                     # contain it. We will copy it from parent in such cases.
2074                     if len(f4m_formats) == 1:
2075                         f = f4m_formats[0]
2076                         f.update({
2077                             'tbr': f.get('tbr') or tbr,
2078                             'width': f.get('width') or width,
2079                             'height': f.get('height') or height,
2080                             'format_id': f.get('format_id') if not tbr else format_id,
2081                             'vcodec': vcodec,
2082                         })
2083                     formats.extend(f4m_formats)
2084                     continue
2085                 elif ext == 'm3u8':
2086                     formats.extend(self._extract_m3u8_formats(
2087                         manifest_url, video_id, 'mp4', preference=preference,
2088                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
2089                     continue
2090             formats.append({
2091                 'format_id': format_id,
2092                 'url': manifest_url,
2093                 'manifest_url': manifest_url,
2094                 'ext': 'flv' if bootstrap_info is not None else None,
2095                 'protocol': 'f4m',
2096                 'tbr': tbr,
2097                 'width': width,
2098                 'height': height,
2099                 'vcodec': vcodec,
2100                 'preference': preference,
2101                 'quality': quality,
2102             })
2103         return formats
2104
2105     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
2106         return {
2107             'format_id': join_nonempty(m3u8_id, 'meta'),
2108             'url': m3u8_url,
2109             'ext': ext,
2110             'protocol': 'm3u8',
2111             'preference': preference - 100 if preference else -100,
2112             'quality': quality,
2113             'resolution': 'multiple',
2114             'format_note': 'Quality selection URL',
2115         }
2116
2117     def _report_ignoring_subs(self, name):
2118         self.report_warning(bug_reports_message(
2119             f'Ignoring subtitle tracks found in the {name} manifest; '
2120             'if any subtitle tracks are missing,'
2121         ), only_once=True)
2122
2123     def _extract_m3u8_formats(self, *args, **kwargs):
2124         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2125         if subs:
2126             self._report_ignoring_subs('HLS')
2127         return fmts
2128
2129     def _extract_m3u8_formats_and_subtitles(
2130             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2131             preference=None, quality=None, m3u8_id=None, note=None,
2132             errnote=None, fatal=True, live=False, data=None, headers={},
2133             query={}):
2134
2135         res = self._download_webpage_handle(
2136             m3u8_url, video_id,
2137             note='Downloading m3u8 information' if note is None else note,
2138             errnote='Failed to download m3u8 information' if errnote is None else errnote,
2139             fatal=fatal, data=data, headers=headers, query=query)
2140
2141         if res is False:
2142             return [], {}
2143
2144         m3u8_doc, urlh = res
2145         m3u8_url = urlh.geturl()
2146
2147         return self._parse_m3u8_formats_and_subtitles(
2148             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2149             preference=preference, quality=quality, m3u8_id=m3u8_id,
2150             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2151             headers=headers, query=query, video_id=video_id)
2152
2153     def _parse_m3u8_formats_and_subtitles(
2154             self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
2155             preference=None, quality=None, m3u8_id=None, live=False, note=None,
2156             errnote=None, fatal=True, data=None, headers={}, query={},
2157             video_id=None):
2158         formats, subtitles = [], {}
2159
2160         has_drm = re.search('|'.join([
2161             r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
2162             r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
2163         ]), m3u8_doc)
2164
2165         def format_url(url):
2166             return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
2167
2168         if self.get_param('hls_split_discontinuity', False):
2169             def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2170                 if not m3u8_doc:
2171                     if not manifest_url:
2172                         return []
2173                     m3u8_doc = self._download_webpage(
2174                         manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2175                         note=False, errnote='Failed to download m3u8 playlist information')
2176                     if m3u8_doc is False:
2177                         return []
2178                 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2179
2180         else:
2181             def _extract_m3u8_playlist_indices(*args, **kwargs):
2182                 return [None]
2183
2184         # References:
2185         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2186         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2187         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2188
2189         # We should try extracting formats only from master playlists [1, 4.3.4],
2190         # i.e. playlists that describe available qualities. On the other hand
2191         # media playlists [1, 4.3.3] should be returned as is since they contain
2192         # just the media without qualities renditions.
2193         # Fortunately, master playlist can be easily distinguished from media
2194         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
2195         # master playlist tags MUST NOT appear in a media playlist and vice versa.
2196         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2197         # media playlist and MUST NOT appear in master playlist thus we can
2198         # clearly detect media playlist with this criterion.
2199
2200         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
2201             formats = [{
2202                 'format_id': join_nonempty(m3u8_id, idx),
2203                 'format_index': idx,
2204                 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
2205                 'ext': ext,
2206                 'protocol': entry_protocol,
2207                 'preference': preference,
2208                 'quality': quality,
2209                 'has_drm': has_drm,
2210             } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2211
2212             return formats, subtitles
2213
2214         groups = {}
2215         last_stream_inf = {}
2216
2217         def extract_media(x_media_line):
2218             media = parse_m3u8_attributes(x_media_line)
2219             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2220             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2221             if not (media_type and group_id and name):
2222                 return
2223             groups.setdefault(group_id, []).append(media)
2224             # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2225             if media_type == 'SUBTITLES':
2226                 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2227                 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2228                 # However, lack of URI has been spotted in the wild.
2229                 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2230                 if not media.get('URI'):
2231                     return
2232                 url = format_url(media['URI'])
2233                 sub_info = {
2234                     'url': url,
2235                     'ext': determine_ext(url),
2236                 }
2237                 if sub_info['ext'] == 'm3u8':
2238                     # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2239                     # files may contain is WebVTT:
2240                     # <https://tools.ietf.org/html/rfc8216#section-3.1>
2241                     sub_info['ext'] = 'vtt'
2242                     sub_info['protocol'] = 'm3u8_native'
2243                 lang = media.get('LANGUAGE') or 'und'
2244                 subtitles.setdefault(lang, []).append(sub_info)
2245             if media_type not in ('VIDEO', 'AUDIO'):
2246                 return
2247             media_url = media.get('URI')
2248             if media_url:
2249                 manifest_url = format_url(media_url)
2250                 formats.extend({
2251                     'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2252                     'format_note': name,
2253                     'format_index': idx,
2254                     'url': manifest_url,
2255                     'manifest_url': m3u8_url,
2256                     'language': media.get('LANGUAGE'),
2257                     'ext': ext,
2258                     'protocol': entry_protocol,
2259                     'preference': preference,
2260                     'quality': quality,
2261                     'vcodec': 'none' if media_type == 'AUDIO' else None,
2262                 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2263
2264         def build_stream_name():
2265             # Despite specification does not mention NAME attribute for
2266             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2267             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2268             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2269             stream_name = last_stream_inf.get('NAME')
2270             if stream_name:
2271                 return stream_name
2272             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2273             # from corresponding rendition group
2274             stream_group_id = last_stream_inf.get('VIDEO')
2275             if not stream_group_id:
2276                 return
2277             stream_group = groups.get(stream_group_id)
2278             if not stream_group:
2279                 return stream_group_id
2280             rendition = stream_group[0]
2281             return rendition.get('NAME') or stream_group_id
2282
2283         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2284         # chance to detect video only formats when EXT-X-STREAM-INF tags
2285         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2286         for line in m3u8_doc.splitlines():
2287             if line.startswith('#EXT-X-MEDIA:'):
2288                 extract_media(line)
2289
2290         for line in m3u8_doc.splitlines():
2291             if line.startswith('#EXT-X-STREAM-INF:'):
2292                 last_stream_inf = parse_m3u8_attributes(line)
2293             elif line.startswith('#') or not line.strip():
2294                 continue
2295             else:
2296                 tbr = float_or_none(
2297                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2298                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2299                 manifest_url = format_url(line.strip())
2300
2301                 for idx in _extract_m3u8_playlist_indices(manifest_url):
2302                     format_id = [m3u8_id, None, idx]
2303                     # Bandwidth of live streams may differ over time thus making
2304                     # format_id unpredictable. So it's better to keep provided
2305                     # format_id intact.
2306                     if not live:
2307                         stream_name = build_stream_name()
2308                         format_id[1] = stream_name or '%d' % (tbr or len(formats))
2309                     f = {
2310                         'format_id': join_nonempty(*format_id),
2311                         'format_index': idx,
2312                         'url': manifest_url,
2313                         'manifest_url': m3u8_url,
2314                         'tbr': tbr,
2315                         'ext': ext,
2316                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2317                         'protocol': entry_protocol,
2318                         'preference': preference,
2319                         'quality': quality,
2320                     }
2321                     resolution = last_stream_inf.get('RESOLUTION')
2322                     if resolution:
2323                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2324                         if mobj:
2325                             f['width'] = int(mobj.group('width'))
2326                             f['height'] = int(mobj.group('height'))
2327                     # Unified Streaming Platform
2328                     mobj = re.search(
2329                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2330                     if mobj:
2331                         abr, vbr = mobj.groups()
2332                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2333                         f.update({
2334                             'vbr': vbr,
2335                             'abr': abr,
2336                         })
2337                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2338                     f.update(codecs)
2339                     audio_group_id = last_stream_inf.get('AUDIO')
2340                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2341                     # references a rendition group MUST have a CODECS attribute.
2342                     # However, this is not always respected, for example, [2]
2343                     # contains EXT-X-STREAM-INF tag which references AUDIO
2344                     # rendition group but does not have CODECS and despite
2345                     # referencing an audio group it represents a complete
2346                     # (with audio and video) format. So, for such cases we will
2347                     # ignore references to rendition groups and treat them
2348                     # as complete formats.
2349                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2350                         audio_group = groups.get(audio_group_id)
2351                         if audio_group and audio_group[0].get('URI'):
2352                             # TODO: update acodec for audio only formats with
2353                             # the same GROUP-ID
2354                             f['acodec'] = 'none'
2355                     if not f.get('ext'):
2356                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2357                     formats.append(f)
2358
2359                     # for DailyMotion
2360                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2361                     if progressive_uri:
2362                         http_f = f.copy()
2363                         del http_f['manifest_url']
2364                         http_f.update({
2365                             'format_id': f['format_id'].replace('hls-', 'http-'),
2366                             'protocol': 'http',
2367                             'url': progressive_uri,
2368                         })
2369                         formats.append(http_f)
2370
2371                 last_stream_inf = {}
2372         return formats, subtitles
2373
2374     def _extract_m3u8_vod_duration(
2375             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2376
2377         m3u8_vod = self._download_webpage(
2378             m3u8_vod_url, video_id,
2379             note='Downloading m3u8 VOD manifest' if note is None else note,
2380             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2381             fatal=False, data=data, headers=headers, query=query)
2382
2383         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2384
2385     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2386         if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2387             return None
2388
2389         return int(sum(
2390             float(line[len('#EXTINF:'):].split(',')[0])
2391             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2392
2393     @staticmethod
2394     def _xpath_ns(path, namespace=None):
2395         if not namespace:
2396             return path
2397         out = []
2398         for c in path.split('/'):
2399             if not c or c == '.':
2400                 out.append(c)
2401             else:
2402                 out.append('{%s}%s' % (namespace, c))
2403         return '/'.join(out)
2404
2405     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2406         res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2407         if res is False:
2408             assert not fatal
2409             return [], {}
2410
2411         smil, urlh = res
2412         smil_url = urlh.geturl()
2413
2414         namespace = self._parse_smil_namespace(smil)
2415
2416         fmts = self._parse_smil_formats(
2417             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2418         subs = self._parse_smil_subtitles(
2419             smil, namespace=namespace)
2420
2421         return fmts, subs
2422
2423     def _extract_smil_formats(self, *args, **kwargs):
2424         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2425         if subs:
2426             self._report_ignoring_subs('SMIL')
2427         return fmts
2428
2429     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2430         res = self._download_smil(smil_url, video_id, fatal=fatal)
2431         if res is False:
2432             return {}
2433
2434         smil, urlh = res
2435         smil_url = urlh.geturl()
2436
2437         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2438
2439     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2440         return self._download_xml_handle(
2441             smil_url, video_id, 'Downloading SMIL file',
2442             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2443
2444     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2445         namespace = self._parse_smil_namespace(smil)
2446
2447         formats = self._parse_smil_formats(
2448             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2449         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2450
2451         video_id = os.path.splitext(url_basename(smil_url))[0]
2452         title = None
2453         description = None
2454         upload_date = None
2455         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2456             name = meta.attrib.get('name')
2457             content = meta.attrib.get('content')
2458             if not name or not content:
2459                 continue
2460             if not title and name == 'title':
2461                 title = content
2462             elif not description and name in ('description', 'abstract'):
2463                 description = content
2464             elif not upload_date and name == 'date':
2465                 upload_date = unified_strdate(content)
2466
2467         thumbnails = [{
2468             'id': image.get('type'),
2469             'url': image.get('src'),
2470             'width': int_or_none(image.get('width')),
2471             'height': int_or_none(image.get('height')),
2472         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2473
2474         return {
2475             'id': video_id,
2476             'title': title or video_id,
2477             'description': description,
2478             'upload_date': upload_date,
2479             'thumbnails': thumbnails,
2480             'formats': formats,
2481             'subtitles': subtitles,
2482         }
2483
2484     def _parse_smil_namespace(self, smil):
2485         return self._search_regex(
2486             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2487
2488     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2489         base = smil_url
2490         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2491             b = meta.get('base') or meta.get('httpBase')
2492             if b:
2493                 base = b
2494                 break
2495
2496         formats = []
2497         rtmp_count = 0
2498         http_count = 0
2499         m3u8_count = 0
2500         imgs_count = 0
2501
2502         srcs = set()
2503         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2504         for medium in media:
2505             src = medium.get('src')
2506             if not src or src in srcs:
2507                 continue
2508             srcs.add(src)
2509
2510             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2511             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2512             width = int_or_none(medium.get('width'))
2513             height = int_or_none(medium.get('height'))
2514             proto = medium.get('proto')
2515             ext = medium.get('ext')
2516             src_ext = determine_ext(src)
2517             streamer = medium.get('streamer') or base
2518
2519             if proto == 'rtmp' or streamer.startswith('rtmp'):
2520                 rtmp_count += 1
2521                 formats.append({
2522                     'url': streamer,
2523                     'play_path': src,
2524                     'ext': 'flv',
2525                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2526                     'tbr': bitrate,
2527                     'filesize': filesize,
2528                     'width': width,
2529                     'height': height,
2530                 })
2531                 if transform_rtmp_url:
2532                     streamer, src = transform_rtmp_url(streamer, src)
2533                     formats[-1].update({
2534                         'url': streamer,
2535                         'play_path': src,
2536                     })
2537                 continue
2538
2539             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2540             src_url = src_url.strip()
2541
2542             if proto == 'm3u8' or src_ext == 'm3u8':
2543                 m3u8_formats = self._extract_m3u8_formats(
2544                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2545                 if len(m3u8_formats) == 1:
2546                     m3u8_count += 1
2547                     m3u8_formats[0].update({
2548                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2549                         'tbr': bitrate,
2550                         'width': width,
2551                         'height': height,
2552                     })
2553                 formats.extend(m3u8_formats)
2554             elif src_ext == 'f4m':
2555                 f4m_url = src_url
2556                 if not f4m_params:
2557                     f4m_params = {
2558                         'hdcore': '3.2.0',
2559                         'plugin': 'flowplayer-3.2.0.1',
2560                     }
2561                 f4m_url += '&' if '?' in f4m_url else '?'
2562                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2563                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2564             elif src_ext == 'mpd':
2565                 formats.extend(self._extract_mpd_formats(
2566                     src_url, video_id, mpd_id='dash', fatal=False))
2567             elif re.search(r'\.ism/[Mm]anifest', src_url):
2568                 formats.extend(self._extract_ism_formats(
2569                     src_url, video_id, ism_id='mss', fatal=False))
2570             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2571                 http_count += 1
2572                 formats.append({
2573                     'url': src_url,
2574                     'ext': ext or src_ext or 'flv',
2575                     'format_id': 'http-%d' % (bitrate or http_count),
2576                     'tbr': bitrate,
2577                     'filesize': filesize,
2578                     'width': width,
2579                     'height': height,
2580                 })
2581
2582         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2583             src = medium.get('src')
2584             if not src or src in srcs:
2585                 continue
2586             srcs.add(src)
2587
2588             imgs_count += 1
2589             formats.append({
2590                 'format_id': 'imagestream-%d' % (imgs_count),
2591                 'url': src,
2592                 'ext': mimetype2ext(medium.get('type')),
2593                 'acodec': 'none',
2594                 'vcodec': 'none',
2595                 'width': int_or_none(medium.get('width')),
2596                 'height': int_or_none(medium.get('height')),
2597                 'format_note': 'SMIL storyboards',
2598             })
2599
2600         return formats
2601
2602     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2603         urls = []
2604         subtitles = {}
2605         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2606             src = textstream.get('src')
2607             if not src or src in urls:
2608                 continue
2609             urls.append(src)
2610             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2611             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2612             subtitles.setdefault(lang, []).append({
2613                 'url': src,
2614                 'ext': ext,
2615             })
2616         return subtitles
2617
2618     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2619         res = self._download_xml_handle(
2620             xspf_url, playlist_id, 'Downloading xpsf playlist',
2621             'Unable to download xspf manifest', fatal=fatal)
2622         if res is False:
2623             return []
2624
2625         xspf, urlh = res
2626         xspf_url = urlh.geturl()
2627
2628         return self._parse_xspf(
2629             xspf, playlist_id, xspf_url=xspf_url,
2630             xspf_base_url=base_url(xspf_url))
2631
2632     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2633         NS_MAP = {
2634             'xspf': 'http://xspf.org/ns/0/',
2635             's1': 'http://static.streamone.nl/player/ns/0',
2636         }
2637
2638         entries = []
2639         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2640             title = xpath_text(
2641                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2642             description = xpath_text(
2643                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2644             thumbnail = xpath_text(
2645                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2646             duration = float_or_none(
2647                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2648
2649             formats = []
2650             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2651                 format_url = urljoin(xspf_base_url, location.text)
2652                 if not format_url:
2653                     continue
2654                 formats.append({
2655                     'url': format_url,
2656                     'manifest_url': xspf_url,
2657                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2658                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2659                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2660                 })
2661             self._sort_formats(formats)
2662
2663             entries.append({
2664                 'id': playlist_id,
2665                 'title': title,
2666                 'description': description,
2667                 'thumbnail': thumbnail,
2668                 'duration': duration,
2669                 'formats': formats,
2670             })
2671         return entries
2672
2673     def _extract_mpd_formats(self, *args, **kwargs):
2674         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2675         if subs:
2676             self._report_ignoring_subs('DASH')
2677         return fmts
2678
2679     def _extract_mpd_formats_and_subtitles(
2680             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2681             fatal=True, data=None, headers={}, query={}):
2682         res = self._download_xml_handle(
2683             mpd_url, video_id,
2684             note='Downloading MPD manifest' if note is None else note,
2685             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2686             fatal=fatal, data=data, headers=headers, query=query)
2687         if res is False:
2688             return [], {}
2689         mpd_doc, urlh = res
2690         if mpd_doc is None:
2691             return [], {}
2692
2693         # We could have been redirected to a new url when we retrieved our mpd file.
2694         mpd_url = urlh.geturl()
2695         mpd_base_url = base_url(mpd_url)
2696
2697         return self._parse_mpd_formats_and_subtitles(
2698             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2699
2700     def _parse_mpd_formats(self, *args, **kwargs):
2701         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2702         if subs:
2703             self._report_ignoring_subs('DASH')
2704         return fmts
2705
2706     def _parse_mpd_formats_and_subtitles(
2707             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2708         """
2709         Parse formats from MPD manifest.
2710         References:
2711          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2712             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2713          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2714         """
2715         if not self.get_param('dynamic_mpd', True):
2716             if mpd_doc.get('type') == 'dynamic':
2717                 return [], {}
2718
2719         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2720
2721         def _add_ns(path):
2722             return self._xpath_ns(path, namespace)
2723
2724         def is_drm_protected(element):
2725             return element.find(_add_ns('ContentProtection')) is not None
2726
2727         def extract_multisegment_info(element, ms_parent_info):
2728             ms_info = ms_parent_info.copy()
2729
2730             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2731             # common attributes and elements.  We will only extract relevant
2732             # for us.
2733             def extract_common(source):
2734                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2735                 if segment_timeline is not None:
2736                     s_e = segment_timeline.findall(_add_ns('S'))
2737                     if s_e:
2738                         ms_info['total_number'] = 0
2739                         ms_info['s'] = []
2740                         for s in s_e:
2741                             r = int(s.get('r', 0))
2742                             ms_info['total_number'] += 1 + r
2743                             ms_info['s'].append({
2744                                 't': int(s.get('t', 0)),
2745                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2746                                 'd': int(s.attrib['d']),
2747                                 'r': r,
2748                             })
2749                 start_number = source.get('startNumber')
2750                 if start_number:
2751                     ms_info['start_number'] = int(start_number)
2752                 timescale = source.get('timescale')
2753                 if timescale:
2754                     ms_info['timescale'] = int(timescale)
2755                 segment_duration = source.get('duration')
2756                 if segment_duration:
2757                     ms_info['segment_duration'] = float(segment_duration)
2758
2759             def extract_Initialization(source):
2760                 initialization = source.find(_add_ns('Initialization'))
2761                 if initialization is not None:
2762                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2763
2764             segment_list = element.find(_add_ns('SegmentList'))
2765             if segment_list is not None:
2766                 extract_common(segment_list)
2767                 extract_Initialization(segment_list)
2768                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2769                 if segment_urls_e:
2770                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2771             else:
2772                 segment_template = element.find(_add_ns('SegmentTemplate'))
2773                 if segment_template is not None:
2774                     extract_common(segment_template)
2775                     media = segment_template.get('media')
2776                     if media:
2777                         ms_info['media'] = media
2778                     initialization = segment_template.get('initialization')
2779                     if initialization:
2780                         ms_info['initialization'] = initialization
2781                     else:
2782                         extract_Initialization(segment_template)
2783             return ms_info
2784
2785         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2786         formats, subtitles = [], {}
2787         stream_numbers = collections.defaultdict(int)
2788         for period in mpd_doc.findall(_add_ns('Period')):
2789             period_duration = parse_duration(period.get('duration')) or mpd_duration
2790             period_ms_info = extract_multisegment_info(period, {
2791                 'start_number': 1,
2792                 'timescale': 1,
2793             })
2794             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2795                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2796                 for representation in adaptation_set.findall(_add_ns('Representation')):
2797                     representation_attrib = adaptation_set.attrib.copy()
2798                     representation_attrib.update(representation.attrib)
2799                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2800                     mime_type = representation_attrib['mimeType']
2801                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2802
2803                     codecs = parse_codecs(representation_attrib.get('codecs', ''))
2804                     if content_type not in ('video', 'audio', 'text'):
2805                         if mime_type == 'image/jpeg':
2806                             content_type = mime_type
2807                         elif codecs['vcodec'] != 'none':
2808                             content_type = 'video'
2809                         elif codecs['acodec'] != 'none':
2810                             content_type = 'audio'
2811                         elif codecs.get('tcodec', 'none') != 'none':
2812                             content_type = 'text'
2813                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2814                             content_type = 'text'
2815                         else:
2816                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2817                             continue
2818
2819                     base_url = ''
2820                     for element in (representation, adaptation_set, period, mpd_doc):
2821                         base_url_e = element.find(_add_ns('BaseURL'))
2822                         if base_url_e is not None:
2823                             base_url = base_url_e.text + base_url
2824                             if re.match(r'^https?://', base_url):
2825                                 break
2826                     if mpd_base_url and base_url.startswith('/'):
2827                         base_url = compat_urlparse.urljoin(mpd_base_url, base_url)
2828                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2829                         if not mpd_base_url.endswith('/'):
2830                             mpd_base_url += '/'
2831                         base_url = mpd_base_url + base_url
2832                     representation_id = representation_attrib.get('id')
2833                     lang = representation_attrib.get('lang')
2834                     url_el = representation.find(_add_ns('BaseURL'))
2835                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2836                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2837                     if representation_id is not None:
2838                         format_id = representation_id
2839                     else:
2840                         format_id = content_type
2841                     if mpd_id:
2842                         format_id = mpd_id + '-' + format_id
2843                     if content_type in ('video', 'audio'):
2844                         f = {
2845                             'format_id': format_id,
2846                             'manifest_url': mpd_url,
2847                             'ext': mimetype2ext(mime_type),
2848                             'width': int_or_none(representation_attrib.get('width')),
2849                             'height': int_or_none(representation_attrib.get('height')),
2850                             'tbr': float_or_none(bandwidth, 1000),
2851                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2852                             'fps': int_or_none(representation_attrib.get('frameRate')),
2853                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2854                             'format_note': 'DASH %s' % content_type,
2855                             'filesize': filesize,
2856                             'container': mimetype2ext(mime_type) + '_dash',
2857                             **codecs
2858                         }
2859                     elif content_type == 'text':
2860                         f = {
2861                             'ext': mimetype2ext(mime_type),
2862                             'manifest_url': mpd_url,
2863                             'filesize': filesize,
2864                         }
2865                     elif content_type == 'image/jpeg':
2866                         # See test case in VikiIE
2867                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2868                         f = {
2869                             'format_id': format_id,
2870                             'ext': 'mhtml',
2871                             'manifest_url': mpd_url,
2872                             'format_note': 'DASH storyboards (jpeg)',
2873                             'acodec': 'none',
2874                             'vcodec': 'none',
2875                         }
2876                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2877                         f['has_drm'] = True
2878                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2879
2880                     def prepare_template(template_name, identifiers):
2881                         tmpl = representation_ms_info[template_name]
2882                         # First of, % characters outside $...$ templates
2883                         # must be escaped by doubling for proper processing
2884                         # by % operator string formatting used further (see
2885                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2886                         t = ''
2887                         in_template = False
2888                         for c in tmpl:
2889                             t += c
2890                             if c == '$':
2891                                 in_template = not in_template
2892                             elif c == '%' and not in_template:
2893                                 t += c
2894                         # Next, $...$ templates are translated to their
2895                         # %(...) counterparts to be used with % operator
2896                         if representation_id is not None:
2897                             t = t.replace('$RepresentationID$', representation_id)
2898                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2899                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2900                         t.replace('$$', '$')
2901                         return t
2902
2903                     # @initialization is a regular template like @media one
2904                     # so it should be handled just the same way (see
2905                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2906                     if 'initialization' in representation_ms_info:
2907                         initialization_template = prepare_template(
2908                             'initialization',
2909                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2910                             # $Time$ shall not be included for @initialization thus
2911                             # only $Bandwidth$ remains
2912                             ('Bandwidth', ))
2913                         representation_ms_info['initialization_url'] = initialization_template % {
2914                             'Bandwidth': bandwidth,
2915                         }
2916
2917                     def location_key(location):
2918                         return 'url' if re.match(r'^https?://', location) else 'path'
2919
2920                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2921
2922                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2923                         media_location_key = location_key(media_template)
2924
2925                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2926                         # can't be used at the same time
2927                         if '%(Number' in media_template and 's' not in representation_ms_info:
2928                             segment_duration = None
2929                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2930                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2931                                 representation_ms_info['total_number'] = int(math.ceil(
2932                                     float_or_none(period_duration, segment_duration, default=0)))
2933                             representation_ms_info['fragments'] = [{
2934                                 media_location_key: media_template % {
2935                                     'Number': segment_number,
2936                                     'Bandwidth': bandwidth,
2937                                 },
2938                                 'duration': segment_duration,
2939                             } for segment_number in range(
2940                                 representation_ms_info['start_number'],
2941                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2942                         else:
2943                             # $Number*$ or $Time$ in media template with S list available
2944                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2945                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2946                             representation_ms_info['fragments'] = []
2947                             segment_time = 0
2948                             segment_d = None
2949                             segment_number = representation_ms_info['start_number']
2950
2951                             def add_segment_url():
2952                                 segment_url = media_template % {
2953                                     'Time': segment_time,
2954                                     'Bandwidth': bandwidth,
2955                                     'Number': segment_number,
2956                                 }
2957                                 representation_ms_info['fragments'].append({
2958                                     media_location_key: segment_url,
2959                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2960                                 })
2961
2962                             for num, s in enumerate(representation_ms_info['s']):
2963                                 segment_time = s.get('t') or segment_time
2964                                 segment_d = s['d']
2965                                 add_segment_url()
2966                                 segment_number += 1
2967                                 for r in range(s.get('r', 0)):
2968                                     segment_time += segment_d
2969                                     add_segment_url()
2970                                     segment_number += 1
2971                                 segment_time += segment_d
2972                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2973                         # No media template
2974                         # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2975                         # or any YouTube dashsegments video
2976                         fragments = []
2977                         segment_index = 0
2978                         timescale = representation_ms_info['timescale']
2979                         for s in representation_ms_info['s']:
2980                             duration = float_or_none(s['d'], timescale)
2981                             for r in range(s.get('r', 0) + 1):
2982                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
2983                                 fragments.append({
2984                                     location_key(segment_uri): segment_uri,
2985                                     'duration': duration,
2986                                 })
2987                                 segment_index += 1
2988                         representation_ms_info['fragments'] = fragments
2989                     elif 'segment_urls' in representation_ms_info:
2990                         # Segment URLs with no SegmentTimeline
2991                         # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2992                         # https://github.com/ytdl-org/youtube-dl/pull/14844
2993                         fragments = []
2994                         segment_duration = float_or_none(
2995                             representation_ms_info['segment_duration'],
2996                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2997                         for segment_url in representation_ms_info['segment_urls']:
2998                             fragment = {
2999                                 location_key(segment_url): segment_url,
3000                             }
3001                             if segment_duration:
3002                                 fragment['duration'] = segment_duration
3003                             fragments.append(fragment)
3004                         representation_ms_info['fragments'] = fragments
3005                     # If there is a fragments key available then we correctly recognized fragmented media.
3006                     # Otherwise we will assume unfragmented media with direct access. Technically, such
3007                     # assumption is not necessarily correct since we may simply have no support for
3008                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
3009                     if 'fragments' in representation_ms_info:
3010                         f.update({
3011                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
3012                             'url': mpd_url or base_url,
3013                             'fragment_base_url': base_url,
3014                             'fragments': [],
3015                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
3016                         })
3017                         if 'initialization_url' in representation_ms_info:
3018                             initialization_url = representation_ms_info['initialization_url']
3019                             if not f.get('url'):
3020                                 f['url'] = initialization_url
3021                             f['fragments'].append({location_key(initialization_url): initialization_url})
3022                         f['fragments'].extend(representation_ms_info['fragments'])
3023                         if not period_duration:
3024                             period_duration = try_get(
3025                                 representation_ms_info,
3026                                 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
3027                     else:
3028                         # Assuming direct URL to unfragmented media.
3029                         f['url'] = base_url
3030                     if content_type in ('video', 'audio', 'image/jpeg'):
3031                         f['manifest_stream_number'] = stream_numbers[f['url']]
3032                         stream_numbers[f['url']] += 1
3033                         formats.append(f)
3034                     elif content_type == 'text':
3035                         subtitles.setdefault(lang or 'und', []).append(f)
3036
3037         return formats, subtitles
3038
3039     def _extract_ism_formats(self, *args, **kwargs):
3040         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
3041         if subs:
3042             self._report_ignoring_subs('ISM')
3043         return fmts
3044
3045     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
3046         res = self._download_xml_handle(
3047             ism_url, video_id,
3048             note='Downloading ISM manifest' if note is None else note,
3049             errnote='Failed to download ISM manifest' if errnote is None else errnote,
3050             fatal=fatal, data=data, headers=headers, query=query)
3051         if res is False:
3052             return [], {}
3053         ism_doc, urlh = res
3054         if ism_doc is None:
3055             return [], {}
3056
3057         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
3058
3059     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
3060         """
3061         Parse formats from ISM manifest.
3062         References:
3063          1. [MS-SSTR]: Smooth Streaming Protocol,
3064             https://msdn.microsoft.com/en-us/library/ff469518.aspx
3065         """
3066         if ism_doc.get('IsLive') == 'TRUE':
3067             return [], {}
3068
3069         duration = int(ism_doc.attrib['Duration'])
3070         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
3071
3072         formats = []
3073         subtitles = {}
3074         for stream in ism_doc.findall('StreamIndex'):
3075             stream_type = stream.get('Type')
3076             if stream_type not in ('video', 'audio', 'text'):
3077                 continue
3078             url_pattern = stream.attrib['Url']
3079             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
3080             stream_name = stream.get('Name')
3081             stream_language = stream.get('Language', 'und')
3082             for track in stream.findall('QualityLevel'):
3083                 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
3084                 # TODO: add support for WVC1 and WMAP
3085                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
3086                     self.report_warning('%s is not a supported codec' % fourcc)
3087                     continue
3088                 tbr = int(track.attrib['Bitrate']) // 1000
3089                 # [1] does not mention Width and Height attributes. However,
3090                 # they're often present while MaxWidth and MaxHeight are
3091                 # missing, so should be used as fallbacks
3092                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3093                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3094                 sampling_rate = int_or_none(track.get('SamplingRate'))
3095
3096                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3097                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
3098
3099                 fragments = []
3100                 fragment_ctx = {
3101                     'time': 0,
3102                 }
3103                 stream_fragments = stream.findall('c')
3104                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3105                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3106                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3107                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3108                     if not fragment_ctx['duration']:
3109                         try:
3110                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
3111                         except IndexError:
3112                             next_fragment_time = duration
3113                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3114                     for _ in range(fragment_repeat):
3115                         fragments.append({
3116                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
3117                             'duration': fragment_ctx['duration'] / stream_timescale,
3118                         })
3119                         fragment_ctx['time'] += fragment_ctx['duration']
3120
3121                 if stream_type == 'text':
3122                     subtitles.setdefault(stream_language, []).append({
3123                         'ext': 'ismt',
3124                         'protocol': 'ism',
3125                         'url': ism_url,
3126                         'manifest_url': ism_url,
3127                         'fragments': fragments,
3128                         '_download_params': {
3129                             'stream_type': stream_type,
3130                             'duration': duration,
3131                             'timescale': stream_timescale,
3132                             'fourcc': fourcc,
3133                             'language': stream_language,
3134                             'codec_private_data': track.get('CodecPrivateData'),
3135                         }
3136                     })
3137                 elif stream_type in ('video', 'audio'):
3138                     formats.append({
3139                         'format_id': join_nonempty(ism_id, stream_name, tbr),
3140                         'url': ism_url,
3141                         'manifest_url': ism_url,
3142                         'ext': 'ismv' if stream_type == 'video' else 'isma',
3143                         'width': width,
3144                         'height': height,
3145                         'tbr': tbr,
3146                         'asr': sampling_rate,
3147                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
3148                         'acodec': 'none' if stream_type == 'video' else fourcc,
3149                         'protocol': 'ism',
3150                         'fragments': fragments,
3151                         'has_drm': ism_doc.find('Protection') is not None,
3152                         '_download_params': {
3153                             'stream_type': stream_type,
3154                             'duration': duration,
3155                             'timescale': stream_timescale,
3156                             'width': width or 0,
3157                             'height': height or 0,
3158                             'fourcc': fourcc,
3159                             'language': stream_language,
3160                             'codec_private_data': track.get('CodecPrivateData'),
3161                             'sampling_rate': sampling_rate,
3162                             'channels': int_or_none(track.get('Channels', 2)),
3163                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3164                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3165                         },
3166                     })
3167         return formats, subtitles
3168
3169     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
3170         def absolute_url(item_url):
3171             return urljoin(base_url, item_url)
3172
3173         def parse_content_type(content_type):
3174             if not content_type:
3175                 return {}
3176             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3177             if ctr:
3178                 mimetype, codecs = ctr.groups()
3179                 f = parse_codecs(codecs)
3180                 f['ext'] = mimetype2ext(mimetype)
3181                 return f
3182             return {}
3183
3184         def _media_formats(src, cur_media_type, type_info={}):
3185             full_url = absolute_url(src)
3186             ext = type_info.get('ext') or determine_ext(full_url)
3187             if ext == 'm3u8':
3188                 is_plain_url = False
3189                 formats = self._extract_m3u8_formats(
3190                     full_url, video_id, ext='mp4',
3191                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3192                     preference=preference, quality=quality, fatal=False)
3193             elif ext == 'mpd':
3194                 is_plain_url = False
3195                 formats = self._extract_mpd_formats(
3196                     full_url, video_id, mpd_id=mpd_id, fatal=False)
3197             else:
3198                 is_plain_url = True
3199                 formats = [{
3200                     'url': full_url,
3201                     'vcodec': 'none' if cur_media_type == 'audio' else None,
3202                 }]
3203             return is_plain_url, formats
3204
3205         entries = []
3206         # amp-video and amp-audio are very similar to their HTML5 counterparts
3207         # so we wll include them right here (see
3208         # https://www.ampproject.org/docs/reference/components/amp-video)
3209         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3210         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3211         media_tags = [(media_tag, media_tag_name, media_type, '')
3212                       for media_tag, media_tag_name, media_type
3213                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3214         media_tags.extend(re.findall(
3215             # We only allow video|audio followed by a whitespace or '>'.
3216             # Allowing more characters may end up in significant slow down (see
3217             # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
3218             # http://www.porntrex.com/maps/videositemap.xml).
3219             r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3220         for media_tag, _, media_type, media_content in media_tags:
3221             media_info = {
3222                 'formats': [],
3223                 'subtitles': {},
3224             }
3225             media_attributes = extract_attributes(media_tag)
3226             src = strip_or_none(media_attributes.get('src'))
3227             if src:
3228                 _, formats = _media_formats(src, media_type)
3229                 media_info['formats'].extend(formats)
3230             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3231             if media_content:
3232                 for source_tag in re.findall(r'<source[^>]+>', media_content):
3233                     s_attr = extract_attributes(source_tag)
3234                     # data-video-src and data-src are non standard but seen
3235                     # several times in the wild
3236                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
3237                     if not src:
3238                         continue
3239                     f = parse_content_type(s_attr.get('type'))
3240                     is_plain_url, formats = _media_formats(src, media_type, f)
3241                     if is_plain_url:
3242                         # width, height, res, label and title attributes are
3243                         # all not standard but seen several times in the wild
3244                         labels = [
3245                             s_attr.get(lbl)
3246                             for lbl in ('label', 'title')
3247                             if str_or_none(s_attr.get(lbl))
3248                         ]
3249                         width = int_or_none(s_attr.get('width'))
3250                         height = (int_or_none(s_attr.get('height'))
3251                                   or int_or_none(s_attr.get('res')))
3252                         if not width or not height:
3253                             for lbl in labels:
3254                                 resolution = parse_resolution(lbl)
3255                                 if not resolution:
3256                                     continue
3257                                 width = width or resolution.get('width')
3258                                 height = height or resolution.get('height')
3259                         for lbl in labels:
3260                             tbr = parse_bitrate(lbl)
3261                             if tbr:
3262                                 break
3263                         else:
3264                             tbr = None
3265                         f.update({
3266                             'width': width,
3267                             'height': height,
3268                             'tbr': tbr,
3269                             'format_id': s_attr.get('label') or s_attr.get('title'),
3270                         })
3271                         f.update(formats[0])
3272                         media_info['formats'].append(f)
3273                     else:
3274                         media_info['formats'].extend(formats)
3275                 for track_tag in re.findall(r'<track[^>]+>', media_content):
3276                     track_attributes = extract_attributes(track_tag)
3277                     kind = track_attributes.get('kind')
3278                     if not kind or kind in ('subtitles', 'captions'):
3279                         src = strip_or_none(track_attributes.get('src'))
3280                         if not src:
3281                             continue
3282                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3283                         media_info['subtitles'].setdefault(lang, []).append({
3284                             'url': absolute_url(src),
3285                         })
3286             for f in media_info['formats']:
3287                 f.setdefault('http_headers', {})['Referer'] = base_url
3288             if media_info['formats'] or media_info['subtitles']:
3289                 entries.append(media_info)
3290         return entries
3291
3292     def _extract_akamai_formats(self, *args, **kwargs):
3293         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3294         if subs:
3295             self._report_ignoring_subs('akamai')
3296         return fmts
3297
3298     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3299         signed = 'hdnea=' in manifest_url
3300         if not signed:
3301             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3302             manifest_url = re.sub(
3303                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3304                 '', manifest_url).strip('?')
3305
3306         formats = []
3307         subtitles = {}
3308
3309         hdcore_sign = 'hdcore=3.7.0'
3310         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3311         hds_host = hosts.get('hds')
3312         if hds_host:
3313             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3314         if 'hdcore=' not in f4m_url:
3315             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3316         f4m_formats = self._extract_f4m_formats(
3317             f4m_url, video_id, f4m_id='hds', fatal=False)
3318         for entry in f4m_formats:
3319             entry.update({'extra_param_to_segment_url': hdcore_sign})
3320         formats.extend(f4m_formats)
3321
3322         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3323         hls_host = hosts.get('hls')
3324         if hls_host:
3325             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3326         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3327             m3u8_url, video_id, 'mp4', 'm3u8_native',
3328             m3u8_id='hls', fatal=False)
3329         formats.extend(m3u8_formats)
3330         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3331
3332         http_host = hosts.get('http')
3333         if http_host and m3u8_formats and not signed:
3334             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3335             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3336             qualities_length = len(qualities)
3337             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3338                 i = 0
3339                 for f in m3u8_formats:
3340                     if f['vcodec'] != 'none':
3341                         for protocol in ('http', 'https'):
3342                             http_f = f.copy()
3343                             del http_f['manifest_url']
3344                             http_url = re.sub(
3345                                 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3346                             http_f.update({
3347                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3348                                 'url': http_url,
3349                                 'protocol': protocol,
3350                             })
3351                             formats.append(http_f)
3352                         i += 1
3353
3354         return formats, subtitles
3355
3356     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3357         query = compat_urlparse.urlparse(url).query
3358         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3359         mobj = re.search(
3360             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3361         url_base = mobj.group('url')
3362         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3363         formats = []
3364
3365         def manifest_url(manifest):
3366             m_url = f'{http_base_url}/{manifest}'
3367             if query:
3368                 m_url += '?%s' % query
3369             return m_url
3370
3371         if 'm3u8' not in skip_protocols:
3372             formats.extend(self._extract_m3u8_formats(
3373                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3374                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3375         if 'f4m' not in skip_protocols:
3376             formats.extend(self._extract_f4m_formats(
3377                 manifest_url('manifest.f4m'),
3378                 video_id, f4m_id='hds', fatal=False))
3379         if 'dash' not in skip_protocols:
3380             formats.extend(self._extract_mpd_formats(
3381                 manifest_url('manifest.mpd'),
3382                 video_id, mpd_id='dash', fatal=False))
3383         if re.search(r'(?:/smil:|\.smil)', url_base):
3384             if 'smil' not in skip_protocols:
3385                 rtmp_formats = self._extract_smil_formats(
3386                     manifest_url('jwplayer.smil'),
3387                     video_id, fatal=False)
3388                 for rtmp_format in rtmp_formats:
3389                     rtsp_format = rtmp_format.copy()
3390                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3391                     del rtsp_format['play_path']
3392                     del rtsp_format['ext']
3393                     rtsp_format.update({
3394                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3395                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3396                         'protocol': 'rtsp',
3397                     })
3398                     formats.extend([rtmp_format, rtsp_format])
3399         else:
3400             for protocol in ('rtmp', 'rtsp'):
3401                 if protocol not in skip_protocols:
3402                     formats.append({
3403                         'url': f'{protocol}:{url_base}',
3404                         'format_id': protocol,
3405                         'protocol': protocol,
3406                     })
3407         return formats
3408
3409     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3410         mobj = re.search(
3411             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3412             webpage)
3413         if mobj:
3414             try:
3415                 jwplayer_data = self._parse_json(mobj.group('options'),
3416                                                  video_id=video_id,
3417                                                  transform_source=transform_source)
3418             except ExtractorError:
3419                 pass
3420             else:
3421                 if isinstance(jwplayer_data, dict):
3422                     return jwplayer_data
3423
3424     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3425         jwplayer_data = self._find_jwplayer_data(
3426             webpage, video_id, transform_source=js_to_json)
3427         return self._parse_jwplayer_data(
3428             jwplayer_data, video_id, *args, **kwargs)
3429
3430     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3431                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3432         # JWPlayer backward compatibility: flattened playlists
3433         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3434         if 'playlist' not in jwplayer_data:
3435             jwplayer_data = {'playlist': [jwplayer_data]}
3436
3437         entries = []
3438
3439         # JWPlayer backward compatibility: single playlist item
3440         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3441         if not isinstance(jwplayer_data['playlist'], list):
3442             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3443
3444         for video_data in jwplayer_data['playlist']:
3445             # JWPlayer backward compatibility: flattened sources
3446             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3447             if 'sources' not in video_data:
3448                 video_data['sources'] = [video_data]
3449
3450             this_video_id = video_id or video_data['mediaid']
3451
3452             formats = self._parse_jwplayer_formats(
3453                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3454                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3455
3456             subtitles = {}
3457             tracks = video_data.get('tracks')
3458             if tracks and isinstance(tracks, list):
3459                 for track in tracks:
3460                     if not isinstance(track, dict):
3461                         continue
3462                     track_kind = track.get('kind')
3463                     if not track_kind or not isinstance(track_kind, compat_str):
3464                         continue
3465                     if track_kind.lower() not in ('captions', 'subtitles'):
3466                         continue
3467                     track_url = urljoin(base_url, track.get('file'))
3468                     if not track_url:
3469                         continue
3470                     subtitles.setdefault(track.get('label') or 'en', []).append({
3471                         'url': self._proto_relative_url(track_url)
3472                     })
3473
3474             entry = {
3475                 'id': this_video_id,
3476                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3477                 'description': clean_html(video_data.get('description')),
3478                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3479                 'timestamp': int_or_none(video_data.get('pubdate')),
3480                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3481                 'subtitles': subtitles,
3482             }
3483             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3484             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3485                 entry.update({
3486                     '_type': 'url_transparent',
3487                     'url': formats[0]['url'],
3488                 })
3489             else:
3490                 self._sort_formats(formats)
3491                 entry['formats'] = formats
3492             entries.append(entry)
3493         if len(entries) == 1:
3494             return entries[0]
3495         else:
3496             return self.playlist_result(entries)
3497
3498     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3499                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3500         urls = []
3501         formats = []
3502         for source in jwplayer_sources_data:
3503             if not isinstance(source, dict):
3504                 continue
3505             source_url = urljoin(
3506                 base_url, self._proto_relative_url(source.get('file')))
3507             if not source_url or source_url in urls:
3508                 continue
3509             urls.append(source_url)
3510             source_type = source.get('type') or ''
3511             ext = mimetype2ext(source_type) or determine_ext(source_url)
3512             if source_type == 'hls' or ext == 'm3u8':
3513                 formats.extend(self._extract_m3u8_formats(
3514                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3515                     m3u8_id=m3u8_id, fatal=False))
3516             elif source_type == 'dash' or ext == 'mpd':
3517                 formats.extend(self._extract_mpd_formats(
3518                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3519             elif ext == 'smil':
3520                 formats.extend(self._extract_smil_formats(
3521                     source_url, video_id, fatal=False))
3522             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3523             elif source_type.startswith('audio') or ext in (
3524                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3525                 formats.append({
3526                     'url': source_url,
3527                     'vcodec': 'none',
3528                     'ext': ext,
3529                 })
3530             else:
3531                 height = int_or_none(source.get('height'))
3532                 if height is None:
3533                     # Often no height is provided but there is a label in
3534                     # format like "1080p", "720p SD", or 1080.
3535                     height = int_or_none(self._search_regex(
3536                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3537                         'height', default=None))
3538                 a_format = {
3539                     'url': source_url,
3540                     'width': int_or_none(source.get('width')),
3541                     'height': height,
3542                     'tbr': int_or_none(source.get('bitrate')),
3543                     'ext': ext,
3544                 }
3545                 if source_url.startswith('rtmp'):
3546                     a_format['ext'] = 'flv'
3547                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3548                     # of jwplayer.flash.swf
3549                     rtmp_url_parts = re.split(
3550                         r'((?:mp4|mp3|flv):)', source_url, 1)
3551                     if len(rtmp_url_parts) == 3:
3552                         rtmp_url, prefix, play_path = rtmp_url_parts
3553                         a_format.update({
3554                             'url': rtmp_url,
3555                             'play_path': prefix + play_path,
3556                         })
3557                     if rtmp_params:
3558                         a_format.update(rtmp_params)
3559                 formats.append(a_format)
3560         return formats
3561
3562     def _live_title(self, name):
3563         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3564         return name
3565
3566     def _int(self, v, name, fatal=False, **kwargs):
3567         res = int_or_none(v, **kwargs)
3568         if res is None:
3569             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3570             if fatal:
3571                 raise ExtractorError(msg)
3572             else:
3573                 self.report_warning(msg)
3574         return res
3575
3576     def _float(self, v, name, fatal=False, **kwargs):
3577         res = float_or_none(v, **kwargs)
3578         if res is None:
3579             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3580             if fatal:
3581                 raise ExtractorError(msg)
3582             else:
3583                 self.report_warning(msg)
3584         return res
3585
3586     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3587                     path='/', secure=False, discard=False, rest={}, **kwargs):
3588         cookie = compat_cookiejar_Cookie(
3589             0, name, value, port, port is not None, domain, True,
3590             domain.startswith('.'), path, True, secure, expire_time,
3591             discard, None, None, rest)
3592         self._downloader.cookiejar.set_cookie(cookie)
3593
3594     def _get_cookies(self, url):
3595         """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3596         req = sanitized_Request(url)
3597         self._downloader.cookiejar.add_cookie_header(req)
3598         return compat_cookies_SimpleCookie(req.get_header('Cookie'))
3599
3600     def _apply_first_set_cookie_header(self, url_handle, cookie):
3601         """
3602         Apply first Set-Cookie header instead of the last. Experimental.
3603
3604         Some sites (e.g. [1-3]) may serve two cookies under the same name
3605         in Set-Cookie header and expect the first (old) one to be set rather
3606         than second (new). However, as of RFC6265 the newer one cookie
3607         should be set into cookie store what actually happens.
3608         We will workaround this issue by resetting the cookie to
3609         the first one manually.
3610         1. https://new.vk.com/
3611         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3612         3. https://learning.oreilly.com/
3613         """
3614         for header, cookies in url_handle.headers.items():
3615             if header.lower() != 'set-cookie':
3616                 continue
3617             cookies = cookies.encode('iso-8859-1').decode('utf-8')
3618             cookie_value = re.search(
3619                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3620             if cookie_value:
3621                 value, domain = cookie_value.groups()
3622                 self._set_cookie(domain, cookie, value)
3623                 break
3624
3625     def get_testcases(self, include_onlymatching=False):
3626         t = getattr(self, '_TEST', None)
3627         if t:
3628             assert not hasattr(self, '_TESTS'), \
3629                 '%s has _TEST and _TESTS' % type(self).__name__
3630             tests = [t]
3631         else:
3632             tests = getattr(self, '_TESTS', [])
3633         for t in tests:
3634             if not include_onlymatching and t.get('only_matching', False):
3635                 continue
3636             t['name'] = type(self).__name__[:-len('IE')]
3637             yield t
3638
3639     def is_suitable(self, age_limit):
3640         """ Test whether the extractor is generally suitable for the given
3641         age limit (i.e. pornographic sites are not, all others usually are) """
3642
3643         any_restricted = False
3644         for tc in self.get_testcases(include_onlymatching=False):
3645             if tc.get('playlist', []):
3646                 tc = tc['playlist'][0]
3647             is_restricted = age_restricted(
3648                 tc.get('info_dict', {}).get('age_limit'), age_limit)
3649             if not is_restricted:
3650                 return True
3651             any_restricted = any_restricted or is_restricted
3652         return not any_restricted
3653
3654     def extract_subtitles(self, *args, **kwargs):
3655         if (self.get_param('writesubtitles', False)
3656                 or self.get_param('listsubtitles')):
3657             return self._get_subtitles(*args, **kwargs)
3658         return {}
3659
3660     def _get_subtitles(self, *args, **kwargs):
3661         raise NotImplementedError('This method must be implemented by subclasses')
3662
3663     def extract_comments(self, *args, **kwargs):
3664         if not self.get_param('getcomments'):
3665             return None
3666         generator = self._get_comments(*args, **kwargs)
3667
3668         def extractor():
3669             comments = []
3670             interrupted = True
3671             try:
3672                 while True:
3673                     comments.append(next(generator))
3674             except StopIteration:
3675                 interrupted = False
3676             except KeyboardInterrupt:
3677                 self.to_screen('Interrupted by user')
3678             except Exception as e:
3679                 if self.get_param('ignoreerrors') is not True:
3680                     raise
3681                 self._downloader.report_error(e)
3682             comment_count = len(comments)
3683             self.to_screen(f'Extracted {comment_count} comments')
3684             return {
3685                 'comments': comments,
3686                 'comment_count': None if interrupted else comment_count
3687             }
3688         return extractor
3689
3690     def _get_comments(self, *args, **kwargs):
3691         raise NotImplementedError('This method must be implemented by subclasses')
3692
3693     @staticmethod
3694     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3695         """ Merge subtitle items for one language. Items with duplicated URLs/data
3696         will be dropped. """
3697         list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3698         ret = list(subtitle_list1)
3699         ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3700         return ret
3701
3702     @classmethod
3703     def _merge_subtitles(cls, *dicts, target=None):
3704         """ Merge subtitle dictionaries, language by language. """
3705         if target is None:
3706             target = {}
3707         for d in dicts:
3708             for lang, subs in d.items():
3709                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3710         return target
3711
3712     def extract_automatic_captions(self, *args, **kwargs):
3713         if (self.get_param('writeautomaticsub', False)
3714                 or self.get_param('listsubtitles')):
3715             return self._get_automatic_captions(*args, **kwargs)
3716         return {}
3717
3718     def _get_automatic_captions(self, *args, **kwargs):
3719         raise NotImplementedError('This method must be implemented by subclasses')
3720
3721     def mark_watched(self, *args, **kwargs):
3722         if not self.get_param('mark_watched', False):
3723             return
3724         if (self.supports_login() and self._get_login_info()[0] is not None
3725                 or self.get_param('cookiefile') or self.get_param('cookiesfrombrowser')):
3726             self._mark_watched(*args, **kwargs)
3727
3728     def _mark_watched(self, *args, **kwargs):
3729         raise NotImplementedError('This method must be implemented by subclasses')
3730
3731     def geo_verification_headers(self):
3732         headers = {}
3733         geo_verification_proxy = self.get_param('geo_verification_proxy')
3734         if geo_verification_proxy:
3735             headers['Ytdl-request-proxy'] = geo_verification_proxy
3736         return headers
3737
3738     def _generic_id(self, url):
3739         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3740
3741     def _generic_title(self, url):
3742         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3743
3744     @staticmethod
3745     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3746         all_known = all(map(
3747             lambda x: x is not None,
3748             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3749         return (
3750             'private' if is_private
3751             else 'premium_only' if needs_premium
3752             else 'subscriber_only' if needs_subscription
3753             else 'needs_auth' if needs_auth
3754             else 'unlisted' if is_unlisted
3755             else 'public' if all_known
3756             else None)
3757
3758     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3759         '''
3760         @returns            A list of values for the extractor argument given by "key"
3761                             or "default" if no such key is present
3762         @param default      The default value to return when the key is not present (default: [])
3763         @param casesense    When false, the values are converted to lower case
3764         '''
3765         val = traverse_obj(
3766             self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key))
3767         if val is None:
3768             return [] if default is NO_DEFAULT else default
3769         return list(val) if casesense else [x.lower() for x in val]
3770
3771     def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3772         if not playlist_id or not video_id:
3773             return not video_id
3774
3775         no_playlist = (smuggled_data or {}).get('force_noplaylist')
3776         if no_playlist is not None:
3777             return not no_playlist
3778
3779         video_id = '' if video_id is True else f' {video_id}'
3780         playlist_id = '' if playlist_id is True else f' {playlist_id}'
3781         if self.get_param('noplaylist'):
3782             self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3783             return False
3784         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3785         return True
3786
3787
3788 class SearchInfoExtractor(InfoExtractor):
3789     """
3790     Base class for paged search queries extractors.
3791     They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
3792     Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
3793     """
3794
3795     _MAX_RESULTS = float('inf')
3796
3797     @classmethod
3798     def _make_valid_url(cls):
3799         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3800
3801     def _real_extract(self, query):
3802         prefix, query = self._match_valid_url(query).group('prefix', 'query')
3803         if prefix == '':
3804             return self._get_n_results(query, 1)
3805         elif prefix == 'all':
3806             return self._get_n_results(query, self._MAX_RESULTS)
3807         else:
3808             n = int(prefix)
3809             if n <= 0:
3810                 raise ExtractorError(f'invalid download number {n} for query "{query}"')
3811             elif n > self._MAX_RESULTS:
3812                 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3813                 n = self._MAX_RESULTS
3814             return self._get_n_results(query, n)
3815
3816     def _get_n_results(self, query, n):
3817         """Get a specified number of results for a query.
3818         Either this function or _search_results must be overridden by subclasses """
3819         return self.playlist_result(
3820             itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
3821             query, query)
3822
3823     def _search_results(self, query):
3824         """Returns an iterator of search results"""
3825         raise NotImplementedError('This method must be implemented by subclasses')
3826
3827     @property
3828     def SEARCH_KEY(self):
3829         return self._SEARCH_KEY