yt_dlp/extractor/common.py

   1 import base64
   2 import collections
   3 import hashlib
   4 import itertools
   5 import json
   6 import math
   7 import netrc
   8 import os
   9 import random
  10 import sys
  11 import time
  12 import xml.etree.ElementTree
  13
  14 from ..compat import (
  15     compat_cookiejar_Cookie,
  16     compat_cookies_SimpleCookie,
  17     compat_etree_fromstring,
  18     compat_expanduser,
  19     compat_getpass,
  20     compat_http_client,
  21     compat_os_name,
  22     compat_str,
  23     compat_urllib_error,
  24     compat_urllib_parse_unquote,
  25     compat_urllib_parse_urlencode,
  26     compat_urllib_request,
  27     compat_urlparse,
  28     re,
  29 )
  30 from ..downloader import FileDownloader
  31 from ..downloader.f4m import get_base_url, remove_encrypted_media
  32 from ..utils import (
  33     JSON_LD_RE,
  34     NO_DEFAULT,
  35     ExtractorError,
  36     GeoRestrictedError,
  37     GeoUtils,
  38     RegexNotFoundError,
  39     UnsupportedError,
  40     age_restricted,
  41     base_url,
  42     bug_reports_message,
  43     clean_html,
  44     determine_ext,
  45     determine_protocol,
  46     dict_get,
  47     encode_data_uri,
  48     error_to_compat_str,
  49     extract_attributes,
  50     filter_dict,
  51     fix_xml_ampersands,
  52     float_or_none,
  53     format_field,
  54     int_or_none,
  55     join_nonempty,
  56     js_to_json,
  57     mimetype2ext,
  58     network_exceptions,
  59     orderedSet,
  60     parse_bitrate,
  61     parse_codecs,
  62     parse_duration,
  63     parse_iso8601,
  64     parse_m3u8_attributes,
  65     parse_resolution,
  66     sanitize_filename,
  67     sanitized_Request,
  68     str_or_none,
  69     str_to_int,
  70     strip_or_none,
  71     traverse_obj,
  72     try_get,
  73     unescapeHTML,
  74     unified_strdate,
  75     unified_timestamp,
  76     update_Request,
  77     update_url_query,
  78     url_basename,
  79     url_or_none,
  80     urljoin,
  81     variadic,
  82     xpath_element,
  83     xpath_text,
  84     xpath_with_ns,
  85 )
  86
  87
  88 class InfoExtractor:
  89     """Information Extractor class.
  90
  91     Information extractors are the classes that, given a URL, extract
  92     information about the video (or videos) the URL refers to. This
  93     information includes the real video URL, the video title, author and
  94     others. The information is stored in a dictionary which is then
  95     passed to the YoutubeDL. The YoutubeDL processes this
  96     information possibly downloading the video to the file system, among
  97     other possible outcomes.
  98
  99     The type field determines the type of the result.
 100     By far the most common value (and the default if _type is missing) is
 101     "video", which indicates a single video.
 102
 103     For a video, the dictionaries must include the following fields:
 104
 105     id:             Video identifier.
 106     title:          Video title, unescaped.
 107
 108     Additionally, it must contain either a formats entry or a url one:
 109
 110     formats:        A list of dictionaries for each format available, ordered
 111                     from worst to best quality.
 112
 113                     Potential fields:
 114                     * url        The mandatory URL representing the media:
 115                                    for plain file media - HTTP URL of this file,
 116                                    for RTMP - RTMP URL,
 117                                    for HLS - URL of the M3U8 media playlist,
 118                                    for HDS - URL of the F4M manifest,
 119                                    for DASH
 120                                      - HTTP URL to plain file media (in case of
 121                                        unfragmented media)
 122                                      - URL of the MPD manifest or base URL
 123                                        representing the media if MPD manifest
 124                                        is parsed from a string (in case of
 125                                        fragmented media)
 126                                    for MSS - URL of the ISM manifest.
 127                     * manifest_url
 128                                  The URL of the manifest file in case of
 129                                  fragmented media:
 130                                    for HLS - URL of the M3U8 master playlist,
 131                                    for HDS - URL of the F4M manifest,
 132                                    for DASH - URL of the MPD manifest,
 133                                    for MSS - URL of the ISM manifest.
 134                     * manifest_stream_number  (For internal use only)
 135                                  The index of the stream in the manifest file
 136                     * ext        Will be calculated from URL if missing
 137                     * format     A human-readable description of the format
 138                                  ("mp4 container with h264/opus").
 139                                  Calculated from the format_id, width, height,
 140                                  and format_note fields if missing.
 141                     * format_id  A short description of the format
 142                                  ("mp4_h264_opus" or "19").
 143                                 Technically optional, but strongly recommended.
 144                     * format_note Additional info about the format
 145                                  ("3D" or "DASH video")
 146                     * width      Width of the video, if known
 147                     * height     Height of the video, if known
 148                     * resolution Textual description of width and height
 149                     * dynamic_range The dynamic range of the video. One of:
 150                                  "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
 151                     * tbr        Average bitrate of audio and video in KBit/s
 152                     * abr        Average audio bitrate in KBit/s
 153                     * acodec     Name of the audio codec in use
 154                     * asr        Audio sampling rate in Hertz
 155                     * vbr        Average video bitrate in KBit/s
 156                     * fps        Frame rate
 157                     * vcodec     Name of the video codec in use
 158                     * container  Name of the container format
 159                     * filesize   The number of bytes, if known in advance
 160                     * filesize_approx  An estimate for the number of bytes
 161                     * player_url SWF Player URL (used for rtmpdump).
 162                     * protocol   The protocol that will be used for the actual
 163                                  download, lower-case. One of "http", "https" or
 164                                  one of the protocols defined in downloader.PROTOCOL_MAP
 165                     * fragment_base_url
 166                                  Base URL for fragments. Each fragment's path
 167                                  value (if present) will be relative to
 168                                  this URL.
 169                     * fragments  A list of fragments of a fragmented media.
 170                                  Each fragment entry must contain either an url
 171                                  or a path. If an url is present it should be
 172                                  considered by a client. Otherwise both path and
 173                                  fragment_base_url must be present. Here is
 174                                  the list of all potential fields:
 175                                  * "url" - fragment's URL
 176                                  * "path" - fragment's path relative to
 177                                             fragment_base_url
 178                                  * "duration" (optional, int or float)
 179                                  * "filesize" (optional, int)
 180                     * is_from_start  Is a live format that can be downloaded
 181                                 from the start. Boolean
 182                     * preference Order number of this format. If this field is
 183                                  present and not None, the formats get sorted
 184                                  by this field, regardless of all other values.
 185                                  -1 for default (order by other properties),
 186                                  -2 or smaller for less than default.
 187                                  < -1000 to hide the format (if there is
 188                                     another one which is strictly better)
 189                     * language   Language code, e.g. "de" or "en-US".
 190                     * language_preference  Is this in the language mentioned in
 191                                  the URL?
 192                                  10 if it's what the URL is about,
 193                                  -1 for default (don't know),
 194                                  -10 otherwise, other values reserved for now.
 195                     * quality    Order number of the video quality of this
 196                                  format, irrespective of the file format.
 197                                  -1 for default (order by other properties),
 198                                  -2 or smaller for less than default.
 199                     * source_preference  Order number for this video source
 200                                   (quality takes higher priority)
 201                                  -1 for default (order by other properties),
 202                                  -2 or smaller for less than default.
 203                     * http_headers  A dictionary of additional HTTP headers
 204                                  to add to the request.
 205                     * stretched_ratio  If given and not 1, indicates that the
 206                                  video's pixels are not square.
 207                                  width : height ratio as float.
 208                     * no_resume  The server does not support resuming the
 209                                  (HTTP or RTMP) download. Boolean.
 210                     * has_drm    The format has DRM and cannot be downloaded. Boolean
 211                     * downloader_options  A dictionary of downloader options
 212                                  (For internal use only)
 213                                  * http_chunk_size Chunk size for HTTP downloads
 214                                  * ffmpeg_args     Extra arguments for ffmpeg downloader
 215                     RTMP formats can also have the additional fields: page_url,
 216                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 217                     rtmp_protocol, rtmp_real_time
 218
 219     url:            Final video URL.
 220     ext:            Video filename extension.
 221     format:         The video format, defaults to ext (used for --get-format)
 222     player_url:     SWF Player URL (used for rtmpdump).
 223
 224     The following fields are optional:
 225
 226     direct:         True if a direct video file was given (must only be set by GenericIE)
 227     alt_title:      A secondary title of the video.
 228     display_id      An alternative identifier for the video, not necessarily
 229                     unique, but available before title. Typically, id is
 230                     something like "4234987", title "Dancing naked mole rats",
 231                     and display_id "dancing-naked-mole-rats"
 232     thumbnails:     A list of dictionaries, with the following entries:
 233                         * "id" (optional, string) - Thumbnail format ID
 234                         * "url"
 235                         * "preference" (optional, int) - quality of the image
 236                         * "width" (optional, int)
 237                         * "height" (optional, int)
 238                         * "resolution" (optional, string "{width}x{height}",
 239                                         deprecated)
 240                         * "filesize" (optional, int)
 241                         * "http_headers" (dict) - HTTP headers for the request
 242     thumbnail:      Full URL to a video thumbnail image.
 243     description:    Full video description.
 244     uploader:       Full name of the video uploader.
 245     license:        License name the video is licensed under.
 246     creator:        The creator of the video.
 247     timestamp:      UNIX timestamp of the moment the video was uploaded
 248     upload_date:    Video upload date in UTC (YYYYMMDD).
 249                     If not explicitly set, calculated from timestamp
 250     release_timestamp: UNIX timestamp of the moment the video was released.
 251                     If it is not clear whether to use timestamp or this, use the former
 252     release_date:   The date (YYYYMMDD) when the video was released in UTC.
 253                     If not explicitly set, calculated from release_timestamp
 254     modified_timestamp: UNIX timestamp of the moment the video was last modified.
 255     modified_date:   The date (YYYYMMDD) when the video was last modified in UTC.
 256                     If not explicitly set, calculated from modified_timestamp
 257     uploader_id:    Nickname or id of the video uploader.
 258     uploader_url:   Full URL to a personal webpage of the video uploader.
 259     channel:        Full name of the channel the video is uploaded on.
 260                     Note that channel fields may or may not repeat uploader
 261                     fields. This depends on a particular extractor.
 262     channel_id:     Id of the channel.
 263     channel_url:    Full URL to a channel webpage.
 264     channel_follower_count: Number of followers of the channel.
 265     location:       Physical location where the video was filmed.
 266     subtitles:      The available subtitles as a dictionary in the format
 267                     {tag: subformats}. "tag" is usually a language code, and
 268                     "subformats" is a list sorted from lower to higher
 269                     preference, each element is a dictionary with the "ext"
 270                     entry and one of:
 271                         * "data": The subtitles file contents
 272                         * "url": A URL pointing to the subtitles file
 273                     It can optionally also have:
 274                         * "name": Name or description of the subtitles
 275                         * "http_headers": A dictionary of additional HTTP headers
 276                                   to add to the request.
 277                     "ext" will be calculated from URL if missing
 278     automatic_captions: Like 'subtitles'; contains automatically generated
 279                     captions instead of normal subtitles
 280     duration:       Length of the video in seconds, as an integer or float.
 281     view_count:     How many users have watched the video on the platform.
 282     like_count:     Number of positive ratings of the video
 283     dislike_count:  Number of negative ratings of the video
 284     repost_count:   Number of reposts of the video
 285     average_rating: Average rating given by users, the scale used depends on the webpage
 286     comment_count:  Number of comments on the video
 287     comments:       A list of comments, each with one or more of the following
 288                     properties (all but one of text or html optional):
 289                         * "author" - human-readable name of the comment author
 290                         * "author_id" - user ID of the comment author
 291                         * "author_thumbnail" - The thumbnail of the comment author
 292                         * "id" - Comment ID
 293                         * "html" - Comment as HTML
 294                         * "text" - Plain text of the comment
 295                         * "timestamp" - UNIX timestamp of comment
 296                         * "parent" - ID of the comment this one is replying to.
 297                                      Set to "root" to indicate that this is a
 298                                      comment to the original video.
 299                         * "like_count" - Number of positive ratings of the comment
 300                         * "dislike_count" - Number of negative ratings of the comment
 301                         * "is_favorited" - Whether the comment is marked as
 302                                            favorite by the video uploader
 303                         * "author_is_uploader" - Whether the comment is made by
 304                                                  the video uploader
 305     age_limit:      Age restriction for the video, as an integer (years)
 306     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 307                     should allow to get the same result again. (It will be set
 308                     by YoutubeDL if it's missing)
 309     categories:     A list of categories that the video falls in, for example
 310                     ["Sports", "Berlin"]
 311     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 312     cast:           A list of the video cast
 313     is_live:        True, False, or None (=unknown). Whether this video is a
 314                     live stream that goes on instead of a fixed-length video.
 315     was_live:       True, False, or None (=unknown). Whether this video was
 316                     originally a live stream.
 317     live_status:    'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
 318                     If absent, automatically set from is_live, was_live
 319     start_time:     Time in seconds where the reproduction should start, as
 320                     specified in the URL.
 321     end_time:       Time in seconds where the reproduction should end, as
 322                     specified in the URL.
 323     chapters:       A list of dictionaries, with the following entries:
 324                         * "start_time" - The start time of the chapter in seconds
 325                         * "end_time" - The end time of the chapter in seconds
 326                         * "title" (optional, string)
 327     playable_in_embed: Whether this video is allowed to play in embedded
 328                     players on other sites. Can be True (=always allowed),
 329                     False (=never allowed), None (=unknown), or a string
 330                     specifying the criteria for embedability (Eg: 'whitelist')
 331     availability:   Under what condition the video is available. One of
 332                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 333                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 334                     to set it
 335     __post_extractor: A function to be called just before the metadata is
 336                     written to either disk, logger or console. The function
 337                     must return a dict which will be added to the info_dict.
 338                     This is useful for additional information that is
 339                     time-consuming to extract. Note that the fields thus
 340                     extracted will not be available to output template and
 341                     match_filter. So, only "comments" and "comment_count" are
 342                     currently allowed to be extracted via this method.
 343
 344     The following fields should only be used when the video belongs to some logical
 345     chapter or section:
 346
 347     chapter:        Name or title of the chapter the video belongs to.
 348     chapter_number: Number of the chapter the video belongs to, as an integer.
 349     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 350
 351     The following fields should only be used when the video is an episode of some
 352     series, programme or podcast:
 353
 354     series:         Title of the series or programme the video episode belongs to.
 355     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 356     season:         Title of the season the video episode belongs to.
 357     season_number:  Number of the season the video episode belongs to, as an integer.
 358     season_id:      Id of the season the video episode belongs to, as a unicode string.
 359     episode:        Title of the video episode. Unlike mandatory video title field,
 360                     this field should denote the exact title of the video episode
 361                     without any kind of decoration.
 362     episode_number: Number of the video episode within a season, as an integer.
 363     episode_id:     Id of the video episode, as a unicode string.
 364
 365     The following fields should only be used when the media is a track or a part of
 366     a music album:
 367
 368     track:          Title of the track.
 369     track_number:   Number of the track within an album or a disc, as an integer.
 370     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 371                     as a unicode string.
 372     artist:         Artist(s) of the track.
 373     genre:          Genre(s) of the track.
 374     album:          Title of the album the track belongs to.
 375     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 376     album_artist:   List of all artists appeared on the album (e.g.
 377                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 378                     and compilations).
 379     disc_number:    Number of the disc or other physical medium the track belongs to,
 380                     as an integer.
 381     release_year:   Year (YYYY) when the album was released.
 382     composer:       Composer of the piece
 383
 384     Unless mentioned otherwise, the fields should be Unicode strings.
 385
 386     Unless mentioned otherwise, None is equivalent to absence of information.
 387
 388
 389     _type "playlist" indicates multiple videos.
 390     There must be a key "entries", which is a list, an iterable, or a PagedList
 391     object, each element of which is a valid dictionary by this specification.
 392
 393     Additionally, playlists can have "id", "title", and any other relevant
 394     attributes with the same semantics as videos (see above).
 395
 396     It can also have the following optional fields:
 397
 398     playlist_count: The total number of videos in a playlist. If not given,
 399                     YoutubeDL tries to calculate it from "entries"
 400
 401
 402     _type "multi_video" indicates that there are multiple videos that
 403     form a single show, for example multiple acts of an opera or TV episode.
 404     It must have an entries key like a playlist and contain all the keys
 405     required for a video at the same time.
 406
 407
 408     _type "url" indicates that the video must be extracted from another
 409     location, possibly by a different extractor. Its only required key is:
 410     "url" - the next URL to extract.
 411     The key "ie_key" can be set to the class name (minus the trailing "IE",
 412     e.g. "Youtube") if the extractor class is known in advance.
 413     Additionally, the dictionary may have any properties of the resolved entity
 414     known in advance, for example "title" if the title of the referred video is
 415     known ahead of time.
 416
 417
 418     _type "url_transparent" entities have the same specification as "url", but
 419     indicate that the given additional information is more precise than the one
 420     associated with the resolved URL.
 421     This is useful when a site employs a video service that hosts the video and
 422     its technical metadata, but that video service does not embed a useful
 423     title, description etc.
 424
 425
 426     Subclasses of this should define a _VALID_URL regexp and, re-define the
 427     _real_extract() and (optionally) _real_initialize() methods.
 428     Probably, they should also be added to the list of extractors.
 429
 430     Subclasses may also override suitable() if necessary, but ensure the function
 431     signature is preserved and that this function imports everything it needs
 432     (except other extractors), so that lazy_extractors works correctly.
 433
 434     To support username + password (or netrc) login, the extractor must define a
 435     _NETRC_MACHINE and re-define _perform_login(username, password) and
 436     (optionally) _initialize_pre_login() methods. The _perform_login method will
 437     be called between _initialize_pre_login and _real_initialize if credentials
 438     are passed by the user. In cases where it is necessary to have the login
 439     process as part of the extraction rather than initialization, _perform_login
 440     can be left undefined.
 441
 442     _GEO_BYPASS attribute may be set to False in order to disable
 443     geo restriction bypass mechanisms for a particular extractor.
 444     Though it won't disable explicit geo restriction bypass based on
 445     country code provided with geo_bypass_country.
 446
 447     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 448     countries for this extractor. One of these countries will be used by
 449     geo restriction bypass mechanism right away in order to bypass
 450     geo restriction, of course, if the mechanism is not disabled.
 451
 452     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 453     IP blocks in CIDR notation for this extractor. One of these IP blocks
 454     will be used by geo restriction bypass mechanism similarly
 455     to _GEO_COUNTRIES.
 456
 457     The _WORKING attribute should be set to False for broken IEs
 458     in order to warn the users and skip the tests.
 459     """
 460
    # Becomes True once initialize() has run the one-time setup
    # (pre-login hook, optional login, _real_initialize()).
    _ready = False
    # Downloader this extractor reports to; per the constructor docstring it
    # is a YoutubeDL instance supplied via __init__() or set_downloader().
    _downloader = None
    # Fake IP picked by _initialize_geo_bypass(); per that method's docstring
    # it is sent as the X-Forwarded-For header of subsequent requests.
    _x_forwarded_for_ip = None
    # Set to False to disable the geo restriction bypass mechanisms for an
    # extractor (an explicit user-supplied country/IP block is still honoured).
    _GEO_BYPASS = True
    # Optional list of presumably geo unrestricted countries.
    _GEO_COUNTRIES = None
    # Optional list of presumably geo unrestricted IP blocks in CIDR notation.
    _GEO_IP_BLOCKS = None
    # Set to False for broken extractors; exposed through working().
    _WORKING = True
    # Netrc machine name; a truthy value makes supports_login() return True.
    _NETRC_MACHINE = None
    # NOTE(review): presumably a human-readable description of the extractor.
    # initialize() suppresses its "login not supported" warning when this
    # (or _NETRC_MACHINE) is explicitly False.
    IE_DESC = None

    # User-facing hints on how to supply credentials, keyed by the kind of
    # authentication; the 'cookies' entry is appended to the warning emitted
    # by initialize().
    _LOGIN_HINTS = {
        'any': 'Use --cookies, --cookies-from-browser, --username and --password, or --netrc to provide account credentials',
        'cookies': (
            'Use --cookies-from-browser or --cookies for the authentication. '
            'See  https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl  for how to manually pass cookies'),
        'password': 'Use --username and --password, or --netrc to provide account credentials',
    }
 478
 479     def __init__(self, downloader=None):
 480         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 481         If a downloader is not passed during initialization,
 482         it must be set using "set_downloader()" before "extract()" is called"""
 483         self._ready = False
 484         self._x_forwarded_for_ip = None
 485         self._printed_messages = set()
 486         self.set_downloader(downloader)
 487
 488     @classmethod
 489     def _match_valid_url(cls, url):
 490         # This does not use has/getattr intentionally - we want to know whether
 491         # we have cached the regexp for *this* class, whereas getattr would also
 492         # match the superclass
 493         if '_VALID_URL_RE' not in cls.__dict__:
 494             if '_VALID_URL' not in cls.__dict__:
 495                 cls._VALID_URL = cls._make_valid_url()
 496             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 497         return cls._VALID_URL_RE.match(url)
 498
 499     @classmethod
 500     def suitable(cls, url):
 501         """Receives a URL and returns True if suitable for this IE."""
 502         # This function must import everything it needs (except other extractors),
 503         # so that lazy_extractors works correctly
 504         return cls._match_valid_url(url) is not None
 505
 506     @classmethod
 507     def _match_id(cls, url):
 508         return cls._match_valid_url(url).group('id')
 509
 510     @classmethod
 511     def get_temp_id(cls, url):
 512         try:
 513             return cls._match_id(url)
 514         except (IndexError, AttributeError):
 515             return None
 516
 517     @classmethod
 518     def working(cls):
 519         """Getter method for _WORKING."""
 520         return cls._WORKING
 521
 522     @classmethod
 523     def supports_login(cls):
 524         return bool(cls._NETRC_MACHINE)
 525
 526     def initialize(self):
 527         """Initializes an instance (authentication, etc)."""
 528         self._printed_messages = set()
 529         self._initialize_geo_bypass({
 530             'countries': self._GEO_COUNTRIES,
 531             'ip_blocks': self._GEO_IP_BLOCKS,
 532         })
 533         if not self._ready:
 534             self._initialize_pre_login()
 535             if self.supports_login():
 536                 username, password = self._get_login_info()
 537                 if username:
 538                     self._perform_login(username, password)
 539             elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
 540                 self.report_warning(f'Login with password is not supported for this website. {self._LOGIN_HINTS["cookies"]}')
 541             self._real_initialize()
 542             self._ready = True
 543
 544     def _initialize_geo_bypass(self, geo_bypass_context):
 545         """
 546         Initialize geo restriction bypass mechanism.
 547
 548         This method is used to initialize geo bypass mechanism based on faking
 549         X-Forwarded-For HTTP header. A random country from provided country list
 550         is selected and a random IP belonging to this country is generated. This
 551         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 552         HTTP requests.
 553
 554         This method will be used for initial geo bypass mechanism initialization
 555         during the instance initialization with _GEO_COUNTRIES and
 556         _GEO_IP_BLOCKS.
 557
 558         You may also manually call it from extractor's code if geo bypass
 559         information is not available beforehand (e.g. obtained during
 560         extraction) or due to some other reason. In this case you should pass
 561         this information in geo bypass context passed as first argument. It may
 562         contain following fields:
 563
 564         countries:  List of geo unrestricted countries (similar
 565                     to _GEO_COUNTRIES)
 566         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 567                     (similar to _GEO_IP_BLOCKS)
 568
 569         """
 570         if not self._x_forwarded_for_ip:
 571
 572             # Geo bypass mechanism is explicitly disabled by user
 573             if not self.get_param('geo_bypass', True):
 574                 return
 575
 576             if not geo_bypass_context:
 577                 geo_bypass_context = {}
 578
 579             # Backward compatibility: previously _initialize_geo_bypass
 580             # expected a list of countries, some 3rd party code may still use
 581             # it this way
 582             if isinstance(geo_bypass_context, (list, tuple)):
 583                 geo_bypass_context = {
 584                     'countries': geo_bypass_context,
 585                 }
 586
 587             # The whole point of geo bypass mechanism is to fake IP
 588             # as X-Forwarded-For HTTP header based on some IP block or
 589             # country code.
 590
 591             # Path 1: bypassing based on IP block in CIDR notation
 592
 593             # Explicit IP block specified by user, use it right away
 594             # regardless of whether extractor is geo bypassable or not
 595             ip_block = self.get_param('geo_bypass_ip_block', None)
 596
 597             # Otherwise use random IP block from geo bypass context but only
 598             # if extractor is known as geo bypassable
 599             if not ip_block:
 600                 ip_blocks = geo_bypass_context.get('ip_blocks')
 601                 if self._GEO_BYPASS and ip_blocks:
 602                     ip_block = random.choice(ip_blocks)
 603
 604             if ip_block:
 605                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 606                 self._downloader.write_debug(
 607                     '[debug] Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
 608                 return
 609
 610             # Path 2: bypassing based on country code
 611
 612             # Explicit country code specified by user, use it right away
 613             # regardless of whether extractor is geo bypassable or not
 614             country = self.get_param('geo_bypass_country', None)
 615
 616             # Otherwise use random country code from geo bypass context but
 617             # only if extractor is known as geo bypassable
 618             if not country:
 619                 countries = geo_bypass_context.get('countries')
 620                 if self._GEO_BYPASS and countries:
 621                     country = random.choice(countries)
 622
 623             if country:
 624                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 625                 self._downloader.write_debug(
 626                     f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
 627
    def extract(self, url):
        """
        Run the extraction for url and return the result of _real_extract.

        The extractor is initialized first. A None result from _real_extract
        is passed through unchanged. Otherwise the fake X-Forwarded-For IP in
        use (if any) is recorded in the result under '__x_forwarded_for_ip',
        and the 'live_chat' subtitle entry is removed when 'no-live-chat' is
        listed in the 'compat_opts' param.

        A GeoRestrictedError triggers at most one retry, and only if a fake
        IP could be set up from the countries carried by the error.

        Raises ExtractorError (or the same subclass that was raised) enriched
        with video id, extractor name and traceback. IncompleteRead, KeyError
        and StopIteration are converted into ExtractorError.
        """
        try:
            # Two attempts at most: the initial one plus one geo bypass retry
            for _ in range(2):
                try:
                    self.initialize()
                    self.write_debug('Extracting URL: %s' % url)
                    ie_result = self._real_extract(url)
                    if ie_result is None:
                        return None
                    if self._x_forwarded_for_ip:
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    subtitles = ie_result.get('subtitles')
                    if (subtitles and 'live_chat' in subtitles
                            and 'no-live-chat' in self.get_param('compat_opts', [])):
                        del subtitles['live_chat']
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except UnsupportedError:
            # Listed before ExtractorError so it is re-raised without rewrapping
            raise
        except ExtractorError as e:
            # Rebuild the same exception type with extractor context filled in
            kwargs = {
                'video_id': e.video_id or self.get_temp_id(url),
                'ie': self.IE_NAME,
                'tb': e.traceback or sys.exc_info()[2],
                'expected': e.expected,
                'cause': e.cause
            }
            if hasattr(e, 'countries'):
                kwargs['countries'] = e.countries
            raise type(e)(e.orig_msg, **kwargs)
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 666
 667     def __maybe_fake_ip_and_retry(self, countries):
 668         if (not self.get_param('geo_bypass_country', None)
 669                 and self._GEO_BYPASS
 670                 and self.get_param('geo_bypass', True)
 671                 and not self._x_forwarded_for_ip
 672                 and countries):
 673             country_code = random.choice(countries)
 674             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 675             if self._x_forwarded_for_ip:
 676                 self.report_warning(
 677                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 678                     % (self._x_forwarded_for_ip, country_code.upper()))
 679                 return True
 680         return False
 681
    def set_downloader(self, downloader):
        """
        Sets a YoutubeDL instance as the downloader for this IE.

        The object is stored as self._downloader; other methods of this class
        use it for params lookup, screen output and opening URLs.
        """
        self._downloader = downloader
 685
 686     def _initialize_pre_login(self):
 687         """ Intialization before login. Redefine in subclasses."""
 688         pass
 689
 690     def _perform_login(self, username, password):
 691         """ Login with username and password. Redefine in subclasses."""
 692         pass
 693
 694     def _real_initialize(self):
 695         """Real initialization process. Redefine in subclasses."""
 696         pass
 697
 698     def _real_extract(self, url):
 699         """Real extraction process. Redefine in subclasses."""
 700         raise NotImplementedError('This method must be implemented by subclasses')
 701
 702     @classmethod
 703     def ie_key(cls):
 704         """A string for getting the InfoExtractor with get_info_extractor"""
 705         return cls.__name__[:-2]
 706
 707     @property
 708     def IE_NAME(self):
 709         return compat_str(type(self).__name__[:-2])
 710
 711     @staticmethod
 712     def __can_accept_status_code(err, expected_status):
 713         assert isinstance(err, compat_urllib_error.HTTPError)
 714         if expected_status is None:
 715             return False
 716         elif callable(expected_status):
 717             return expected_status(err.code) is True
 718         else:
 719             return err.code in variadic(expected_status)
 720
 721     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
 722         """
 723         Return the response handle.
 724
 725         See _download_webpage docstring for arguments specification.
 726         """
 727         if not self._downloader._first_webpage_request:
 728             sleep_interval = self.get_param('sleep_interval_requests') or 0
 729             if sleep_interval > 0:
 730                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 731                 time.sleep(sleep_interval)
 732         else:
 733             self._downloader._first_webpage_request = False
 734
 735         if note is None:
 736             self.report_download_webpage(video_id)
 737         elif note is not False:
 738             if video_id is None:
 739                 self.to_screen(str(note))
 740             else:
 741                 self.to_screen(f'{video_id}: {note}')
 742
 743         # Some sites check X-Forwarded-For HTTP header in order to figure out
 744         # the origin of the client behind proxy. This allows bypassing geo
 745         # restriction by faking this header's value to IP that belongs to some
 746         # geo unrestricted country. We will do so once we encounter any
 747         # geo restriction error.
 748         if self._x_forwarded_for_ip:
 749             if 'X-Forwarded-For' not in headers:
 750                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 751
 752         if isinstance(url_or_request, compat_urllib_request.Request):
 753             url_or_request = update_Request(
 754                 url_or_request, data=data, headers=headers, query=query)
 755         else:
 756             if query:
 757                 url_or_request = update_url_query(url_or_request, query)
 758             if data is not None or headers:
 759                 url_or_request = sanitized_Request(url_or_request, data, headers)
 760         try:
 761             return self._downloader.urlopen(url_or_request)
 762         except network_exceptions as err:
 763             if isinstance(err, compat_urllib_error.HTTPError):
 764                 if self.__can_accept_status_code(err, expected_status):
 765                     # Retain reference to error to prevent file object from
 766                     # being closed before it can be read. Works around the
 767                     # effects of <https://bugs.python.org/issue15002>
 768                     # introduced in Python 3.4.1.
 769                     err.fp._error = err
 770                     return err.fp
 771
 772             if errnote is False:
 773                 return False
 774             if errnote is None:
 775                 errnote = 'Unable to download webpage'
 776
 777             errmsg = f'{errnote}: {error_to_compat_str(err)}'
 778             if fatal:
 779                 raise ExtractorError(errmsg, cause=err)
 780             else:
 781                 self.report_warning(errmsg)
 782                 return False
 783
 784     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
 785         """
 786         Return a tuple (page content as string, URL handle).
 787
 788         See _download_webpage docstring for arguments specification.
 789         """
 790         # Strip hashes from the URL (#1038)
 791         if isinstance(url_or_request, (compat_str, str)):
 792             url_or_request = url_or_request.partition('#')[0]
 793
 794         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 795         if urlh is False:
 796             assert not fatal
 797             return False
 798         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 799         return (content, urlh)
 800
 801     @staticmethod
 802     def _guess_encoding_from_content(content_type, webpage_bytes):
 803         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 804         if m:
 805             encoding = m.group(1)
 806         else:
 807             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 808                           webpage_bytes[:1024])
 809             if m:
 810                 encoding = m.group(1).decode('ascii')
 811             elif webpage_bytes.startswith(b'\xff\xfe'):
 812                 encoding = 'utf-16'
 813             else:
 814                 encoding = 'utf-8'
 815
 816         return encoding
 817
 818     def __check_blocked(self, content):
 819         first_block = content[:512]
 820         if ('<title>Access to this site is blocked</title>' in content
 821                 and 'Websense' in first_block):
 822             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 823             blocked_iframe = self._html_search_regex(
 824                 r'<iframe src="([^"]+)"', content,
 825                 'Websense information URL', default=None)
 826             if blocked_iframe:
 827                 msg += ' Visit %s for more details' % blocked_iframe
 828             raise ExtractorError(msg, expected=True)
 829         if '<title>The URL you requested has been blocked</title>' in first_block:
 830             msg = (
 831                 'Access to this webpage has been blocked by Indian censorship. '
 832                 'Use a VPN or proxy server (with --proxy) to route around it.')
 833             block_msg = self._html_search_regex(
 834                 r'</h1><p>(.*?)</p>',
 835                 content, 'block message', default=None)
 836             if block_msg:
 837                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 838             raise ExtractorError(msg, expected=True)
 839         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 840                 and 'blocklist.rkn.gov.ru' in content):
 841             raise ExtractorError(
 842                 'Access to this webpage has been blocked by decision of the Russian government. '
 843                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 844                 expected=True)
 845
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        """
        Read the body of response handle urlh and return it decoded as a string.

        urlh -- response handle; its headers, read() and geturl() are used
        video_id -- used to build the dump file name
        prefix -- bytes to prepend to the body that is read
        encoding -- encoding to decode with; guessed from the Content-Type
            header and the body when not given

        url_or_request, note, errnote and fatal are accepted but not used in
        this method's body.

        With the 'dump_intermediate_pages' param the body is printed to screen
        base64-encoded; with 'write_pages' it is written to a '.dump' file in
        the current directory. Raises ExtractorError if the decoded page is
        recognized as a block page.
        """
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self.get_param('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self.get_param('write_pages', False):
            basen = f'{video_id}_{urlh.geturl()}'
            trim_length = self.get_param('trim_file_name') or 240
            # Over-long names are cut and suffixed with an MD5 of the full
            # name so that different URLs still map to different files
            if len(basen) > trim_length:
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:trim_length - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # Unknown encoding name: decode as UTF-8 instead
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content
 883
 884     def _download_webpage(
 885             self, url_or_request, video_id, note=None, errnote=None,
 886             fatal=True, tries=1, timeout=5, encoding=None, data=None,
 887             headers={}, query={}, expected_status=None):
 888         """
 889         Return the data of the page as a string.
 890
 891         Arguments:
 892         url_or_request -- plain text URL as a string or
 893             a compat_urllib_request.Requestobject
 894         video_id -- Video/playlist/item identifier (string)
 895
 896         Keyword arguments:
 897         note -- note printed before downloading (string)
 898         errnote -- note printed in case of an error (string)
 899         fatal -- flag denoting whether error should be considered fatal,
 900             i.e. whether it should cause ExtractionError to be raised,
 901             otherwise a warning will be reported and extraction continued
 902         tries -- number of tries
 903         timeout -- sleep interval between tries
 904         encoding -- encoding for a page content decoding, guessed automatically
 905             when not explicitly specified
 906         data -- POST data (bytes)
 907         headers -- HTTP headers (dict)
 908         query -- URL query (dict)
 909         expected_status -- allows to accept failed HTTP requests (non 2xx
 910             status code) by explicitly specifying a set of accepted status
 911             codes. Can be any of the following entities:
 912                 - an integer type specifying an exact failed status code to
 913                   accept
 914                 - a list or a tuple of integer types specifying a list of
 915                   failed status codes to accept
 916                 - a callable accepting an actual failed status code and
 917                   returning True if it should be accepted
 918             Note that this argument does not affect success status codes (2xx)
 919             which are always accepted.
 920         """
 921
 922         success = False
 923         try_count = 0
 924         while success is False:
 925             try:
 926                 res = self._download_webpage_handle(
 927                     url_or_request, video_id, note, errnote, fatal,
 928                     encoding=encoding, data=data, headers=headers, query=query,
 929                     expected_status=expected_status)
 930                 success = True
 931             except compat_http_client.IncompleteRead as e:
 932                 try_count += 1
 933                 if try_count >= tries:
 934                     raise e
 935                 self._sleep(timeout, video_id)
 936         if res is False:
 937             return res
 938         else:
 939             content, _ = res
 940             return content
 941
 942     def _download_xml_handle(
 943             self, url_or_request, video_id, note='Downloading XML',
 944             errnote='Unable to download XML', transform_source=None,
 945             fatal=True, encoding=None, data=None, headers={}, query={},
 946             expected_status=None):
 947         """
 948         Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle).
 949
 950         See _download_webpage docstring for arguments specification.
 951         """
 952         res = self._download_webpage_handle(
 953             url_or_request, video_id, note, errnote, fatal=fatal,
 954             encoding=encoding, data=data, headers=headers, query=query,
 955             expected_status=expected_status)
 956         if res is False:
 957             return res
 958         xml_string, urlh = res
 959         return self._parse_xml(
 960             xml_string, video_id, transform_source=transform_source,
 961             fatal=fatal), urlh
 962
 963     def _download_xml(
 964             self, url_or_request, video_id,
 965             note='Downloading XML', errnote='Unable to download XML',
 966             transform_source=None, fatal=True, encoding=None,
 967             data=None, headers={}, query={}, expected_status=None):
 968         """
 969         Return the xml as an xml.etree.ElementTree.Element.
 970
 971         See _download_webpage docstring for arguments specification.
 972         """
 973         res = self._download_xml_handle(
 974             url_or_request, video_id, note=note, errnote=errnote,
 975             transform_source=transform_source, fatal=fatal, encoding=encoding,
 976             data=data, headers=headers, query=query,
 977             expected_status=expected_status)
 978         return res if res is False else res[0]
 979
 980     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 981         if transform_source:
 982             xml_string = transform_source(xml_string)
 983         try:
 984             return compat_etree_fromstring(xml_string.encode('utf-8'))
 985         except xml.etree.ElementTree.ParseError as ve:
 986             errmsg = '%s: Failed to parse XML ' % video_id
 987             if fatal:
 988                 raise ExtractorError(errmsg, cause=ve)
 989             else:
 990                 self.report_warning(errmsg + str(ve))
 991
 992     def _download_json_handle(
 993             self, url_or_request, video_id, note='Downloading JSON metadata',
 994             errnote='Unable to download JSON metadata', transform_source=None,
 995             fatal=True, encoding=None, data=None, headers={}, query={},
 996             expected_status=None):
 997         """
 998         Return a tuple (JSON object, URL handle).
 999
1000         See _download_webpage docstring for arguments specification.
1001         """
1002         res = self._download_webpage_handle(
1003             url_or_request, video_id, note, errnote, fatal=fatal,
1004             encoding=encoding, data=data, headers=headers, query=query,
1005             expected_status=expected_status)
1006         if res is False:
1007             return res
1008         json_string, urlh = res
1009         return self._parse_json(
1010             json_string, video_id, transform_source=transform_source,
1011             fatal=fatal), urlh
1012
1013     def _download_json(
1014             self, url_or_request, video_id, note='Downloading JSON metadata',
1015             errnote='Unable to download JSON metadata', transform_source=None,
1016             fatal=True, encoding=None, data=None, headers={}, query={},
1017             expected_status=None):
1018         """
1019         Return the JSON object as a dict.
1020
1021         See _download_webpage docstring for arguments specification.
1022         """
1023         res = self._download_json_handle(
1024             url_or_request, video_id, note=note, errnote=errnote,
1025             transform_source=transform_source, fatal=fatal, encoding=encoding,
1026             data=data, headers=headers, query=query,
1027             expected_status=expected_status)
1028         return res if res is False else res[0]
1029
1030     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
1031         if transform_source:
1032             json_string = transform_source(json_string)
1033         try:
1034             return json.loads(json_string, strict=False)
1035         except ValueError as ve:
1036             errmsg = '%s: Failed to parse JSON ' % video_id
1037             if fatal:
1038                 raise ExtractorError(errmsg, cause=ve)
1039             else:
1040                 self.report_warning(errmsg + str(ve))
1041
1042     def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
1043         return self._parse_json(
1044             data[data.find('{'):data.rfind('}') + 1],
1045             video_id, transform_source, fatal)
1046
1047     def _download_socket_json_handle(
1048             self, url_or_request, video_id, note='Polling socket',
1049             errnote='Unable to poll socket', transform_source=None,
1050             fatal=True, encoding=None, data=None, headers={}, query={},
1051             expected_status=None):
1052         """
1053         Return a tuple (JSON object, URL handle).
1054
1055         See _download_webpage docstring for arguments specification.
1056         """
1057         res = self._download_webpage_handle(
1058             url_or_request, video_id, note, errnote, fatal=fatal,
1059             encoding=encoding, data=data, headers=headers, query=query,
1060             expected_status=expected_status)
1061         if res is False:
1062             return res
1063         webpage, urlh = res
1064         return self._parse_socket_response_as_json(
1065             webpage, video_id, transform_source=transform_source,
1066             fatal=fatal), urlh
1067
1068     def _download_socket_json(
1069             self, url_or_request, video_id, note='Polling socket',
1070             errnote='Unable to poll socket', transform_source=None,
1071             fatal=True, encoding=None, data=None, headers={}, query={},
1072             expected_status=None):
1073         """
1074         Return the JSON object as a dict.
1075
1076         See _download_webpage docstring for arguments specification.
1077         """
1078         res = self._download_socket_json_handle(
1079             url_or_request, video_id, note=note, errnote=errnote,
1080             transform_source=transform_source, fatal=fatal, encoding=encoding,
1081             data=data, headers=headers, query=query,
1082             expected_status=expected_status)
1083         return res if res is False else res[0]
1084
1085     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1086         idstr = format_field(video_id, template='%s: ')
1087         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1088         if only_once:
1089             if f'WARNING: {msg}' in self._printed_messages:
1090                 return
1091             self._printed_messages.add(f'WARNING: {msg}')
1092         self._downloader.report_warning(msg, *args, **kwargs)
1093
1094     def to_screen(self, msg, *args, **kwargs):
1095         """Print msg to screen, prefixing it with '[ie_name]'"""
1096         self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1097
1098     def write_debug(self, msg, *args, **kwargs):
1099         self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1100
1101     def get_param(self, name, default=None, *args, **kwargs):
1102         if self._downloader:
1103             return self._downloader.params.get(name, default, *args, **kwargs)
1104         return default
1105
1106     def report_drm(self, video_id, partial=False):
1107         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1108
1109     def report_extraction(self, id_or_name):
1110         """Report information extraction."""
1111         self.to_screen('%s: Extracting information' % id_or_name)
1112
1113     def report_download_webpage(self, video_id):
1114         """Report webpage download."""
1115         self.to_screen('%s: Downloading webpage' % video_id)
1116
1117     def report_age_confirmation(self):
1118         """Report attempt to confirm age."""
1119         self.to_screen('Confirming age')
1120
1121     def report_login(self):
1122         """Report attempt to log in."""
1123         self.to_screen('Logging in')
1124
1125     def raise_login_required(
1126             self, msg='This video is only available for registered users',
1127             metadata_available=False, method=NO_DEFAULT):
1128         if metadata_available and (
1129                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1130             self.report_warning(msg)
1131             return
1132         if method is NO_DEFAULT:
1133             method = 'any' if self.supports_login() else 'cookies'
1134         if method is not None:
1135             assert method in self._LOGIN_HINTS, 'Invalid login method'
1136             msg = f'{msg}. {self._LOGIN_HINTS[method]}'
1137         raise ExtractorError(msg, expected=True)
1138
1139     def raise_geo_restricted(
1140             self, msg='This video is not available from your location due to geo restriction',
1141             countries=None, metadata_available=False):
1142         if metadata_available and (
1143                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1144             self.report_warning(msg)
1145         else:
1146             raise GeoRestrictedError(msg, countries=countries)
1147
1148     def raise_no_formats(self, msg, expected=False, video_id=None):
1149         if expected and (
1150                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1151             self.report_warning(msg, video_id)
1152         elif isinstance(msg, ExtractorError):
1153             raise msg
1154         else:
1155             raise ExtractorError(msg, expected=expected, video_id=video_id)
1156
1157     # Methods for following #608
1158     @staticmethod
1159     def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1160         """Returns a URL that points to a page that should be processed"""
1161         if ie is not None:
1162             kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1163         if video_id is not None:
1164             kwargs['id'] = video_id
1165         if video_title is not None:
1166             kwargs['title'] = video_title
1167         return {
1168             **kwargs,
1169             '_type': 'url_transparent' if url_transparent else 'url',
1170             'url': url,
1171         }
1172
1173     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None, video_kwargs=None, **kwargs):
1174         urls = (self.url_result(self._proto_relative_url(m), ie, **(video_kwargs or {}))
1175                 for m in orderedSet(map(getter, matches) if getter else matches))
1176         return self.playlist_result(urls, playlist_id, playlist_title, **kwargs)
1177
1178     @staticmethod
1179     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1180         """Returns a playlist"""
1181         if playlist_id:
1182             kwargs['id'] = playlist_id
1183         if playlist_title:
1184             kwargs['title'] = playlist_title
1185         if playlist_description is not None:
1186             kwargs['description'] = playlist_description
1187         return {
1188             **kwargs,
1189             '_type': 'multi_video' if multi_video else 'playlist',
1190             'entries': entries,
1191         }
1192
1193     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1194         """
1195         Perform a regex search on the given string, using a single or a list of
1196         patterns returning the first matching group.
1197         In case of failure return a default value or raise a WARNING or a
1198         RegexNotFoundError, depending on fatal, specifying the field name.
1199         """
1200         if string is None:
1201             mobj = None
1202         elif isinstance(pattern, (str, re.Pattern)):
1203             mobj = re.search(pattern, string, flags)
1204         else:
1205             for p in pattern:
1206                 mobj = re.search(p, string, flags)
1207                 if mobj:
1208                     break
1209
1210         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1211
1212         if mobj:
1213             if group is None:
1214                 # return the first matching group
1215                 return next(g for g in mobj.groups() if g is not None)
1216             elif isinstance(group, (list, tuple)):
1217                 return tuple(mobj.group(g) for g in group)
1218             else:
1219                 return mobj.group(group)
1220         elif default is not NO_DEFAULT:
1221             return default
1222         elif fatal:
1223             raise RegexNotFoundError('Unable to extract %s' % _name)
1224         else:
1225             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1226             return None
1227
1228     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1229         """
1230         Like _search_regex, but strips HTML tags and unescapes entities.
1231         """
1232         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1233         if res:
1234             return clean_html(res).strip()
1235         else:
1236             return res
1237
1238     def _get_netrc_login_info(self, netrc_machine=None):
1239         username = None
1240         password = None
1241         netrc_machine = netrc_machine or self._NETRC_MACHINE
1242
1243         if self.get_param('usenetrc', False):
1244             try:
1245                 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1246                 if os.path.isdir(netrc_file):
1247                     netrc_file = os.path.join(netrc_file, '.netrc')
1248                 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1249                 if info is not None:
1250                     username = info[0]
1251                     password = info[2]
1252                 else:
1253                     raise netrc.NetrcParseError(
1254                         'No authenticators for %s' % netrc_machine)
1255             except (OSError, netrc.NetrcParseError) as err:
1256                 self.report_warning(
1257                     'parsing .netrc: %s' % error_to_compat_str(err))
1258
1259         return username, password
1260
1261     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1262         """
1263         Get the login info as (username, password)
1264         First look for the manually specified credentials using username_option
1265         and password_option as keys in params dictionary. If no such credentials
1266         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1267         value.
1268         If there's no info available, return (None, None)
1269         """
1270
1271         # Attempt to use provided username and password or .netrc data
1272         username = self.get_param(username_option)
1273         if username is not None:
1274             password = self.get_param(password_option)
1275         else:
1276             username, password = self._get_netrc_login_info(netrc_machine)
1277
1278         return username, password
1279
1280     def _get_tfa_info(self, note='two-factor verification code'):
1281         """
1282         Get the two-factor authentication info
1283         TODO - asking the user will be required for sms/phone verify
1284         currently just uses the command line option
1285         If there's no info available, return None
1286         """
1287
1288         tfa = self.get_param('twofactor')
1289         if tfa is not None:
1290             return tfa
1291
1292         return compat_getpass('Type %s and press [Return]: ' % note)
1293
1294     # Helper functions for extracting OpenGraph info
1295     @staticmethod
1296     def _og_regexes(prop):
1297         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1298         property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1299                        % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
1300         template = r'<meta[^>]+?%s[^>]+?%s'
1301         return [
1302             template % (property_re, content_re),
1303             template % (content_re, property_re),
1304         ]
1305
1306     @staticmethod
1307     def _meta_regex(prop):
1308         return r'''(?isx)<meta
1309                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1310                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1311
1312     def _og_search_property(self, prop, html, name=None, **kargs):
1313         prop = variadic(prop)
1314         if name is None:
1315             name = 'OpenGraph %s' % prop[0]
1316         og_regexes = []
1317         for p in prop:
1318             og_regexes.extend(self._og_regexes(p))
1319         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1320         if escaped is None:
1321             return None
1322         return unescapeHTML(escaped)
1323
1324     def _og_search_thumbnail(self, html, **kargs):
1325         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1326
1327     def _og_search_description(self, html, **kargs):
1328         return self._og_search_property('description', html, fatal=False, **kargs)
1329
1330     def _og_search_title(self, html, *, fatal=False, **kargs):
1331         return self._og_search_property('title', html, fatal=fatal, **kargs)
1332
1333     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1334         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1335         if secure:
1336             regexes = self._og_regexes('video:secure_url') + regexes
1337         return self._html_search_regex(regexes, html, name, **kargs)
1338
1339     def _og_search_url(self, html, **kargs):
1340         return self._og_search_property('url', html, **kargs)
1341
1342     def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1343         return self._html_search_regex(r'(?s)<title>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1344
1345     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1346         name = variadic(name)
1347         if display_name is None:
1348             display_name = name[0]
1349         return self._html_search_regex(
1350             [self._meta_regex(n) for n in name],
1351             html, display_name, fatal=fatal, group='content', **kwargs)
1352
1353     def _dc_search_uploader(self, html):
1354         return self._html_search_meta('dc.creator', html, 'uploader')
1355
1356     def _rta_search(self, html):
1357         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1358         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1359                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1360                      html):
1361             return 18
1362         return 0
1363
1364     def _media_rating_search(self, html):
1365         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1366         rating = self._html_search_meta('rating', html)
1367
1368         if not rating:
1369             return None
1370
1371         RATING_TABLE = {
1372             'safe for kids': 0,
1373             'general': 8,
1374             '14 years': 14,
1375             'mature': 17,
1376             'restricted': 19,
1377         }
1378         return RATING_TABLE.get(rating.lower())
1379
1380     def _family_friendly_search(self, html):
1381         # See http://schema.org/VideoObject
1382         family_friendly = self._html_search_meta(
1383             'isFamilyFriendly', html, default=None)
1384
1385         if not family_friendly:
1386             return None
1387
1388         RATING_TABLE = {
1389             '1': 0,
1390             'true': 0,
1391             '0': 18,
1392             'false': 18,
1393         }
1394         return RATING_TABLE.get(family_friendly.lower())
1395
1396     def _twitter_search_player(self, html):
1397         return self._html_search_meta('twitter:player', html,
1398                                       'twitter card player')
1399
1400     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1401         json_ld_list = list(re.finditer(JSON_LD_RE, html))
1402         default = kwargs.get('default', NO_DEFAULT)
1403         # JSON-LD may be malformed and thus `fatal` should be respected.
1404         # At the same time `default` may be passed that assumes `fatal=False`
1405         # for _search_regex. Let's simulate the same behavior here as well.
1406         fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
1407         json_ld = []
1408         for mobj in json_ld_list:
1409             json_ld_item = self._parse_json(
1410                 mobj.group('json_ld'), video_id, fatal=fatal)
1411             if not json_ld_item:
1412                 continue
1413             if isinstance(json_ld_item, dict):
1414                 json_ld.append(json_ld_item)
1415             elif isinstance(json_ld_item, (list, tuple)):
1416                 json_ld.extend(json_ld_item)
1417         if json_ld:
1418             json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1419         if json_ld:
1420             return json_ld
1421         if default is not NO_DEFAULT:
1422             return default
1423         elif fatal:
1424             raise RegexNotFoundError('Unable to extract JSON-LD')
1425         else:
1426             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1427             return {}
1428
1429     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1430         if isinstance(json_ld, compat_str):
1431             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1432         if not json_ld:
1433             return {}
1434         info = {}
1435         if not isinstance(json_ld, (list, tuple, dict)):
1436             return info
1437         if isinstance(json_ld, dict):
1438             json_ld = [json_ld]
1439
1440         INTERACTION_TYPE_MAP = {
1441             'CommentAction': 'comment',
1442             'AgreeAction': 'like',
1443             'DisagreeAction': 'dislike',
1444             'LikeAction': 'like',
1445             'DislikeAction': 'dislike',
1446             'ListenAction': 'view',
1447             'WatchAction': 'view',
1448             'ViewAction': 'view',
1449         }
1450
1451         def extract_interaction_type(e):
1452             interaction_type = e.get('interactionType')
1453             if isinstance(interaction_type, dict):
1454                 interaction_type = interaction_type.get('@type')
1455             return str_or_none(interaction_type)
1456
1457         def extract_interaction_statistic(e):
1458             interaction_statistic = e.get('interactionStatistic')
1459             if isinstance(interaction_statistic, dict):
1460                 interaction_statistic = [interaction_statistic]
1461             if not isinstance(interaction_statistic, list):
1462                 return
1463             for is_e in interaction_statistic:
1464                 if not isinstance(is_e, dict):
1465                     continue
1466                 if is_e.get('@type') != 'InteractionCounter':
1467                     continue
1468                 interaction_type = extract_interaction_type(is_e)
1469                 if not interaction_type:
1470                     continue
1471                 # For interaction count some sites provide string instead of
1472                 # an integer (as per spec) with non digit characters (e.g. ",")
1473                 # so extracting count with more relaxed str_to_int
1474                 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1475                 if interaction_count is None:
1476                     continue
1477                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1478                 if not count_kind:
1479                     continue
1480                 count_key = '%s_count' % count_kind
1481                 if info.get(count_key) is not None:
1482                     continue
1483                 info[count_key] = interaction_count
1484
1485         def extract_chapter_information(e):
1486             chapters = [{
1487                 'title': part.get('name'),
1488                 'start_time': part.get('startOffset'),
1489                 'end_time': part.get('endOffset'),
1490             } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
1491             for idx, (last_c, current_c, next_c) in enumerate(zip(
1492                     [{'end_time': 0}] + chapters, chapters, chapters[1:])):
1493                 current_c['end_time'] = current_c['end_time'] or next_c['start_time']
1494                 current_c['start_time'] = current_c['start_time'] or last_c['end_time']
1495                 if None in current_c.values():
1496                     self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
1497                     return
1498             if chapters:
1499                 chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
1500                 info['chapters'] = chapters
1501
1502         def extract_video_object(e):
1503             assert e['@type'] == 'VideoObject'
1504             author = e.get('author')
1505             info.update({
1506                 'url': url_or_none(e.get('contentUrl')),
1507                 'title': unescapeHTML(e.get('name')),
1508                 'description': unescapeHTML(e.get('description')),
1509                 'thumbnails': [{'url': url_or_none(url)}
1510                                for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))],
1511                 'duration': parse_duration(e.get('duration')),
1512                 'timestamp': unified_timestamp(e.get('uploadDate')),
1513                 # author can be an instance of 'Organization' or 'Person' types.
1514                 # both types can have 'name' property(inherited from 'Thing' type). [1]
1515                 # however some websites are using 'Text' type instead.
1516                 # 1. https://schema.org/VideoObject
1517                 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
1518                 'filesize': float_or_none(e.get('contentSize')),
1519                 'tbr': int_or_none(e.get('bitrate')),
1520                 'width': int_or_none(e.get('width')),
1521                 'height': int_or_none(e.get('height')),
1522                 'view_count': int_or_none(e.get('interactionCount')),
1523             })
1524             extract_interaction_statistic(e)
1525             extract_chapter_information(e)
1526
1527         def traverse_json_ld(json_ld, at_top_level=True):
1528             for e in json_ld:
1529                 if at_top_level and '@context' not in e:
1530                     continue
1531                 if at_top_level and set(e.keys()) == {'@context', '@graph'}:
1532                     traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
1533                     break
1534                 item_type = e.get('@type')
1535                 if expected_type is not None and expected_type != item_type:
1536                     continue
1537                 rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
1538                 if rating is not None:
1539                     info['average_rating'] = rating
1540                 if item_type in ('TVEpisode', 'Episode'):
1541                     episode_name = unescapeHTML(e.get('name'))
1542                     info.update({
1543                         'episode': episode_name,
1544                         'episode_number': int_or_none(e.get('episodeNumber')),
1545                         'description': unescapeHTML(e.get('description')),
1546                     })
1547                     if not info.get('title') and episode_name:
1548                         info['title'] = episode_name
1549                     part_of_season = e.get('partOfSeason')
1550                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1551                         info.update({
1552                             'season': unescapeHTML(part_of_season.get('name')),
1553                             'season_number': int_or_none(part_of_season.get('seasonNumber')),
1554                         })
1555                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1556                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1557                         info['series'] = unescapeHTML(part_of_series.get('name'))
1558                 elif item_type == 'Movie':
1559                     info.update({
1560                         'title': unescapeHTML(e.get('name')),
1561                         'description': unescapeHTML(e.get('description')),
1562                         'duration': parse_duration(e.get('duration')),
1563                         'timestamp': unified_timestamp(e.get('dateCreated')),
1564                     })
1565                 elif item_type in ('Article', 'NewsArticle'):
1566                     info.update({
1567                         'timestamp': parse_iso8601(e.get('datePublished')),
1568                         'title': unescapeHTML(e.get('headline')),
1569                         'description': unescapeHTML(e.get('articleBody') or e.get('description')),
1570                     })
1571                     if traverse_obj(e, ('video', 0, '@type')) == 'VideoObject':
1572                         extract_video_object(e['video'][0])
1573                 elif item_type == 'VideoObject':
1574                     extract_video_object(e)
1575                     if expected_type is None:
1576                         continue
1577                     else:
1578                         break
1579                 video = e.get('video')
1580                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1581                     extract_video_object(video)
1582                 if expected_type is None:
1583                     continue
1584                 else:
1585                     break
1586         traverse_json_ld(json_ld)
1587
1588         return filter_dict(info)
1589
1590     def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1591         return self._parse_json(
1592             self._search_regex(
1593                 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1594                 webpage, 'next.js data', fatal=fatal, **kw),
1595             video_id, transform_source=transform_source, fatal=fatal)
1596
1597     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__'):
1598         ''' Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function. '''
1599         # not all website do this, but it can be changed
1600         # https://stackoverflow.com/questions/67463109/how-to-change-or-hide-nuxt-and-nuxt-keyword-in-page-source
1601         rectx = re.escape(context_name)
1602         js, arg_keys, arg_vals = self._search_regex(
1603             (r'<script>window\.%s=\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.+?)\)\);?</script>' % rectx,
1604              r'%s\(.*?\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)' % rectx),
1605             webpage, context_name, group=['js', 'arg_keys', 'arg_vals'])
1606
1607         args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
1608
1609         for key, val in args.items():
1610             if val in ('undefined', 'void 0'):
1611                 args[key] = 'null'
1612
1613         return self._parse_json(js_to_json(js, args), video_id)['data'][0]
1614
1615     @staticmethod
1616     def _hidden_inputs(html):
1617         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1618         hidden_inputs = {}
1619         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1620             attrs = extract_attributes(input)
1621             if not input:
1622                 continue
1623             if attrs.get('type') not in ('hidden', 'submit'):
1624                 continue
1625             name = attrs.get('name') or attrs.get('id')
1626             value = attrs.get('value')
1627             if name and value is not None:
1628                 hidden_inputs[name] = value
1629         return hidden_inputs
1630
1631     def _form_hidden_inputs(self, form_id, html):
1632         form = self._search_regex(
1633             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1634             html, '%s form' % form_id, group='form')
1635         return self._hidden_inputs(form)
1636
    class FormatSort:
        """
        Parses format sort specifications and stores the resulting
        per-field settings (see evaluate_params).
        """
        # One sort item: an optional "+" (reverse), a field name and,
        # optionally, a separator ("~" requests the closest value, ":" does
        # not) followed by a limit. An empty item matches with no field
        regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

        # Sort order appended after the user's and the extractor's items in
        # evaluate_params; also the source of the forced/priority fields
        default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
                   'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
                   'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
        # NOTE(review): presumably the sort order used by youtube-dl; it is
        # not referenced in the part of the class visible here - confirm
        ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                        'height', 'width', 'proto', 'vext', 'abr', 'aext',
                        'fps', 'fs_approx', 'source', 'id')
1646
        # Per-field sort settings, read through _get_field_setting, which
        # supplies (and stores) defaults for missing keys:
        #   type        'field' unless given; 'alias' redirects to 'field',
        #               'combined' expands to the fields listed in 'field'
        #   field       defaults to 'preference' for type 'extractor', to
        #               (name,) for 'combined'/'multiple', else to the name
        #   convert     how a limit is interpreted by _resolve_field_value;
        #               defaults to 'order' for type 'ordered', else 'float_string'
        #   order       ranking used by the 'order' conversion ('order_free'
        #               replaces it when prefer_free_formats is set); the
        #               entries are regexes if 'regex' is set, and '' marks
        #               the rank of values that are not listed
        #   forced, priority   fields of `default` that evaluate_params
        #               places before the user's sort order
        #   same_limit  a 'combined' field applies one limit to all its
        #               fields instead of splitting it at ':'
        #   visible     whether print_verbose_info lists the field
        #   deprecated  using the alias triggers a deprecation warning
        # NOTE(review): this dict is a class attribute that is modified in
        # place by _get_field_setting, _resolve_field_value and
        # evaluate_params; no instance-level copy is made in the visible code
        settings = {
            'vcodec': {'type': 'ordered', 'regex': True,
                       'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
            'acodec': {'type': 'ordered', 'regex': True,
                       'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
            'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                    'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
            'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                      'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
            'vext': {'type': 'ordered', 'field': 'video_ext',
                     'order': ('mp4', 'webm', 'flv', '', 'none'),
                     'order_free': ('webm', 'mp4', 'flv', '', 'none')},
            'aext': {'type': 'ordered', 'field': 'audio_ext',
                     'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
                     'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
            'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
            'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                           'field': ('vcodec', 'acodec'),
                           'function': lambda it: int(any(v != 'none' for v in it))},
            'ie_pref': {'priority': True, 'type': 'extractor'},
            'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
            'quality': {'convert': 'float', 'default': -1},
            'filesize': {'convert': 'bytes'},
            'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
            'id': {'convert': 'string', 'field': 'format_id'},
            'height': {'convert': 'float_none'},
            'width': {'convert': 'float_none'},
            'fps': {'convert': 'float_none'},
            'tbr': {'convert': 'float_none'},
            'vbr': {'convert': 'float_none'},
            'abr': {'convert': 'float_none'},
            'asr': {'convert': 'float_none'},
            'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

            # Fields standing for several of the fields above
            'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
            'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
            'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
            'ext': {'type': 'combined', 'field': ('vext', 'aext')},
            'res': {'type': 'multiple', 'field': ('height', 'width'),
                    'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

            # For compatibility with youtube-dl
            'format_id': {'type': 'alias', 'field': 'id'},
            'preference': {'type': 'alias', 'field': 'ie_pref'},
            'language_preference': {'type': 'alias', 'field': 'lang'},
            'source_preference': {'type': 'alias', 'field': 'source'},
            'protocol': {'type': 'alias', 'field': 'proto'},
            'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},

            # Deprecated
            'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
            'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
            'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
            'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
            'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
            'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
            'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
            'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
            'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
            'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
            'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
            'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
            'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
            'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
            'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
            'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
            'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
            'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
            'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
            'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
        }
1720
1721         def __init__(self, ie, field_preference):
1722             self._order = []
1723             self.ydl = ie._downloader
1724             self.evaluate_params(self.ydl.params, field_preference)
1725             if ie.get_param('verbose'):
1726                 self.print_verbose_info(self.ydl.write_debug)
1727
1728         def _get_field_setting(self, field, key):
1729             if field not in self.settings:
1730                 if key in ('forced', 'priority'):
1731                     return False
1732                 self.ydl.deprecation_warning(
1733                     f'Using arbitrary fields ({field}) for format sorting is deprecated '
1734                     'and may be removed in a future version')
1735                 self.settings[field] = {}
1736             propObj = self.settings[field]
1737             if key not in propObj:
1738                 type = propObj.get('type')
1739                 if key == 'field':
1740                     default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1741                 elif key == 'convert':
1742                     default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1743                 else:
1744                     default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1745                 propObj[key] = default
1746             return propObj[key]
1747
1748         def _resolve_field_value(self, field, value, convertNone=False):
1749             if value is None:
1750                 if not convertNone:
1751                     return None
1752             else:
1753                 value = value.lower()
1754             conversion = self._get_field_setting(field, 'convert')
1755             if conversion == 'ignore':
1756                 return None
1757             if conversion == 'string':
1758                 return value
1759             elif conversion == 'float_none':
1760                 return float_or_none(value)
1761             elif conversion == 'bytes':
1762                 return FileDownloader.parse_bytes(value)
1763             elif conversion == 'order':
1764                 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1765                 use_regex = self._get_field_setting(field, 'regex')
1766                 list_length = len(order_list)
1767                 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1768                 if use_regex and value is not None:
1769                     for i, regex in enumerate(order_list):
1770                         if regex and re.match(regex, value):
1771                             return list_length - i
1772                     return list_length - empty_pos  # not in list
1773                 else:  # not regex or  value = None
1774                     return list_length - (order_list.index(value) if value in order_list else empty_pos)
1775             else:
1776                 if value.isnumeric():
1777                     return float(value)
1778                 else:
1779                     self.settings[field]['convert'] = 'string'
1780                     return value
1781
1782         def evaluate_params(self, params, sort_extractor):
1783             self._use_free_order = params.get('prefer_free_formats', False)
1784             self._sort_user = params.get('format_sort', [])
1785             self._sort_extractor = sort_extractor
1786
1787             def add_item(field, reverse, closest, limit_text):
1788                 field = field.lower()
1789                 if field in self._order:
1790                     return
1791                 self._order.append(field)
1792                 limit = self._resolve_field_value(field, limit_text)
1793                 data = {
1794                     'reverse': reverse,
1795                     'closest': False if limit is None else closest,
1796                     'limit_text': limit_text,
1797                     'limit': limit}
1798                 if field in self.settings:
1799                     self.settings[field].update(data)
1800                 else:
1801                     self.settings[field] = data
1802
1803             sort_list = (
1804                 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1805                 + (tuple() if params.get('format_sort_force', False)
1806                    else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1807                 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1808
1809             for item in sort_list:
1810                 match = re.match(self.regex, item)
1811                 if match is None:
1812                     raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1813                 field = match.group('field')
1814                 if field is None:
1815                     continue
1816                 if self._get_field_setting(field, 'type') == 'alias':
1817                     alias, field = field, self._get_field_setting(field, 'field')
1818                     if self._get_field_setting(alias, 'deprecated'):
1819                         self.ydl.deprecation_warning(
1820                             f'Format sorting alias {alias} is deprecated '
1821                             f'and may be removed in a future version. Please use {field} instead')
1822                 reverse = match.group('reverse') is not None
1823                 closest = match.group('separator') == '~'
1824                 limit_text = match.group('limit')
1825
1826                 has_limit = limit_text is not None
1827                 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1828                 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1829
1830                 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1831                 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
1832                 limit_count = len(limits)
1833                 for (i, f) in enumerate(fields):
1834                     add_item(f, reverse, closest,
1835                              limits[i] if i < limit_count
1836                              else limits[0] if has_limit and not has_multiple_limits
1837                              else None)
1838
1839         def print_verbose_info(self, write_debug):
1840             if self._sort_user:
1841                 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1842             if self._sort_extractor:
1843                 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1844             write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1845                 '+' if self._get_field_setting(field, 'reverse') else '', field,
1846                 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1847                               self._get_field_setting(field, 'limit_text'),
1848                               self._get_field_setting(field, 'limit'))
1849                 if self._get_field_setting(field, 'limit_text') is not None else '')
1850                 for field in self._order if self._get_field_setting(field, 'visible')]))
1851
1852         def _calculate_field_preference_from_value(self, format, field, type, value):
1853             reverse = self._get_field_setting(field, 'reverse')
1854             closest = self._get_field_setting(field, 'closest')
1855             limit = self._get_field_setting(field, 'limit')
1856
1857             if type == 'extractor':
1858                 maximum = self._get_field_setting(field, 'max')
1859                 if value is None or (maximum is not None and value >= maximum):
1860                     value = -1
1861             elif type == 'boolean':
1862                 in_list = self._get_field_setting(field, 'in_list')
1863                 not_in_list = self._get_field_setting(field, 'not_in_list')
1864                 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1865             elif type == 'ordered':
1866                 value = self._resolve_field_value(field, value, True)
1867
1868             # try to convert to number
1869             val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1870             is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1871             if is_num:
1872                 value = val_num
1873
1874             return ((-10, 0) if value is None
1875                     else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
1876                     else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1877                     else (0, value, 0) if not reverse and (limit is None or value <= limit)
1878                     else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1879                     else (-1, value, 0))
1880
1881         def _calculate_field_preference(self, format, field):
1882             type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
1883             get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1884             if type == 'multiple':
1885                 type = 'field'  # Only 'field' is allowed in multiple for now
1886                 actual_fields = self._get_field_setting(field, 'field')
1887
1888                 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1889             else:
1890                 value = get_value(field)
1891             return self._calculate_field_preference_from_value(format, field, type, value)
1892
1893         def calculate_preference(self, format):
1894             # Determine missing protocol
1895             if not format.get('protocol'):
1896                 format['protocol'] = determine_protocol(format)
1897
1898             # Determine missing ext
1899             if not format.get('ext') and 'url' in format:
1900                 format['ext'] = determine_ext(format['url'])
1901             if format.get('vcodec') == 'none':
1902                 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1903                 format['video_ext'] = 'none'
1904             else:
1905                 format['video_ext'] = format['ext']
1906                 format['audio_ext'] = 'none'
1907             # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
1908             #    format['preference'] = -1000
1909
1910             # Determine missing bitrates
1911             if format.get('tbr') is None:
1912                 if format.get('vbr') is not None and format.get('abr') is not None:
1913                     format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1914             else:
1915                 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1916                     format['vbr'] = format.get('tbr') - format.get('abr', 0)
1917                 if format.get('acodec') != 'none' and format.get('abr') is None:
1918                     format['abr'] = format.get('tbr') - format.get('vbr', 0)
1919
1920             return tuple(self._calculate_field_preference(format, field) for field in self._order)
1921
1922     def _sort_formats(self, formats, field_preference=[]):
1923         if not formats:
1924             return
1925         format_sort = self.FormatSort(self, field_preference)
1926         formats.sort(key=lambda f: format_sort.calculate_preference(f))
1927
1928     def _check_formats(self, formats, video_id):
1929         if formats:
1930             formats[:] = filter(
1931                 lambda f: self._is_valid_url(
1932                     f['url'], video_id,
1933                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1934                 formats)
1935
1936     @staticmethod
1937     def _remove_duplicate_formats(formats):
1938         format_urls = set()
1939         unique_formats = []
1940         for f in formats:
1941             if f['url'] not in format_urls:
1942                 format_urls.add(f['url'])
1943                 unique_formats.append(f)
1944         formats[:] = unique_formats
1945
1946     def _is_valid_url(self, url, video_id, item='video', headers={}):
1947         url = self._proto_relative_url(url, scheme='http:')
1948         # For now assume non HTTP(S) URLs always valid
1949         if not (url.startswith('http://') or url.startswith('https://')):
1950             return True
1951         try:
1952             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1953             return True
1954         except ExtractorError as e:
1955             self.to_screen(
1956                 '%s: %s URL is invalid, skipping: %s'
1957                 % (video_id, item, error_to_compat_str(e.cause)))
1958             return False
1959
1960     def http_scheme(self):
1961         """ Either "http:" or "https:", depending on the user's preferences """
1962         return (
1963             'http:'
1964             if self.get_param('prefer_insecure', False)
1965             else 'https:')
1966
1967     def _proto_relative_url(self, url, scheme=None):
1968         if url is None:
1969             return url
1970         if url.startswith('//'):
1971             if scheme is None:
1972                 scheme = self.http_scheme()
1973             return scheme + url
1974         else:
1975             return url
1976
1977     def _sleep(self, timeout, video_id, msg_template=None):
1978         if msg_template is None:
1979             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1980         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1981         self.to_screen(msg)
1982         time.sleep(timeout)
1983
1984     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1985                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1986                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1987         res = self._download_xml_handle(
1988             manifest_url, video_id, 'Downloading f4m manifest',
1989             'Unable to download f4m manifest',
1990             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1991             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1992             transform_source=transform_source,
1993             fatal=fatal, data=data, headers=headers, query=query)
1994         if res is False:
1995             return []
1996
1997         manifest, urlh = res
1998         manifest_url = urlh.geturl()
1999
2000         return self._parse_f4m_formats(
2001             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2002             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
2003
2004     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
2005                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
2006                            fatal=True, m3u8_id=None):
2007         if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
2008             return []
2009
2010         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
2011         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
2012         if akamai_pv is not None and ';' in akamai_pv.text:
2013             playerVerificationChallenge = akamai_pv.text.split(';')[0]
2014             if playerVerificationChallenge.strip() != '':
2015                 return []
2016
2017         formats = []
2018         manifest_version = '1.0'
2019         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
2020         if not media_nodes:
2021             manifest_version = '2.0'
2022             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
2023         # Remove unsupported DRM protected media from final formats
2024         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
2025         media_nodes = remove_encrypted_media(media_nodes)
2026         if not media_nodes:
2027             return formats
2028
2029         manifest_base_url = get_base_url(manifest)
2030
2031         bootstrap_info = xpath_element(
2032             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
2033             'bootstrap info', default=None)
2034
2035         vcodec = None
2036         mime_type = xpath_text(
2037             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
2038             'base URL', default=None)
2039         if mime_type and mime_type.startswith('audio/'):
2040             vcodec = 'none'
2041
2042         for i, media_el in enumerate(media_nodes):
2043             tbr = int_or_none(media_el.attrib.get('bitrate'))
2044             width = int_or_none(media_el.attrib.get('width'))
2045             height = int_or_none(media_el.attrib.get('height'))
2046             format_id = join_nonempty(f4m_id, tbr or i)
2047             # If <bootstrapInfo> is present, the specified f4m is a
2048             # stream-level manifest, and only set-level manifests may refer to
2049             # external resources.  See section 11.4 and section 4 of F4M spec
2050             if bootstrap_info is None:
2051                 media_url = None
2052                 # @href is introduced in 2.0, see section 11.6 of F4M spec
2053                 if manifest_version == '2.0':
2054                     media_url = media_el.attrib.get('href')
2055                 if media_url is None:
2056                     media_url = media_el.attrib.get('url')
2057                 if not media_url:
2058                     continue
2059                 manifest_url = (
2060                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
2061                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
2062                 # If media_url is itself a f4m manifest do the recursive extraction
2063                 # since bitrates in parent manifest (this one) and media_url manifest
2064                 # may differ leading to inability to resolve the format by requested
2065                 # bitrate in f4m downloader
2066                 ext = determine_ext(manifest_url)
2067                 if ext == 'f4m':
2068                     f4m_formats = self._extract_f4m_formats(
2069                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2070                         transform_source=transform_source, fatal=fatal)
2071                     # Sometimes stream-level manifest contains single media entry that
2072                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
2073                     # At the same time parent's media entry in set-level manifest may
2074                     # contain it. We will copy it from parent in such cases.
2075                     if len(f4m_formats) == 1:
2076                         f = f4m_formats[0]
2077                         f.update({
2078                             'tbr': f.get('tbr') or tbr,
2079                             'width': f.get('width') or width,
2080                             'height': f.get('height') or height,
2081                             'format_id': f.get('format_id') if not tbr else format_id,
2082                             'vcodec': vcodec,
2083                         })
2084                     formats.extend(f4m_formats)
2085                     continue
2086                 elif ext == 'm3u8':
2087                     formats.extend(self._extract_m3u8_formats(
2088                         manifest_url, video_id, 'mp4', preference=preference,
2089                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
2090                     continue
2091             formats.append({
2092                 'format_id': format_id,
2093                 'url': manifest_url,
2094                 'manifest_url': manifest_url,
2095                 'ext': 'flv' if bootstrap_info is not None else None,
2096                 'protocol': 'f4m',
2097                 'tbr': tbr,
2098                 'width': width,
2099                 'height': height,
2100                 'vcodec': vcodec,
2101                 'preference': preference,
2102                 'quality': quality,
2103             })
2104         return formats
2105
2106     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
2107         return {
2108             'format_id': join_nonempty(m3u8_id, 'meta'),
2109             'url': m3u8_url,
2110             'ext': ext,
2111             'protocol': 'm3u8',
2112             'preference': preference - 100 if preference else -100,
2113             'quality': quality,
2114             'resolution': 'multiple',
2115             'format_note': 'Quality selection URL',
2116         }
2117
2118     def _report_ignoring_subs(self, name):
2119         self.report_warning(bug_reports_message(
2120             f'Ignoring subtitle tracks found in the {name} manifest; '
2121             'if any subtitle tracks are missing,'
2122         ), only_once=True)
2123
2124     def _extract_m3u8_formats(self, *args, **kwargs):
2125         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2126         if subs:
2127             self._report_ignoring_subs('HLS')
2128         return fmts
2129
2130     def _extract_m3u8_formats_and_subtitles(
2131             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2132             preference=None, quality=None, m3u8_id=None, note=None,
2133             errnote=None, fatal=True, live=False, data=None, headers={},
2134             query={}):
2135
2136         res = self._download_webpage_handle(
2137             m3u8_url, video_id,
2138             note='Downloading m3u8 information' if note is None else note,
2139             errnote='Failed to download m3u8 information' if errnote is None else errnote,
2140             fatal=fatal, data=data, headers=headers, query=query)
2141
2142         if res is False:
2143             return [], {}
2144
2145         m3u8_doc, urlh = res
2146         m3u8_url = urlh.geturl()
2147
2148         return self._parse_m3u8_formats_and_subtitles(
2149             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2150             preference=preference, quality=quality, m3u8_id=m3u8_id,
2151             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2152             headers=headers, query=query, video_id=video_id)
2153
2154     def _parse_m3u8_formats_and_subtitles(
2155             self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
2156             preference=None, quality=None, m3u8_id=None, live=False, note=None,
2157             errnote=None, fatal=True, data=None, headers={}, query={},
2158             video_id=None):
2159         formats, subtitles = [], {}
2160
2161         has_drm = re.search('|'.join([
2162             r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
2163             r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
2164         ]), m3u8_doc)
2165
2166         def format_url(url):
2167             return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
2168
2169         if self.get_param('hls_split_discontinuity', False):
2170             def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2171                 if not m3u8_doc:
2172                     if not manifest_url:
2173                         return []
2174                     m3u8_doc = self._download_webpage(
2175                         manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2176                         note=False, errnote='Failed to download m3u8 playlist information')
2177                     if m3u8_doc is False:
2178                         return []
2179                 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2180
2181         else:
2182             def _extract_m3u8_playlist_indices(*args, **kwargs):
2183                 return [None]
2184
2185         # References:
2186         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2187         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2188         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2189
2190         # We should try extracting formats only from master playlists [1, 4.3.4],
2191         # i.e. playlists that describe available qualities. On the other hand
2192         # media playlists [1, 4.3.3] should be returned as is since they contain
2193         # just the media without qualities renditions.
2194         # Fortunately, master playlist can be easily distinguished from media
2195         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
2196         # master playlist tags MUST NOT appear in a media playlist and vice versa.
2197         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2198         # media playlist and MUST NOT appear in master playlist thus we can
2199         # clearly detect media playlist with this criterion.
2200
2201         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
2202             formats = [{
2203                 'format_id': join_nonempty(m3u8_id, idx),
2204                 'format_index': idx,
2205                 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
2206                 'ext': ext,
2207                 'protocol': entry_protocol,
2208                 'preference': preference,
2209                 'quality': quality,
2210                 'has_drm': has_drm,
2211             } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2212
2213             return formats, subtitles
2214
2215         groups = {}
2216         last_stream_inf = {}
2217
2218         def extract_media(x_media_line):
2219             media = parse_m3u8_attributes(x_media_line)
2220             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2221             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2222             if not (media_type and group_id and name):
2223                 return
2224             groups.setdefault(group_id, []).append(media)
2225             # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2226             if media_type == 'SUBTITLES':
2227                 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2228                 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2229                 # However, lack of URI has been spotted in the wild.
2230                 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2231                 if not media.get('URI'):
2232                     return
2233                 url = format_url(media['URI'])
2234                 sub_info = {
2235                     'url': url,
2236                     'ext': determine_ext(url),
2237                 }
2238                 if sub_info['ext'] == 'm3u8':
2239                     # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2240                     # files may contain is WebVTT:
2241                     # <https://tools.ietf.org/html/rfc8216#section-3.1>
2242                     sub_info['ext'] = 'vtt'
2243                     sub_info['protocol'] = 'm3u8_native'
2244                 lang = media.get('LANGUAGE') or 'und'
2245                 subtitles.setdefault(lang, []).append(sub_info)
2246             if media_type not in ('VIDEO', 'AUDIO'):
2247                 return
2248             media_url = media.get('URI')
2249             if media_url:
2250                 manifest_url = format_url(media_url)
2251                 formats.extend({
2252                     'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2253                     'format_note': name,
2254                     'format_index': idx,
2255                     'url': manifest_url,
2256                     'manifest_url': m3u8_url,
2257                     'language': media.get('LANGUAGE'),
2258                     'ext': ext,
2259                     'protocol': entry_protocol,
2260                     'preference': preference,
2261                     'quality': quality,
2262                     'vcodec': 'none' if media_type == 'AUDIO' else None,
2263                 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2264
2265         def build_stream_name():
2266             # Despite specification does not mention NAME attribute for
2267             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2268             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2269             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2270             stream_name = last_stream_inf.get('NAME')
2271             if stream_name:
2272                 return stream_name
2273             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2274             # from corresponding rendition group
2275             stream_group_id = last_stream_inf.get('VIDEO')
2276             if not stream_group_id:
2277                 return
2278             stream_group = groups.get(stream_group_id)
2279             if not stream_group:
2280                 return stream_group_id
2281             rendition = stream_group[0]
2282             return rendition.get('NAME') or stream_group_id
2283
2284         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2285         # chance to detect video only formats when EXT-X-STREAM-INF tags
2286         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2287         for line in m3u8_doc.splitlines():
2288             if line.startswith('#EXT-X-MEDIA:'):
2289                 extract_media(line)
2290
2291         for line in m3u8_doc.splitlines():
2292             if line.startswith('#EXT-X-STREAM-INF:'):
2293                 last_stream_inf = parse_m3u8_attributes(line)
2294             elif line.startswith('#') or not line.strip():
2295                 continue
2296             else:
2297                 tbr = float_or_none(
2298                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2299                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2300                 manifest_url = format_url(line.strip())
2301
2302                 for idx in _extract_m3u8_playlist_indices(manifest_url):
2303                     format_id = [m3u8_id, None, idx]
2304                     # Bandwidth of live streams may differ over time thus making
2305                     # format_id unpredictable. So it's better to keep provided
2306                     # format_id intact.
2307                     if not live:
2308                         stream_name = build_stream_name()
2309                         format_id[1] = stream_name or '%d' % (tbr or len(formats))
2310                     f = {
2311                         'format_id': join_nonempty(*format_id),
2312                         'format_index': idx,
2313                         'url': manifest_url,
2314                         'manifest_url': m3u8_url,
2315                         'tbr': tbr,
2316                         'ext': ext,
2317                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2318                         'protocol': entry_protocol,
2319                         'preference': preference,
2320                         'quality': quality,
2321                     }
2322                     resolution = last_stream_inf.get('RESOLUTION')
2323                     if resolution:
2324                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2325                         if mobj:
2326                             f['width'] = int(mobj.group('width'))
2327                             f['height'] = int(mobj.group('height'))
2328                     # Unified Streaming Platform
2329                     mobj = re.search(
2330                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2331                     if mobj:
2332                         abr, vbr = mobj.groups()
2333                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2334                         f.update({
2335                             'vbr': vbr,
2336                             'abr': abr,
2337                         })
2338                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2339                     f.update(codecs)
2340                     audio_group_id = last_stream_inf.get('AUDIO')
2341                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2342                     # references a rendition group MUST have a CODECS attribute.
2343                     # However, this is not always respected, for example, [2]
2344                     # contains EXT-X-STREAM-INF tag which references AUDIO
2345                     # rendition group but does not have CODECS and despite
2346                     # referencing an audio group it represents a complete
2347                     # (with audio and video) format. So, for such cases we will
2348                     # ignore references to rendition groups and treat them
2349                     # as complete formats.
2350                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2351                         audio_group = groups.get(audio_group_id)
2352                         if audio_group and audio_group[0].get('URI'):
2353                             # TODO: update acodec for audio only formats with
2354                             # the same GROUP-ID
2355                             f['acodec'] = 'none'
2356                     if not f.get('ext'):
2357                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2358                     formats.append(f)
2359
2360                     # for DailyMotion
2361                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2362                     if progressive_uri:
2363                         http_f = f.copy()
2364                         del http_f['manifest_url']
2365                         http_f.update({
2366                             'format_id': f['format_id'].replace('hls-', 'http-'),
2367                             'protocol': 'http',
2368                             'url': progressive_uri,
2369                         })
2370                         formats.append(http_f)
2371
2372                 last_stream_inf = {}
2373         return formats, subtitles
2374
2375     def _extract_m3u8_vod_duration(
2376             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2377
2378         m3u8_vod = self._download_webpage(
2379             m3u8_vod_url, video_id,
2380             note='Downloading m3u8 VOD manifest' if note is None else note,
2381             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2382             fatal=False, data=data, headers=headers, query=query)
2383
2384         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2385
2386     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2387         if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2388             return None
2389
2390         return int(sum(
2391             float(line[len('#EXTINF:'):].split(',')[0])
2392             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2393
2394     @staticmethod
2395     def _xpath_ns(path, namespace=None):
2396         if not namespace:
2397             return path
2398         out = []
2399         for c in path.split('/'):
2400             if not c or c == '.':
2401                 out.append(c)
2402             else:
2403                 out.append('{%s}%s' % (namespace, c))
2404         return '/'.join(out)
2405
2406     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2407         res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2408         if res is False:
2409             assert not fatal
2410             return [], {}
2411
2412         smil, urlh = res
2413         smil_url = urlh.geturl()
2414
2415         namespace = self._parse_smil_namespace(smil)
2416
2417         fmts = self._parse_smil_formats(
2418             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2419         subs = self._parse_smil_subtitles(
2420             smil, namespace=namespace)
2421
2422         return fmts, subs
2423
2424     def _extract_smil_formats(self, *args, **kwargs):
2425         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2426         if subs:
2427             self._report_ignoring_subs('SMIL')
2428         return fmts
2429
2430     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2431         res = self._download_smil(smil_url, video_id, fatal=fatal)
2432         if res is False:
2433             return {}
2434
2435         smil, urlh = res
2436         smil_url = urlh.geturl()
2437
2438         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2439
2440     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2441         return self._download_xml_handle(
2442             smil_url, video_id, 'Downloading SMIL file',
2443             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2444
2445     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2446         namespace = self._parse_smil_namespace(smil)
2447
2448         formats = self._parse_smil_formats(
2449             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2450         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2451
2452         video_id = os.path.splitext(url_basename(smil_url))[0]
2453         title = None
2454         description = None
2455         upload_date = None
2456         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2457             name = meta.attrib.get('name')
2458             content = meta.attrib.get('content')
2459             if not name or not content:
2460                 continue
2461             if not title and name == 'title':
2462                 title = content
2463             elif not description and name in ('description', 'abstract'):
2464                 description = content
2465             elif not upload_date and name == 'date':
2466                 upload_date = unified_strdate(content)
2467
2468         thumbnails = [{
2469             'id': image.get('type'),
2470             'url': image.get('src'),
2471             'width': int_or_none(image.get('width')),
2472             'height': int_or_none(image.get('height')),
2473         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2474
2475         return {
2476             'id': video_id,
2477             'title': title or video_id,
2478             'description': description,
2479             'upload_date': upload_date,
2480             'thumbnails': thumbnails,
2481             'formats': formats,
2482             'subtitles': subtitles,
2483         }
2484
2485     def _parse_smil_namespace(self, smil):
2486         return self._search_regex(
2487             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2488
2489     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2490         base = smil_url
2491         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2492             b = meta.get('base') or meta.get('httpBase')
2493             if b:
2494                 base = b
2495                 break
2496
2497         formats = []
2498         rtmp_count = 0
2499         http_count = 0
2500         m3u8_count = 0
2501         imgs_count = 0
2502
2503         srcs = set()
2504         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2505         for medium in media:
2506             src = medium.get('src')
2507             if not src or src in srcs:
2508                 continue
2509             srcs.add(src)
2510
2511             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2512             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2513             width = int_or_none(medium.get('width'))
2514             height = int_or_none(medium.get('height'))
2515             proto = medium.get('proto')
2516             ext = medium.get('ext')
2517             src_ext = determine_ext(src)
2518             streamer = medium.get('streamer') or base
2519
2520             if proto == 'rtmp' or streamer.startswith('rtmp'):
2521                 rtmp_count += 1
2522                 formats.append({
2523                     'url': streamer,
2524                     'play_path': src,
2525                     'ext': 'flv',
2526                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2527                     'tbr': bitrate,
2528                     'filesize': filesize,
2529                     'width': width,
2530                     'height': height,
2531                 })
2532                 if transform_rtmp_url:
2533                     streamer, src = transform_rtmp_url(streamer, src)
2534                     formats[-1].update({
2535                         'url': streamer,
2536                         'play_path': src,
2537                     })
2538                 continue
2539
2540             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2541             src_url = src_url.strip()
2542
2543             if proto == 'm3u8' or src_ext == 'm3u8':
2544                 m3u8_formats = self._extract_m3u8_formats(
2545                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2546                 if len(m3u8_formats) == 1:
2547                     m3u8_count += 1
2548                     m3u8_formats[0].update({
2549                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2550                         'tbr': bitrate,
2551                         'width': width,
2552                         'height': height,
2553                     })
2554                 formats.extend(m3u8_formats)
2555             elif src_ext == 'f4m':
2556                 f4m_url = src_url
2557                 if not f4m_params:
2558                     f4m_params = {
2559                         'hdcore': '3.2.0',
2560                         'plugin': 'flowplayer-3.2.0.1',
2561                     }
2562                 f4m_url += '&' if '?' in f4m_url else '?'
2563                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2564                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2565             elif src_ext == 'mpd':
2566                 formats.extend(self._extract_mpd_formats(
2567                     src_url, video_id, mpd_id='dash', fatal=False))
2568             elif re.search(r'\.ism/[Mm]anifest', src_url):
2569                 formats.extend(self._extract_ism_formats(
2570                     src_url, video_id, ism_id='mss', fatal=False))
2571             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2572                 http_count += 1
2573                 formats.append({
2574                     'url': src_url,
2575                     'ext': ext or src_ext or 'flv',
2576                     'format_id': 'http-%d' % (bitrate or http_count),
2577                     'tbr': bitrate,
2578                     'filesize': filesize,
2579                     'width': width,
2580                     'height': height,
2581                 })
2582
2583         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2584             src = medium.get('src')
2585             if not src or src in srcs:
2586                 continue
2587             srcs.add(src)
2588
2589             imgs_count += 1
2590             formats.append({
2591                 'format_id': 'imagestream-%d' % (imgs_count),
2592                 'url': src,
2593                 'ext': mimetype2ext(medium.get('type')),
2594                 'acodec': 'none',
2595                 'vcodec': 'none',
2596                 'width': int_or_none(medium.get('width')),
2597                 'height': int_or_none(medium.get('height')),
2598                 'format_note': 'SMIL storyboards',
2599             })
2600
2601         return formats
2602
2603     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2604         urls = []
2605         subtitles = {}
2606         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2607             src = textstream.get('src')
2608             if not src or src in urls:
2609                 continue
2610             urls.append(src)
2611             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2612             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2613             subtitles.setdefault(lang, []).append({
2614                 'url': src,
2615                 'ext': ext,
2616             })
2617         return subtitles
2618
2619     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2620         res = self._download_xml_handle(
2621             xspf_url, playlist_id, 'Downloading xpsf playlist',
2622             'Unable to download xspf manifest', fatal=fatal)
2623         if res is False:
2624             return []
2625
2626         xspf, urlh = res
2627         xspf_url = urlh.geturl()
2628
2629         return self._parse_xspf(
2630             xspf, playlist_id, xspf_url=xspf_url,
2631             xspf_base_url=base_url(xspf_url))
2632
2633     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2634         NS_MAP = {
2635             'xspf': 'http://xspf.org/ns/0/',
2636             's1': 'http://static.streamone.nl/player/ns/0',
2637         }
2638
2639         entries = []
2640         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2641             title = xpath_text(
2642                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2643             description = xpath_text(
2644                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2645             thumbnail = xpath_text(
2646                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2647             duration = float_or_none(
2648                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2649
2650             formats = []
2651             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2652                 format_url = urljoin(xspf_base_url, location.text)
2653                 if not format_url:
2654                     continue
2655                 formats.append({
2656                     'url': format_url,
2657                     'manifest_url': xspf_url,
2658                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2659                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2660                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2661                 })
2662             self._sort_formats(formats)
2663
2664             entries.append({
2665                 'id': playlist_id,
2666                 'title': title,
2667                 'description': description,
2668                 'thumbnail': thumbnail,
2669                 'duration': duration,
2670                 'formats': formats,
2671             })
2672         return entries
2673
2674     def _extract_mpd_formats(self, *args, **kwargs):
2675         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2676         if subs:
2677             self._report_ignoring_subs('DASH')
2678         return fmts
2679
2680     def _extract_mpd_formats_and_subtitles(
2681             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2682             fatal=True, data=None, headers={}, query={}):
2683         res = self._download_xml_handle(
2684             mpd_url, video_id,
2685             note='Downloading MPD manifest' if note is None else note,
2686             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2687             fatal=fatal, data=data, headers=headers, query=query)
2688         if res is False:
2689             return [], {}
2690         mpd_doc, urlh = res
2691         if mpd_doc is None:
2692             return [], {}
2693
2694         # We could have been redirected to a new url when we retrieved our mpd file.
2695         mpd_url = urlh.geturl()
2696         mpd_base_url = base_url(mpd_url)
2697
2698         return self._parse_mpd_formats_and_subtitles(
2699             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2700
2701     def _parse_mpd_formats(self, *args, **kwargs):
2702         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2703         if subs:
2704             self._report_ignoring_subs('DASH')
2705         return fmts
2706
2707     def _parse_mpd_formats_and_subtitles(
2708             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2709         """
2710         Parse formats from MPD manifest.
2711         References:
2712          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2713             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2714          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2715         """
2716         if not self.get_param('dynamic_mpd', True):
2717             if mpd_doc.get('type') == 'dynamic':
2718                 return [], {}
2719
2720         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2721
2722         def _add_ns(path):
2723             return self._xpath_ns(path, namespace)
2724
2725         def is_drm_protected(element):
2726             return element.find(_add_ns('ContentProtection')) is not None
2727
2728         def extract_multisegment_info(element, ms_parent_info):
2729             ms_info = ms_parent_info.copy()
2730
2731             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2732             # common attributes and elements.  We will only extract relevant
2733             # for us.
2734             def extract_common(source):
2735                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2736                 if segment_timeline is not None:
2737                     s_e = segment_timeline.findall(_add_ns('S'))
2738                     if s_e:
2739                         ms_info['total_number'] = 0
2740                         ms_info['s'] = []
2741                         for s in s_e:
2742                             r = int(s.get('r', 0))
2743                             ms_info['total_number'] += 1 + r
2744                             ms_info['s'].append({
2745                                 't': int(s.get('t', 0)),
2746                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2747                                 'd': int(s.attrib['d']),
2748                                 'r': r,
2749                             })
2750                 start_number = source.get('startNumber')
2751                 if start_number:
2752                     ms_info['start_number'] = int(start_number)
2753                 timescale = source.get('timescale')
2754                 if timescale:
2755                     ms_info['timescale'] = int(timescale)
2756                 segment_duration = source.get('duration')
2757                 if segment_duration:
2758                     ms_info['segment_duration'] = float(segment_duration)
2759
2760             def extract_Initialization(source):
2761                 initialization = source.find(_add_ns('Initialization'))
2762                 if initialization is not None:
2763                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2764
2765             segment_list = element.find(_add_ns('SegmentList'))
2766             if segment_list is not None:
2767                 extract_common(segment_list)
2768                 extract_Initialization(segment_list)
2769                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2770                 if segment_urls_e:
2771                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2772             else:
2773                 segment_template = element.find(_add_ns('SegmentTemplate'))
2774                 if segment_template is not None:
2775                     extract_common(segment_template)
2776                     media = segment_template.get('media')
2777                     if media:
2778                         ms_info['media'] = media
2779                     initialization = segment_template.get('initialization')
2780                     if initialization:
2781                         ms_info['initialization'] = initialization
2782                     else:
2783                         extract_Initialization(segment_template)
2784             return ms_info
2785
2786         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2787         formats, subtitles = [], {}
2788         stream_numbers = collections.defaultdict(int)
2789         for period in mpd_doc.findall(_add_ns('Period')):
2790             period_duration = parse_duration(period.get('duration')) or mpd_duration
2791             period_ms_info = extract_multisegment_info(period, {
2792                 'start_number': 1,
2793                 'timescale': 1,
2794             })
2795             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2796                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2797                 for representation in adaptation_set.findall(_add_ns('Representation')):
2798                     representation_attrib = adaptation_set.attrib.copy()
2799                     representation_attrib.update(representation.attrib)
2800                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2801                     mime_type = representation_attrib['mimeType']
2802                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2803
2804                     codecs = parse_codecs(representation_attrib.get('codecs', ''))
2805                     if content_type not in ('video', 'audio', 'text'):
2806                         if mime_type == 'image/jpeg':
2807                             content_type = mime_type
2808                         elif codecs['vcodec'] != 'none':
2809                             content_type = 'video'
2810                         elif codecs['acodec'] != 'none':
2811                             content_type = 'audio'
2812                         elif codecs.get('tcodec', 'none') != 'none':
2813                             content_type = 'text'
2814                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2815                             content_type = 'text'
2816                         else:
2817                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2818                             continue
2819
2820                     base_url = ''
2821                     for element in (representation, adaptation_set, period, mpd_doc):
2822                         base_url_e = element.find(_add_ns('BaseURL'))
2823                         if base_url_e is not None:
2824                             base_url = base_url_e.text + base_url
2825                             if re.match(r'^https?://', base_url):
2826                                 break
2827                     if mpd_base_url and base_url.startswith('/'):
2828                         base_url = compat_urlparse.urljoin(mpd_base_url, base_url)
2829                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2830                         if not mpd_base_url.endswith('/'):
2831                             mpd_base_url += '/'
2832                         base_url = mpd_base_url + base_url
2833                     representation_id = representation_attrib.get('id')
2834                     lang = representation_attrib.get('lang')
2835                     url_el = representation.find(_add_ns('BaseURL'))
2836                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2837                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2838                     if representation_id is not None:
2839                         format_id = representation_id
2840                     else:
2841                         format_id = content_type
2842                     if mpd_id:
2843                         format_id = mpd_id + '-' + format_id
2844                     if content_type in ('video', 'audio'):
2845                         f = {
2846                             'format_id': format_id,
2847                             'manifest_url': mpd_url,
2848                             'ext': mimetype2ext(mime_type),
2849                             'width': int_or_none(representation_attrib.get('width')),
2850                             'height': int_or_none(representation_attrib.get('height')),
2851                             'tbr': float_or_none(bandwidth, 1000),
2852                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2853                             'fps': int_or_none(representation_attrib.get('frameRate')),
2854                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2855                             'format_note': 'DASH %s' % content_type,
2856                             'filesize': filesize,
2857                             'container': mimetype2ext(mime_type) + '_dash',
2858                             **codecs
2859                         }
2860                     elif content_type == 'text':
2861                         f = {
2862                             'ext': mimetype2ext(mime_type),
2863                             'manifest_url': mpd_url,
2864                             'filesize': filesize,
2865                         }
2866                     elif content_type == 'image/jpeg':
2867                         # See test case in VikiIE
2868                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2869                         f = {
2870                             'format_id': format_id,
2871                             'ext': 'mhtml',
2872                             'manifest_url': mpd_url,
2873                             'format_note': 'DASH storyboards (jpeg)',
2874                             'acodec': 'none',
2875                             'vcodec': 'none',
2876                         }
2877                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2878                         f['has_drm'] = True
2879                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2880
2881                     def prepare_template(template_name, identifiers):
2882                         tmpl = representation_ms_info[template_name]
2883                         # First of, % characters outside $...$ templates
2884                         # must be escaped by doubling for proper processing
2885                         # by % operator string formatting used further (see
2886                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2887                         t = ''
2888                         in_template = False
2889                         for c in tmpl:
2890                             t += c
2891                             if c == '$':
2892                                 in_template = not in_template
2893                             elif c == '%' and not in_template:
2894                                 t += c
2895                         # Next, $...$ templates are translated to their
2896                         # %(...) counterparts to be used with % operator
2897                         if representation_id is not None:
2898                             t = t.replace('$RepresentationID$', representation_id)
2899                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2900                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2901                         t.replace('$$', '$')
2902                         return t
2903
2904                     # @initialization is a regular template like @media one
2905                     # so it should be handled just the same way (see
2906                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2907                     if 'initialization' in representation_ms_info:
2908                         initialization_template = prepare_template(
2909                             'initialization',
2910                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2911                             # $Time$ shall not be included for @initialization thus
2912                             # only $Bandwidth$ remains
2913                             ('Bandwidth', ))
2914                         representation_ms_info['initialization_url'] = initialization_template % {
2915                             'Bandwidth': bandwidth,
2916                         }
2917
2918                     def location_key(location):
2919                         return 'url' if re.match(r'^https?://', location) else 'path'
2920
2921                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2922
2923                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2924                         media_location_key = location_key(media_template)
2925
2926                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2927                         # can't be used at the same time
2928                         if '%(Number' in media_template and 's' not in representation_ms_info:
2929                             segment_duration = None
2930                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2931                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2932                                 representation_ms_info['total_number'] = int(math.ceil(
2933                                     float_or_none(period_duration, segment_duration, default=0)))
2934                             representation_ms_info['fragments'] = [{
2935                                 media_location_key: media_template % {
2936                                     'Number': segment_number,
2937                                     'Bandwidth': bandwidth,
2938                                 },
2939                                 'duration': segment_duration,
2940                             } for segment_number in range(
2941                                 representation_ms_info['start_number'],
2942                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2943                         else:
2944                             # $Number*$ or $Time$ in media template with S list available
2945                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2946                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2947                             representation_ms_info['fragments'] = []
2948                             segment_time = 0
2949                             segment_d = None
2950                             segment_number = representation_ms_info['start_number']
2951
2952                             def add_segment_url():
2953                                 segment_url = media_template % {
2954                                     'Time': segment_time,
2955                                     'Bandwidth': bandwidth,
2956                                     'Number': segment_number,
2957                                 }
2958                                 representation_ms_info['fragments'].append({
2959                                     media_location_key: segment_url,
2960                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2961                                 })
2962
2963                             for num, s in enumerate(representation_ms_info['s']):
2964                                 segment_time = s.get('t') or segment_time
2965                                 segment_d = s['d']
2966                                 add_segment_url()
2967                                 segment_number += 1
2968                                 for r in range(s.get('r', 0)):
2969                                     segment_time += segment_d
2970                                     add_segment_url()
2971                                     segment_number += 1
2972                                 segment_time += segment_d
2973                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2974                         # No media template
2975                         # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2976                         # or any YouTube dashsegments video
2977                         fragments = []
2978                         segment_index = 0
2979                         timescale = representation_ms_info['timescale']
2980                         for s in representation_ms_info['s']:
2981                             duration = float_or_none(s['d'], timescale)
2982                             for r in range(s.get('r', 0) + 1):
2983                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
2984                                 fragments.append({
2985                                     location_key(segment_uri): segment_uri,
2986                                     'duration': duration,
2987                                 })
2988                                 segment_index += 1
2989                         representation_ms_info['fragments'] = fragments
2990                     elif 'segment_urls' in representation_ms_info:
2991                         # Segment URLs with no SegmentTimeline
2992                         # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2993                         # https://github.com/ytdl-org/youtube-dl/pull/14844
2994                         fragments = []
2995                         segment_duration = float_or_none(
2996                             representation_ms_info['segment_duration'],
2997                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2998                         for segment_url in representation_ms_info['segment_urls']:
2999                             fragment = {
3000                                 location_key(segment_url): segment_url,
3001                             }
3002                             if segment_duration:
3003                                 fragment['duration'] = segment_duration
3004                             fragments.append(fragment)
3005                         representation_ms_info['fragments'] = fragments
3006                     # If there is a fragments key available then we correctly recognized fragmented media.
3007                     # Otherwise we will assume unfragmented media with direct access. Technically, such
3008                     # assumption is not necessarily correct since we may simply have no support for
3009                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
3010                     if 'fragments' in representation_ms_info:
3011                         f.update({
3012                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
3013                             'url': mpd_url or base_url,
3014                             'fragment_base_url': base_url,
3015                             'fragments': [],
3016                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
3017                         })
3018                         if 'initialization_url' in representation_ms_info:
3019                             initialization_url = representation_ms_info['initialization_url']
3020                             if not f.get('url'):
3021                                 f['url'] = initialization_url
3022                             f['fragments'].append({location_key(initialization_url): initialization_url})
3023                         f['fragments'].extend(representation_ms_info['fragments'])
3024                         if not period_duration:
3025                             period_duration = try_get(
3026                                 representation_ms_info,
3027                                 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
3028                     else:
3029                         # Assuming direct URL to unfragmented media.
3030                         f['url'] = base_url
3031                     if content_type in ('video', 'audio', 'image/jpeg'):
3032                         f['manifest_stream_number'] = stream_numbers[f['url']]
3033                         stream_numbers[f['url']] += 1
3034                         formats.append(f)
3035                     elif content_type == 'text':
3036                         subtitles.setdefault(lang or 'und', []).append(f)
3037
3038         return formats, subtitles
3039
3040     def _extract_ism_formats(self, *args, **kwargs):
3041         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
3042         if subs:
3043             self._report_ignoring_subs('ISM')
3044         return fmts
3045
3046     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
3047         res = self._download_xml_handle(
3048             ism_url, video_id,
3049             note='Downloading ISM manifest' if note is None else note,
3050             errnote='Failed to download ISM manifest' if errnote is None else errnote,
3051             fatal=fatal, data=data, headers=headers, query=query)
3052         if res is False:
3053             return [], {}
3054         ism_doc, urlh = res
3055         if ism_doc is None:
3056             return [], {}
3057
3058         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
3059
3060     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
3061         """
3062         Parse formats from ISM manifest.
3063         References:
3064          1. [MS-SSTR]: Smooth Streaming Protocol,
3065             https://msdn.microsoft.com/en-us/library/ff469518.aspx
3066         """
3067         if ism_doc.get('IsLive') == 'TRUE':
3068             return [], {}
3069
3070         duration = int(ism_doc.attrib['Duration'])
3071         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
3072
3073         formats = []
3074         subtitles = {}
3075         for stream in ism_doc.findall('StreamIndex'):
3076             stream_type = stream.get('Type')
3077             if stream_type not in ('video', 'audio', 'text'):
3078                 continue
3079             url_pattern = stream.attrib['Url']
3080             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
3081             stream_name = stream.get('Name')
3082             stream_language = stream.get('Language', 'und')
3083             for track in stream.findall('QualityLevel'):
3084                 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
3085                 # TODO: add support for WVC1 and WMAP
3086                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
3087                     self.report_warning('%s is not a supported codec' % fourcc)
3088                     continue
3089                 tbr = int(track.attrib['Bitrate']) // 1000
3090                 # [1] does not mention Width and Height attributes. However,
3091                 # they're often present while MaxWidth and MaxHeight are
3092                 # missing, so should be used as fallbacks
3093                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3094                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3095                 sampling_rate = int_or_none(track.get('SamplingRate'))
3096
3097                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3098                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
3099
3100                 fragments = []
3101                 fragment_ctx = {
3102                     'time': 0,
3103                 }
3104                 stream_fragments = stream.findall('c')
3105                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3106                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3107                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3108                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3109                     if not fragment_ctx['duration']:
3110                         try:
3111                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
3112                         except IndexError:
3113                             next_fragment_time = duration
3114                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3115                     for _ in range(fragment_repeat):
3116                         fragments.append({
3117                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
3118                             'duration': fragment_ctx['duration'] / stream_timescale,
3119                         })
3120                         fragment_ctx['time'] += fragment_ctx['duration']
3121
3122                 if stream_type == 'text':
3123                     subtitles.setdefault(stream_language, []).append({
3124                         'ext': 'ismt',
3125                         'protocol': 'ism',
3126                         'url': ism_url,
3127                         'manifest_url': ism_url,
3128                         'fragments': fragments,
3129                         '_download_params': {
3130                             'stream_type': stream_type,
3131                             'duration': duration,
3132                             'timescale': stream_timescale,
3133                             'fourcc': fourcc,
3134                             'language': stream_language,
3135                             'codec_private_data': track.get('CodecPrivateData'),
3136                         }
3137                     })
3138                 elif stream_type in ('video', 'audio'):
3139                     formats.append({
3140                         'format_id': join_nonempty(ism_id, stream_name, tbr),
3141                         'url': ism_url,
3142                         'manifest_url': ism_url,
3143                         'ext': 'ismv' if stream_type == 'video' else 'isma',
3144                         'width': width,
3145                         'height': height,
3146                         'tbr': tbr,
3147                         'asr': sampling_rate,
3148                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
3149                         'acodec': 'none' if stream_type == 'video' else fourcc,
3150                         'protocol': 'ism',
3151                         'fragments': fragments,
3152                         'has_drm': ism_doc.find('Protection') is not None,
3153                         '_download_params': {
3154                             'stream_type': stream_type,
3155                             'duration': duration,
3156                             'timescale': stream_timescale,
3157                             'width': width or 0,
3158                             'height': height or 0,
3159                             'fourcc': fourcc,
3160                             'language': stream_language,
3161                             'codec_private_data': track.get('CodecPrivateData'),
3162                             'sampling_rate': sampling_rate,
3163                             'channels': int_or_none(track.get('Channels', 2)),
3164                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3165                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3166                         },
3167                     })
3168         return formats, subtitles
3169
3170     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
3171         def absolute_url(item_url):
3172             return urljoin(base_url, item_url)
3173
3174         def parse_content_type(content_type):
3175             if not content_type:
3176                 return {}
3177             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3178             if ctr:
3179                 mimetype, codecs = ctr.groups()
3180                 f = parse_codecs(codecs)
3181                 f['ext'] = mimetype2ext(mimetype)
3182                 return f
3183             return {}
3184
3185         def _media_formats(src, cur_media_type, type_info={}):
3186             full_url = absolute_url(src)
3187             ext = type_info.get('ext') or determine_ext(full_url)
3188             if ext == 'm3u8':
3189                 is_plain_url = False
3190                 formats = self._extract_m3u8_formats(
3191                     full_url, video_id, ext='mp4',
3192                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3193                     preference=preference, quality=quality, fatal=False)
3194             elif ext == 'mpd':
3195                 is_plain_url = False
3196                 formats = self._extract_mpd_formats(
3197                     full_url, video_id, mpd_id=mpd_id, fatal=False)
3198             else:
3199                 is_plain_url = True
3200                 formats = [{
3201                     'url': full_url,
3202                     'vcodec': 'none' if cur_media_type == 'audio' else None,
3203                 }]
3204             return is_plain_url, formats
3205
3206         entries = []
3207         # amp-video and amp-audio are very similar to their HTML5 counterparts
3208         # so we wll include them right here (see
3209         # https://www.ampproject.org/docs/reference/components/amp-video)
3210         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3211         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3212         media_tags = [(media_tag, media_tag_name, media_type, '')
3213                       for media_tag, media_tag_name, media_type
3214                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3215         media_tags.extend(re.findall(
3216             # We only allow video|audio followed by a whitespace or '>'.
3217             # Allowing more characters may end up in significant slow down (see
3218             # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
3219             # http://www.porntrex.com/maps/videositemap.xml).
3220             r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3221         for media_tag, _, media_type, media_content in media_tags:
3222             media_info = {
3223                 'formats': [],
3224                 'subtitles': {},
3225             }
3226             media_attributes = extract_attributes(media_tag)
3227             src = strip_or_none(media_attributes.get('src'))
3228             if src:
3229                 _, formats = _media_formats(src, media_type)
3230                 media_info['formats'].extend(formats)
3231             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3232             if media_content:
3233                 for source_tag in re.findall(r'<source[^>]+>', media_content):
3234                     s_attr = extract_attributes(source_tag)
3235                     # data-video-src and data-src are non standard but seen
3236                     # several times in the wild
3237                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
3238                     if not src:
3239                         continue
3240                     f = parse_content_type(s_attr.get('type'))
3241                     is_plain_url, formats = _media_formats(src, media_type, f)
3242                     if is_plain_url:
3243                         # width, height, res, label and title attributes are
3244                         # all not standard but seen several times in the wild
3245                         labels = [
3246                             s_attr.get(lbl)
3247                             for lbl in ('label', 'title')
3248                             if str_or_none(s_attr.get(lbl))
3249                         ]
3250                         width = int_or_none(s_attr.get('width'))
3251                         height = (int_or_none(s_attr.get('height'))
3252                                   or int_or_none(s_attr.get('res')))
3253                         if not width or not height:
3254                             for lbl in labels:
3255                                 resolution = parse_resolution(lbl)
3256                                 if not resolution:
3257                                     continue
3258                                 width = width or resolution.get('width')
3259                                 height = height or resolution.get('height')
3260                         for lbl in labels:
3261                             tbr = parse_bitrate(lbl)
3262                             if tbr:
3263                                 break
3264                         else:
3265                             tbr = None
3266                         f.update({
3267                             'width': width,
3268                             'height': height,
3269                             'tbr': tbr,
3270                             'format_id': s_attr.get('label') or s_attr.get('title'),
3271                         })
3272                         f.update(formats[0])
3273                         media_info['formats'].append(f)
3274                     else:
3275                         media_info['formats'].extend(formats)
3276                 for track_tag in re.findall(r'<track[^>]+>', media_content):
3277                     track_attributes = extract_attributes(track_tag)
3278                     kind = track_attributes.get('kind')
3279                     if not kind or kind in ('subtitles', 'captions'):
3280                         src = strip_or_none(track_attributes.get('src'))
3281                         if not src:
3282                             continue
3283                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3284                         media_info['subtitles'].setdefault(lang, []).append({
3285                             'url': absolute_url(src),
3286                         })
3287             for f in media_info['formats']:
3288                 f.setdefault('http_headers', {})['Referer'] = base_url
3289             if media_info['formats'] or media_info['subtitles']:
3290                 entries.append(media_info)
3291         return entries
3292
3293     def _extract_akamai_formats(self, *args, **kwargs):
3294         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3295         if subs:
3296             self._report_ignoring_subs('akamai')
3297         return fmts
3298
3299     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3300         signed = 'hdnea=' in manifest_url
3301         if not signed:
3302             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3303             manifest_url = re.sub(
3304                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3305                 '', manifest_url).strip('?')
3306
3307         formats = []
3308         subtitles = {}
3309
3310         hdcore_sign = 'hdcore=3.7.0'
3311         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3312         hds_host = hosts.get('hds')
3313         if hds_host:
3314             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3315         if 'hdcore=' not in f4m_url:
3316             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3317         f4m_formats = self._extract_f4m_formats(
3318             f4m_url, video_id, f4m_id='hds', fatal=False)
3319         for entry in f4m_formats:
3320             entry.update({'extra_param_to_segment_url': hdcore_sign})
3321         formats.extend(f4m_formats)
3322
3323         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3324         hls_host = hosts.get('hls')
3325         if hls_host:
3326             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3327         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3328             m3u8_url, video_id, 'mp4', 'm3u8_native',
3329             m3u8_id='hls', fatal=False)
3330         formats.extend(m3u8_formats)
3331         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3332
3333         http_host = hosts.get('http')
3334         if http_host and m3u8_formats and not signed:
3335             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3336             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3337             qualities_length = len(qualities)
3338             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3339                 i = 0
3340                 for f in m3u8_formats:
3341                     if f['vcodec'] != 'none':
3342                         for protocol in ('http', 'https'):
3343                             http_f = f.copy()
3344                             del http_f['manifest_url']
3345                             http_url = re.sub(
3346                                 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3347                             http_f.update({
3348                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3349                                 'url': http_url,
3350                                 'protocol': protocol,
3351                             })
3352                             formats.append(http_f)
3353                         i += 1
3354
3355         return formats, subtitles
3356
3357     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3358         query = compat_urlparse.urlparse(url).query
3359         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3360         mobj = re.search(
3361             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3362         url_base = mobj.group('url')
3363         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3364         formats = []
3365
3366         def manifest_url(manifest):
3367             m_url = f'{http_base_url}/{manifest}'
3368             if query:
3369                 m_url += '?%s' % query
3370             return m_url
3371
3372         if 'm3u8' not in skip_protocols:
3373             formats.extend(self._extract_m3u8_formats(
3374                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3375                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3376         if 'f4m' not in skip_protocols:
3377             formats.extend(self._extract_f4m_formats(
3378                 manifest_url('manifest.f4m'),
3379                 video_id, f4m_id='hds', fatal=False))
3380         if 'dash' not in skip_protocols:
3381             formats.extend(self._extract_mpd_formats(
3382                 manifest_url('manifest.mpd'),
3383                 video_id, mpd_id='dash', fatal=False))
3384         if re.search(r'(?:/smil:|\.smil)', url_base):
3385             if 'smil' not in skip_protocols:
3386                 rtmp_formats = self._extract_smil_formats(
3387                     manifest_url('jwplayer.smil'),
3388                     video_id, fatal=False)
3389                 for rtmp_format in rtmp_formats:
3390                     rtsp_format = rtmp_format.copy()
3391                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3392                     del rtsp_format['play_path']
3393                     del rtsp_format['ext']
3394                     rtsp_format.update({
3395                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3396                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3397                         'protocol': 'rtsp',
3398                     })
3399                     formats.extend([rtmp_format, rtsp_format])
3400         else:
3401             for protocol in ('rtmp', 'rtsp'):
3402                 if protocol not in skip_protocols:
3403                     formats.append({
3404                         'url': f'{protocol}:{url_base}',
3405                         'format_id': protocol,
3406                         'protocol': protocol,
3407                     })
3408         return formats
3409
3410     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3411         mobj = re.search(
3412             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3413             webpage)
3414         if mobj:
3415             try:
3416                 jwplayer_data = self._parse_json(mobj.group('options'),
3417                                                  video_id=video_id,
3418                                                  transform_source=transform_source)
3419             except ExtractorError:
3420                 pass
3421             else:
3422                 if isinstance(jwplayer_data, dict):
3423                     return jwplayer_data
3424
3425     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3426         jwplayer_data = self._find_jwplayer_data(
3427             webpage, video_id, transform_source=js_to_json)
3428         return self._parse_jwplayer_data(
3429             jwplayer_data, video_id, *args, **kwargs)
3430
3431     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3432                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3433         # JWPlayer backward compatibility: flattened playlists
3434         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3435         if 'playlist' not in jwplayer_data:
3436             jwplayer_data = {'playlist': [jwplayer_data]}
3437
3438         entries = []
3439
3440         # JWPlayer backward compatibility: single playlist item
3441         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3442         if not isinstance(jwplayer_data['playlist'], list):
3443             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3444
3445         for video_data in jwplayer_data['playlist']:
3446             # JWPlayer backward compatibility: flattened sources
3447             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3448             if 'sources' not in video_data:
3449                 video_data['sources'] = [video_data]
3450
3451             this_video_id = video_id or video_data['mediaid']
3452
3453             formats = self._parse_jwplayer_formats(
3454                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3455                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3456
3457             subtitles = {}
3458             tracks = video_data.get('tracks')
3459             if tracks and isinstance(tracks, list):
3460                 for track in tracks:
3461                     if not isinstance(track, dict):
3462                         continue
3463                     track_kind = track.get('kind')
3464                     if not track_kind or not isinstance(track_kind, compat_str):
3465                         continue
3466                     if track_kind.lower() not in ('captions', 'subtitles'):
3467                         continue
3468                     track_url = urljoin(base_url, track.get('file'))
3469                     if not track_url:
3470                         continue
3471                     subtitles.setdefault(track.get('label') or 'en', []).append({
3472                         'url': self._proto_relative_url(track_url)
3473                     })
3474
3475             entry = {
3476                 'id': this_video_id,
3477                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3478                 'description': clean_html(video_data.get('description')),
3479                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3480                 'timestamp': int_or_none(video_data.get('pubdate')),
3481                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3482                 'subtitles': subtitles,
3483             }
3484             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3485             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3486                 entry.update({
3487                     '_type': 'url_transparent',
3488                     'url': formats[0]['url'],
3489                 })
3490             else:
3491                 self._sort_formats(formats)
3492                 entry['formats'] = formats
3493             entries.append(entry)
3494         if len(entries) == 1:
3495             return entries[0]
3496         else:
3497             return self.playlist_result(entries)
3498
3499     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3500                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3501         urls = []
3502         formats = []
3503         for source in jwplayer_sources_data:
3504             if not isinstance(source, dict):
3505                 continue
3506             source_url = urljoin(
3507                 base_url, self._proto_relative_url(source.get('file')))
3508             if not source_url or source_url in urls:
3509                 continue
3510             urls.append(source_url)
3511             source_type = source.get('type') or ''
3512             ext = mimetype2ext(source_type) or determine_ext(source_url)
3513             if source_type == 'hls' or ext == 'm3u8':
3514                 formats.extend(self._extract_m3u8_formats(
3515                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3516                     m3u8_id=m3u8_id, fatal=False))
3517             elif source_type == 'dash' or ext == 'mpd':
3518                 formats.extend(self._extract_mpd_formats(
3519                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3520             elif ext == 'smil':
3521                 formats.extend(self._extract_smil_formats(
3522                     source_url, video_id, fatal=False))
3523             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3524             elif source_type.startswith('audio') or ext in (
3525                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3526                 formats.append({
3527                     'url': source_url,
3528                     'vcodec': 'none',
3529                     'ext': ext,
3530                 })
3531             else:
3532                 height = int_or_none(source.get('height'))
3533                 if height is None:
3534                     # Often no height is provided but there is a label in
3535                     # format like "1080p", "720p SD", or 1080.
3536                     height = int_or_none(self._search_regex(
3537                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3538                         'height', default=None))
3539                 a_format = {
3540                     'url': source_url,
3541                     'width': int_or_none(source.get('width')),
3542                     'height': height,
3543                     'tbr': int_or_none(source.get('bitrate')),
3544                     'ext': ext,
3545                 }
3546                 if source_url.startswith('rtmp'):
3547                     a_format['ext'] = 'flv'
3548                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3549                     # of jwplayer.flash.swf
3550                     rtmp_url_parts = re.split(
3551                         r'((?:mp4|mp3|flv):)', source_url, 1)
3552                     if len(rtmp_url_parts) == 3:
3553                         rtmp_url, prefix, play_path = rtmp_url_parts
3554                         a_format.update({
3555                             'url': rtmp_url,
3556                             'play_path': prefix + play_path,
3557                         })
3558                     if rtmp_params:
3559                         a_format.update(rtmp_params)
3560                 formats.append(a_format)
3561         return formats
3562
3563     def _live_title(self, name):
3564         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3565         return name
3566
3567     def _int(self, v, name, fatal=False, **kwargs):
3568         res = int_or_none(v, **kwargs)
3569         if res is None:
3570             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3571             if fatal:
3572                 raise ExtractorError(msg)
3573             else:
3574                 self.report_warning(msg)
3575         return res
3576
3577     def _float(self, v, name, fatal=False, **kwargs):
3578         res = float_or_none(v, **kwargs)
3579         if res is None:
3580             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3581             if fatal:
3582                 raise ExtractorError(msg)
3583             else:
3584                 self.report_warning(msg)
3585         return res
3586
3587     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3588                     path='/', secure=False, discard=False, rest={}, **kwargs):
3589         cookie = compat_cookiejar_Cookie(
3590             0, name, value, port, port is not None, domain, True,
3591             domain.startswith('.'), path, True, secure, expire_time,
3592             discard, None, None, rest)
3593         self._downloader.cookiejar.set_cookie(cookie)
3594
3595     def _get_cookies(self, url):
3596         """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3597         req = sanitized_Request(url)
3598         self._downloader.cookiejar.add_cookie_header(req)
3599         return compat_cookies_SimpleCookie(req.get_header('Cookie'))
3600
3601     def _apply_first_set_cookie_header(self, url_handle, cookie):
3602         """
3603         Apply first Set-Cookie header instead of the last. Experimental.
3604
3605         Some sites (e.g. [1-3]) may serve two cookies under the same name
3606         in Set-Cookie header and expect the first (old) one to be set rather
3607         than second (new). However, as of RFC6265 the newer one cookie
3608         should be set into cookie store what actually happens.
3609         We will workaround this issue by resetting the cookie to
3610         the first one manually.
3611         1. https://new.vk.com/
3612         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3613         3. https://learning.oreilly.com/
3614         """
3615         for header, cookies in url_handle.headers.items():
3616             if header.lower() != 'set-cookie':
3617                 continue
3618             cookies = cookies.encode('iso-8859-1').decode('utf-8')
3619             cookie_value = re.search(
3620                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3621             if cookie_value:
3622                 value, domain = cookie_value.groups()
3623                 self._set_cookie(domain, cookie, value)
3624                 break
3625
3626     def get_testcases(self, include_onlymatching=False):
3627         t = getattr(self, '_TEST', None)
3628         if t:
3629             assert not hasattr(self, '_TESTS'), \
3630                 '%s has _TEST and _TESTS' % type(self).__name__
3631             tests = [t]
3632         else:
3633             tests = getattr(self, '_TESTS', [])
3634         for t in tests:
3635             if not include_onlymatching and t.get('only_matching', False):
3636                 continue
3637             t['name'] = type(self).__name__[:-len('IE')]
3638             yield t
3639
3640     def is_suitable(self, age_limit):
3641         """ Test whether the extractor is generally suitable for the given
3642         age limit (i.e. pornographic sites are not, all others usually are) """
3643
3644         any_restricted = False
3645         for tc in self.get_testcases(include_onlymatching=False):
3646             if tc.get('playlist', []):
3647                 tc = tc['playlist'][0]
3648             is_restricted = age_restricted(
3649                 tc.get('info_dict', {}).get('age_limit'), age_limit)
3650             if not is_restricted:
3651                 return True
3652             any_restricted = any_restricted or is_restricted
3653         return not any_restricted
3654
3655     def extract_subtitles(self, *args, **kwargs):
3656         if (self.get_param('writesubtitles', False)
3657                 or self.get_param('listsubtitles')):
3658             return self._get_subtitles(*args, **kwargs)
3659         return {}
3660
3661     def _get_subtitles(self, *args, **kwargs):
3662         raise NotImplementedError('This method must be implemented by subclasses')
3663
3664     def extract_comments(self, *args, **kwargs):
3665         if not self.get_param('getcomments'):
3666             return None
3667         generator = self._get_comments(*args, **kwargs)
3668
3669         def extractor():
3670             comments = []
3671             interrupted = True
3672             try:
3673                 while True:
3674                     comments.append(next(generator))
3675             except StopIteration:
3676                 interrupted = False
3677             except KeyboardInterrupt:
3678                 self.to_screen('Interrupted by user')
3679             except Exception as e:
3680                 if self.get_param('ignoreerrors') is not True:
3681                     raise
3682                 self._downloader.report_error(e)
3683             comment_count = len(comments)
3684             self.to_screen(f'Extracted {comment_count} comments')
3685             return {
3686                 'comments': comments,
3687                 'comment_count': None if interrupted else comment_count
3688             }
3689         return extractor
3690
3691     def _get_comments(self, *args, **kwargs):
3692         raise NotImplementedError('This method must be implemented by subclasses')
3693
3694     @staticmethod
3695     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3696         """ Merge subtitle items for one language. Items with duplicated URLs/data
3697         will be dropped. """
3698         list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3699         ret = list(subtitle_list1)
3700         ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3701         return ret
3702
3703     @classmethod
3704     def _merge_subtitles(cls, *dicts, target=None):
3705         """ Merge subtitle dictionaries, language by language. """
3706         if target is None:
3707             target = {}
3708         for d in dicts:
3709             for lang, subs in d.items():
3710                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3711         return target
3712
3713     def extract_automatic_captions(self, *args, **kwargs):
3714         if (self.get_param('writeautomaticsub', False)
3715                 or self.get_param('listsubtitles')):
3716             return self._get_automatic_captions(*args, **kwargs)
3717         return {}
3718
3719     def _get_automatic_captions(self, *args, **kwargs):
3720         raise NotImplementedError('This method must be implemented by subclasses')
3721
3722     def mark_watched(self, *args, **kwargs):
3723         if not self.get_param('mark_watched', False):
3724             return
3725         if (self.supports_login() and self._get_login_info()[0] is not None
3726                 or self.get_param('cookiefile') or self.get_param('cookiesfrombrowser')):
3727             self._mark_watched(*args, **kwargs)
3728
3729     def _mark_watched(self, *args, **kwargs):
3730         raise NotImplementedError('This method must be implemented by subclasses')
3731
3732     def geo_verification_headers(self):
3733         headers = {}
3734         geo_verification_proxy = self.get_param('geo_verification_proxy')
3735         if geo_verification_proxy:
3736             headers['Ytdl-request-proxy'] = geo_verification_proxy
3737         return headers
3738
3739     def _generic_id(self, url):
3740         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3741
3742     def _generic_title(self, url):
3743         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3744
3745     @staticmethod
3746     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3747         all_known = all(map(
3748             lambda x: x is not None,
3749             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3750         return (
3751             'private' if is_private
3752             else 'premium_only' if needs_premium
3753             else 'subscriber_only' if needs_subscription
3754             else 'needs_auth' if needs_auth
3755             else 'unlisted' if is_unlisted
3756             else 'public' if all_known
3757             else None)
3758
3759     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3760         '''
3761         @returns            A list of values for the extractor argument given by "key"
3762                             or "default" if no such key is present
3763         @param default      The default value to return when the key is not present (default: [])
3764         @param casesense    When false, the values are converted to lower case
3765         '''
3766         val = traverse_obj(
3767             self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key))
3768         if val is None:
3769             return [] if default is NO_DEFAULT else default
3770         return list(val) if casesense else [x.lower() for x in val]
3771
3772     def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3773         if not playlist_id or not video_id:
3774             return not video_id
3775
3776         no_playlist = (smuggled_data or {}).get('force_noplaylist')
3777         if no_playlist is not None:
3778             return not no_playlist
3779
3780         video_id = '' if video_id is True else f' {video_id}'
3781         playlist_id = '' if playlist_id is True else f' {playlist_id}'
3782         if self.get_param('noplaylist'):
3783             self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3784             return False
3785         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3786         return True
3787
3788
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search query extractors.

    Queries have the form _SEARCH_KEY<prefix>:{query}, where <prefix> is
    empty (one result), a positive number (that many results) or "all".
    Subclasses should define _SEARCH_KEY and optionally _MAX_RESULTS.
    """

    # Upper bound on the number of results; unlimited unless overridden
    _MAX_RESULTS = float('inf')

    @classmethod
    def _make_valid_url(cls):
        """Return the regex matching this extractor's search queries."""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    def _real_extract(self, query):
        """Split the query into prefix and search terms and fetch the requested number of results."""
        prefix, query = self._match_valid_url(query).group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError(f'invalid download number {n} for query "{query}"')
        if n > self._MAX_RESULTS:
            # Cap the request at what the site can return and tell the user
            self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        # islice needs None (not infinity) to mean "no upper bound"
        stop = None if n == float('inf') else n
        entries = itertools.islice(self._search_results(query), 0, stop)
        return self.playlist_result(entries, query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        """Read-only access to _SEARCH_KEY."""
        return self._SEARCH_KEY