yt_dlp/extractor/common.py

   1 import base64
   2 import collections
   3 import getpass
   4 import hashlib
   5 import http.client
   6 import http.cookiejar
   7 import http.cookies
   8 import inspect
   9 import itertools
  10 import json
  11 import math
  12 import netrc
  13 import os
  14 import random
  15 import re
  16 import sys
  17 import time
  18 import types
  19 import urllib.parse
  20 import urllib.request
  21 import xml.etree.ElementTree
  22
  23 from ..compat import functools  # isort: split
  24 from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name
  25 from ..cookies import LenientSimpleCookie
  26 from ..downloader import FileDownloader
  27 from ..downloader.f4m import get_base_url, remove_encrypted_media
  28 from ..utils import (
  29     IDENTITY,
  30     JSON_LD_RE,
  31     NO_DEFAULT,
  32     ExtractorError,
  33     GeoRestrictedError,
  34     GeoUtils,
  35     LenientJSONDecoder,
  36     RegexNotFoundError,
  37     RetryManager,
  38     UnsupportedError,
  39     age_restricted,
  40     base_url,
  41     bug_reports_message,
  42     classproperty,
  43     clean_html,
  44     determine_ext,
  45     determine_protocol,
  46     dict_get,
  47     encode_data_uri,
  48     error_to_compat_str,
  49     extract_attributes,
  50     filter_dict,
  51     fix_xml_ampersands,
  52     float_or_none,
  53     format_field,
  54     int_or_none,
  55     join_nonempty,
  56     js_to_json,
  57     mimetype2ext,
  58     network_exceptions,
  59     orderedSet,
  60     parse_bitrate,
  61     parse_codecs,
  62     parse_duration,
  63     parse_iso8601,
  64     parse_m3u8_attributes,
  65     parse_resolution,
  66     sanitize_filename,
  67     sanitize_url,
  68     sanitized_Request,
  69     smuggle_url,
  70     str_or_none,
  71     str_to_int,
  72     strip_or_none,
  73     traverse_obj,
  74     try_call,
  75     try_get,
  76     unescapeHTML,
  77     unified_strdate,
  78     unified_timestamp,
  79     update_Request,
  80     update_url_query,
  81     url_basename,
  82     url_or_none,
  83     urljoin,
  84     variadic,
  85     xpath_element,
  86     xpath_text,
  87     xpath_with_ns,
  88 )
  89
  90
  91 class InfoExtractor:
  92     """Information Extractor class.
  93
  94     Information extractors are the classes that, given a URL, extract
  95     information about the video (or videos) the URL refers to. This
  96     information includes the real video URL, the video title, author and
  97     others. The information is stored in a dictionary which is then
  98     passed to the YoutubeDL. The YoutubeDL processes this
  99     information possibly downloading the video to the file system, among
 100     other possible outcomes.
 101
 102     The type field determines the type of the result.
 103     By far the most common value (and the default if _type is missing) is
 104     "video", which indicates a single video.
 105
 106     For a video, the dictionaries must include the following fields:
 107
 108     id:             Video identifier.
 109     title:          Video title, unescaped. Set to an empty string if video has
 110                     no title as opposed to "None" which signifies that the
 111                     extractor failed to obtain a title
 112
 113     Additionally, it must contain either a formats entry or a url one:
 114
 115     formats:        A list of dictionaries for each format available, ordered
 116                     from worst to best quality.
 117
 118                     Potential fields:
 119                     * url        The mandatory URL representing the media:
 120                                    for plain file media - HTTP URL of this file,
 121                                    for RTMP - RTMP URL,
 122                                    for HLS - URL of the M3U8 media playlist,
 123                                    for HDS - URL of the F4M manifest,
 124                                    for DASH
 125                                      - HTTP URL to plain file media (in case of
 126                                        unfragmented media)
 127                                      - URL of the MPD manifest or base URL
 128                                        representing the media if MPD manifest
 129                                        is parsed from a string (in case of
 130                                        fragmented media)
 131                                    for MSS - URL of the ISM manifest.
 132                     * manifest_url
 133                                  The URL of the manifest file in case of
 134                                  fragmented media:
 135                                    for HLS - URL of the M3U8 master playlist,
 136                                    for HDS - URL of the F4M manifest,
 137                                    for DASH - URL of the MPD manifest,
 138                                    for MSS - URL of the ISM manifest.
 139                     * manifest_stream_number  (For internal use only)
 140                                  The index of the stream in the manifest file
 141                     * ext        Will be calculated from URL if missing
 142                     * format     A human-readable description of the format
 143                                  ("mp4 container with h264/opus").
 144                                  Calculated from the format_id, width, height.
 145                                  and format_note fields if missing.
 146                     * format_id  A short description of the format
 147                                  ("mp4_h264_opus" or "19").
 148                                 Technically optional, but strongly recommended.
 149                     * format_note Additional info about the format
 150                                  ("3D" or "DASH video")
 151                     * width      Width of the video, if known
 152                     * height     Height of the video, if known
 153                     * aspect_ratio  Aspect ratio of the video, if known
 154                                  Automatically calculated from width and height
 155                     * resolution Textual description of width and height
 156                                  Automatically calculated from width and height
 157                     * dynamic_range The dynamic range of the video. One of:
 158                                  "SDR" (None), "HDR10", "HDR10+, "HDR12", "HLG, "DV"
 159                     * tbr        Average bitrate of audio and video in KBit/s
 160                     * abr        Average audio bitrate in KBit/s
 161                     * acodec     Name of the audio codec in use
 162                     * asr        Audio sampling rate in Hertz
 163                     * audio_channels  Number of audio channels
 164                     * vbr        Average video bitrate in KBit/s
 165                     * fps        Frame rate
 166                     * vcodec     Name of the video codec in use
 167                     * container  Name of the container format
 168                     * filesize   The number of bytes, if known in advance
 169                     * filesize_approx  An estimate for the number of bytes
 170                     * player_url SWF Player URL (used for rtmpdump).
 171                     * protocol   The protocol that will be used for the actual
 172                                  download, lower-case. One of "http", "https" or
 173                                  one of the protocols defined in downloader.PROTOCOL_MAP
 174                     * fragment_base_url
 175                                  Base URL for fragments. Each fragment's path
 176                                  value (if present) will be relative to
 177                                  this URL.
 178                     * fragments  A list of fragments of a fragmented media.
 179                                  Each fragment entry must contain either an url
 180                                  or a path. If an url is present it should be
 181                                  considered by a client. Otherwise both path and
 182                                  fragment_base_url must be present. Here is
 183                                  the list of all potential fields:
 184                                  * "url" - fragment's URL
 185                                  * "path" - fragment's path relative to
 186                                             fragment_base_url
 187                                  * "duration" (optional, int or float)
 188                                  * "filesize" (optional, int)
 189                     * is_from_start  Is a live format that can be downloaded
 190                                 from the start. Boolean
 191                     * preference Order number of this format. If this field is
 192                                  present and not None, the formats get sorted
 193                                  by this field, regardless of all other values.
 194                                  -1 for default (order by other properties),
 195                                  -2 or smaller for less than default.
 196                                  < -1000 to hide the format (if there is
 197                                     another one which is strictly better)
 198                     * language   Language code, e.g. "de" or "en-US".
 199                     * language_preference  Is this in the language mentioned in
 200                                  the URL?
 201                                  10 if it's what the URL is about,
 202                                  -1 for default (don't know),
 203                                  -10 otherwise, other values reserved for now.
 204                     * quality    Order number of the video quality of this
 205                                  format, irrespective of the file format.
 206                                  -1 for default (order by other properties),
 207                                  -2 or smaller for less than default.
 208                     * source_preference  Order number for this video source
 209                                   (quality takes higher priority)
 210                                  -1 for default (order by other properties),
 211                                  -2 or smaller for less than default.
 212                     * http_headers  A dictionary of additional HTTP headers
 213                                  to add to the request.
 214                     * stretched_ratio  If given and not 1, indicates that the
 215                                  video's pixels are not square.
 216                                  width : height ratio as float.
 217                     * no_resume  The server does not support resuming the
 218                                  (HTTP or RTMP) download. Boolean.
 219                     * has_drm    The format has DRM and cannot be downloaded. Boolean
 220                     * downloader_options  A dictionary of downloader options
 221                                  (For internal use only)
 222                                  * http_chunk_size Chunk size for HTTP downloads
 223                                  * ffmpeg_args     Extra arguments for ffmpeg downloader
 224                     RTMP formats can also have the additional fields: page_url,
 225                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 226                     rtmp_protocol, rtmp_real_time
 227
 228     url:            Final video URL.
 229     ext:            Video filename extension.
 230     format:         The video format, defaults to ext (used for --get-format)
 231     player_url:     SWF Player URL (used for rtmpdump).
 232
 233     The following fields are optional:
 234
 235     direct:         True if a direct video file was given (must only be set by GenericIE)
 236     alt_title:      A secondary title of the video.
 237     display_id      An alternative identifier for the video, not necessarily
 238                     unique, but available before title. Typically, id is
 239                     something like "4234987", title "Dancing naked mole rats",
 240                     and display_id "dancing-naked-mole-rats"
 241     thumbnails:     A list of dictionaries, with the following entries:
 242                         * "id" (optional, string) - Thumbnail format ID
 243                         * "url"
 244                         * "preference" (optional, int) - quality of the image
 245                         * "width" (optional, int)
 246                         * "height" (optional, int)
 247                         * "resolution" (optional, string "{width}x{height}",
 248                                         deprecated)
 249                         * "filesize" (optional, int)
 250                         * "http_headers" (dict) - HTTP headers for the request
 251     thumbnail:      Full URL to a video thumbnail image.
 252     description:    Full video description.
 253     uploader:       Full name of the video uploader.
 254     license:        License name the video is licensed under.
 255     creator:        The creator of the video.
 256     timestamp:      UNIX timestamp of the moment the video was uploaded
 257     upload_date:    Video upload date in UTC (YYYYMMDD).
 258                     If not explicitly set, calculated from timestamp
 259     release_timestamp: UNIX timestamp of the moment the video was released.
 260                     If it is not clear whether to use timestamp or this, use the former
 261     release_date:   The date (YYYYMMDD) when the video was released in UTC.
 262                     If not explicitly set, calculated from release_timestamp
 263     modified_timestamp: UNIX timestamp of the moment the video was last modified.
 264     modified_date:   The date (YYYYMMDD) when the video was last modified in UTC.
 265                     If not explicitly set, calculated from modified_timestamp
 266     uploader_id:    Nickname or id of the video uploader.
 267     uploader_url:   Full URL to a personal webpage of the video uploader.
 268     channel:        Full name of the channel the video is uploaded on.
 269                     Note that channel fields may or may not repeat uploader
 270                     fields. This depends on a particular extractor.
 271     channel_id:     Id of the channel.
 272     channel_url:    Full URL to a channel webpage.
 273     channel_follower_count: Number of followers of the channel.
 274     location:       Physical location where the video was filmed.
 275     subtitles:      The available subtitles as a dictionary in the format
 276                     {tag: subformats}. "tag" is usually a language code, and
 277                     "subformats" is a list sorted from lower to higher
 278                     preference, each element is a dictionary with the "ext"
 279                     entry and one of:
 280                         * "data": The subtitles file contents
 281                         * "url": A URL pointing to the subtitles file
 282                     It can optionally also have:
 283                         * "name": Name or description of the subtitles
 284                         * "http_headers": A dictionary of additional HTTP headers
 285                                   to add to the request.
 286                     "ext" will be calculated from URL if missing
 287     automatic_captions: Like 'subtitles'; contains automatically generated
 288                     captions instead of normal subtitles
 289     duration:       Length of the video in seconds, as an integer or float.
 290     view_count:     How many users have watched the video on the platform.
 291     concurrent_view_count: How many users are currently watching the video on the platform.
 292     like_count:     Number of positive ratings of the video
 293     dislike_count:  Number of negative ratings of the video
 294     repost_count:   Number of reposts of the video
 295     average_rating: Average rating give by users, the scale used depends on the webpage
 296     comment_count:  Number of comments on the video
 297     comments:       A list of comments, each with one or more of the following
 298                     properties (all but one of text or html optional):
 299                         * "author" - human-readable name of the comment author
 300                         * "author_id" - user ID of the comment author
 301                         * "author_thumbnail" - The thumbnail of the comment author
 302                         * "id" - Comment ID
 303                         * "html" - Comment as HTML
 304                         * "text" - Plain text of the comment
 305                         * "timestamp" - UNIX timestamp of comment
 306                         * "parent" - ID of the comment this one is replying to.
 307                                      Set to "root" to indicate that this is a
 308                                      comment to the original video.
 309                         * "like_count" - Number of positive ratings of the comment
 310                         * "dislike_count" - Number of negative ratings of the comment
 311                         * "is_favorited" - Whether the comment is marked as
 312                                            favorite by the video uploader
 313                         * "author_is_uploader" - Whether the comment is made by
 314                                                  the video uploader
 315     age_limit:      Age restriction for the video, as an integer (years)
 316     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 317                     should allow to get the same result again. (It will be set
 318                     by YoutubeDL if it's missing)
 319     categories:     A list of categories that the video falls in, for example
 320                     ["Sports", "Berlin"]
 321     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 322     cast:           A list of the video cast
 323     is_live:        True, False, or None (=unknown). Whether this video is a
 324                     live stream that goes on instead of a fixed-length video.
 325     was_live:       True, False, or None (=unknown). Whether this video was
 326                     originally a live stream.
 327     live_status:    None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
 328                     or 'post_live' (was live, but VOD is not yet processed)
 329                     If absent, automatically set from is_live, was_live
 330     start_time:     Time in seconds where the reproduction should start, as
 331                     specified in the URL.
 332     end_time:       Time in seconds where the reproduction should end, as
 333                     specified in the URL.
 334     chapters:       A list of dictionaries, with the following entries:
 335                         * "start_time" - The start time of the chapter in seconds
 336                         * "end_time" - The end time of the chapter in seconds
 337                         * "title" (optional, string)
 338     playable_in_embed: Whether this video is allowed to play in embedded
 339                     players on other sites. Can be True (=always allowed),
 340                     False (=never allowed), None (=unknown), or a string
 341                     specifying the criteria for embedability; e.g. 'whitelist'
 342     availability:   Under what condition the video is available. One of
 343                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 344                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 345                     to set it
 346     _old_archive_ids: A list of old archive ids needed for backward compatibility
 347     __post_extractor: A function to be called just before the metadata is
 348                     written to either disk, logger or console. The function
 349                     must return a dict which will be added to the info_dict.
 350                     This is usefull for additional information that is
 351                     time-consuming to extract. Note that the fields thus
 352                     extracted will not be available to output template and
 353                     match_filter. So, only "comments" and "comment_count" are
 354                     currently allowed to be extracted via this method.
 355
 356     The following fields should only be used when the video belongs to some logical
 357     chapter or section:
 358
 359     chapter:        Name or title of the chapter the video belongs to.
 360     chapter_number: Number of the chapter the video belongs to, as an integer.
 361     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 362
 363     The following fields should only be used when the video is an episode of some
 364     series, programme or podcast:
 365
 366     series:         Title of the series or programme the video episode belongs to.
 367     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 368     season:         Title of the season the video episode belongs to.
 369     season_number:  Number of the season the video episode belongs to, as an integer.
 370     season_id:      Id of the season the video episode belongs to, as a unicode string.
 371     episode:        Title of the video episode. Unlike mandatory video title field,
 372                     this field should denote the exact title of the video episode
 373                     without any kind of decoration.
 374     episode_number: Number of the video episode within a season, as an integer.
 375     episode_id:     Id of the video episode, as a unicode string.
 376
 377     The following fields should only be used when the media is a track or a part of
 378     a music album:
 379
 380     track:          Title of the track.
 381     track_number:   Number of the track within an album or a disc, as an integer.
 382     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 383                     as a unicode string.
 384     artist:         Artist(s) of the track.
 385     genre:          Genre(s) of the track.
 386     album:          Title of the album the track belongs to.
 387     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 388     album_artist:   List of all artists appeared on the album (e.g.
 389                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 390                     and compilations).
 391     disc_number:    Number of the disc or other physical medium the track belongs to,
 392                     as an integer.
 393     release_year:   Year (YYYY) when the album was released.
 394     composer:       Composer of the piece
 395
 396     The following fields should only be set for clips that should be cut from the original video:
 397
 398     section_start:  Start time of the section in seconds
 399     section_end:    End time of the section in seconds
 400
 401     The following fields should only be set for storyboards:
 402     rows:           Number of rows in each storyboard fragment, as an integer
 403     columns:        Number of columns in each storyboard fragment, as an integer
 404
 405     Unless mentioned otherwise, the fields should be Unicode strings.
 406
 407     Unless mentioned otherwise, None is equivalent to absence of information.
 408
 409
 410     _type "playlist" indicates multiple videos.
 411     There must be a key "entries", which is a list, an iterable, or a PagedList
 412     object, each element of which is a valid dictionary by this specification.
 413
 414     Additionally, playlists can have "id", "title", and any other relevant
 415     attributes with the same semantics as videos (see above).
 416
 417     It can also have the following optional fields:
 418
 419     playlist_count: The total number of videos in a playlist. If not given,
 420                     YoutubeDL tries to calculate it from "entries"
 421
 422
 423     _type "multi_video" indicates that there are multiple videos that
 424     form a single show, for examples multiple acts of an opera or TV episode.
 425     It must have an entries key like a playlist and contain all the keys
 426     required for a video at the same time.
 427
 428
 429     _type "url" indicates that the video must be extracted from another
 430     location, possibly by a different extractor. Its only required key is:
 431     "url" - the next URL to extract.
 432     The key "ie_key" can be set to the class name (minus the trailing "IE",
 433     e.g. "Youtube") if the extractor class is known in advance.
 434     Additionally, the dictionary may have any properties of the resolved entity
 435     known in advance, for example "title" if the title of the referred video is
 436     known ahead of time.
 437
 438
 439     _type "url_transparent" entities have the same specification as "url", but
 440     indicate that the given additional information is more precise than the one
 441     associated with the resolved URL.
 442     This is useful when a site employs a video service that hosts the video and
 443     its technical metadata, but that video service does not embed a useful
 444     title, description etc.
 445
 446
 447     Subclasses of this should also be added to the list of extractors and
 448     should define a _VALID_URL regexp and, re-define the _real_extract() and
 449     (optionally) _real_initialize() methods.
 450
 451     Subclasses may also override suitable() if necessary, but ensure the function
 452     signature is preserved and that this function imports everything it needs
 453     (except other extractors), so that lazy_extractors works correctly.
 454
 455     Subclasses can define a list of _EMBED_REGEX, which will be searched for in
 456     the HTML of Generic webpages. It may also override _extract_embed_urls
 457     or _extract_from_webpage as necessary. While these are normally classmethods,
 458     _extract_from_webpage is allowed to be an instance method.
 459
 460     _extract_from_webpage may raise self.StopExtraction() to stop further
 461     processing of the webpage and obtain exclusive rights to it. This is useful
 462     when the extractor cannot reliably be matched using just the URL,
 463     e.g. invidious/peertube instances
 464
 465     Embed-only extractors can be defined by setting _VALID_URL = False.
 466
 467     To support username + password (or netrc) login, the extractor must define a
 468     _NETRC_MACHINE and re-define _perform_login(username, password) and
 469     (optionally) _initialize_pre_login() methods. The _perform_login method will
 470     be called between _initialize_pre_login and _real_initialize if credentials
 471     are passed by the user. In cases where it is necessary to have the login
 472     process as part of the extraction rather than initialization, _perform_login
 473     can be left undefined.
 474
 475     _GEO_BYPASS attribute may be set to False in order to disable
 476     geo restriction bypass mechanisms for a particular extractor.
 477     Though it won't disable explicit geo restriction bypass based on
 478     country code provided with geo_bypass_country.
 479
 480     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 481     countries for this extractor. One of these countries will be used by
 482     geo restriction bypass mechanism right away in order to bypass
 483     geo restriction, of course, if the mechanism is not disabled.
 484
 485     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 486     IP blocks in CIDR notation for this extractor. One of these IP blocks
 487     will be used by geo restriction bypass mechanism similarly
 488     to _GEO_COUNTRIES.
 489
 490     The _ENABLED attribute should be set to False for IEs that
 491     are disabled by default and must be explicitly enabled.
 492
 493     The _WORKING attribute should be set to False for broken IEs
 494     in order to warn the users and skip the tests.
 495     """
 496
 497     _ready = False
 498     _downloader = None
 499     _x_forwarded_for_ip = None
 500     _GEO_BYPASS = True
 501     _GEO_COUNTRIES = None
 502     _GEO_IP_BLOCKS = None
 503     _WORKING = True
 504     _ENABLED = True
 505     _NETRC_MACHINE = None
 506     IE_DESC = None
 507     SEARCH_KEY = None
 508     _VALID_URL = None
 509     _EMBED_REGEX = []
 510
 511     def _login_hint(self, method=NO_DEFAULT, netrc=None):
 512         password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
 513         return {
 514             None: '',
 515             'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
 516             'password': f'Use {password_hint}',
 517             'cookies': (
 518                 'Use --cookies-from-browser or --cookies for the authentication. '
 519                 'See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies'),
 520         }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']
 521
 522     def __init__(self, downloader=None):
 523         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 524         If a downloader is not passed during initialization,
 525         it must be set using "set_downloader()" before "extract()" is called"""
 526         self._ready = False
 527         self._x_forwarded_for_ip = None
 528         self._printed_messages = set()
 529         self.set_downloader(downloader)
 530
 531     @classmethod
 532     def _match_valid_url(cls, url):
 533         if cls._VALID_URL is False:
 534             return None
 535         # This does not use has/getattr intentionally - we want to know whether
 536         # we have cached the regexp for *this* class, whereas getattr would also
 537         # match the superclass
 538         if '_VALID_URL_RE' not in cls.__dict__:
 539             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 540         return cls._VALID_URL_RE.match(url)
 541
 542     @classmethod
 543     def suitable(cls, url):
 544         """Receives a URL and returns True if suitable for this IE."""
 545         # This function must import everything it needs (except other extractors),
 546         # so that lazy_extractors works correctly
 547         return cls._match_valid_url(url) is not None
 548
 549     @classmethod
 550     def _match_id(cls, url):
 551         return cls._match_valid_url(url).group('id')
 552
 553     @classmethod
 554     def get_temp_id(cls, url):
 555         try:
 556             return cls._match_id(url)
 557         except (IndexError, AttributeError):
 558             return None
 559
 560     @classmethod
 561     def working(cls):
 562         """Getter method for _WORKING."""
 563         return cls._WORKING
 564
 565     @classmethod
 566     def supports_login(cls):
 567         return bool(cls._NETRC_MACHINE)
 568
 569     def initialize(self):
 570         """Initializes an instance (authentication, etc)."""
 571         self._printed_messages = set()
 572         self._initialize_geo_bypass({
 573             'countries': self._GEO_COUNTRIES,
 574             'ip_blocks': self._GEO_IP_BLOCKS,
 575         })
 576         if not self._ready:
 577             self._initialize_pre_login()
 578             if self.supports_login():
 579                 username, password = self._get_login_info()
 580                 if username:
 581                     self._perform_login(username, password)
 582             elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
 583                 self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
 584             self._real_initialize()
 585             self._ready = True
 586
 587     def _initialize_geo_bypass(self, geo_bypass_context):
 588         """
 589         Initialize geo restriction bypass mechanism.
 590
 591         This method is used to initialize geo bypass mechanism based on faking
 592         X-Forwarded-For HTTP header. A random country from provided country list
 593         is selected and a random IP belonging to this country is generated. This
 594         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 595         HTTP requests.
 596
 597         This method will be used for initial geo bypass mechanism initialization
 598         during the instance initialization with _GEO_COUNTRIES and
 599         _GEO_IP_BLOCKS.
 600
 601         You may also manually call it from extractor's code if geo bypass
 602         information is not available beforehand (e.g. obtained during
 603         extraction) or due to some other reason. In this case you should pass
 604         this information in geo bypass context passed as first argument. It may
 605         contain following fields:
 606
 607         countries:  List of geo unrestricted countries (similar
 608                     to _GEO_COUNTRIES)
 609         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 610                     (similar to _GEO_IP_BLOCKS)
 611
 612         """
 613         if not self._x_forwarded_for_ip:
 614
 615             # Geo bypass mechanism is explicitly disabled by user
 616             if not self.get_param('geo_bypass', True):
 617                 return
 618
 619             if not geo_bypass_context:
 620                 geo_bypass_context = {}
 621
 622             # Backward compatibility: previously _initialize_geo_bypass
 623             # expected a list of countries, some 3rd party code may still use
 624             # it this way
 625             if isinstance(geo_bypass_context, (list, tuple)):
 626                 geo_bypass_context = {
 627                     'countries': geo_bypass_context,
 628                 }
 629
 630             # The whole point of geo bypass mechanism is to fake IP
 631             # as X-Forwarded-For HTTP header based on some IP block or
 632             # country code.
 633
 634             # Path 1: bypassing based on IP block in CIDR notation
 635
 636             # Explicit IP block specified by user, use it right away
 637             # regardless of whether extractor is geo bypassable or not
 638             ip_block = self.get_param('geo_bypass_ip_block', None)
 639
 640             # Otherwise use random IP block from geo bypass context but only
 641             # if extractor is known as geo bypassable
 642             if not ip_block:
 643                 ip_blocks = geo_bypass_context.get('ip_blocks')
 644                 if self._GEO_BYPASS and ip_blocks:
 645                     ip_block = random.choice(ip_blocks)
 646
 647             if ip_block:
 648                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 649                 self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
 650                 return
 651
 652             # Path 2: bypassing based on country code
 653
 654             # Explicit country code specified by user, use it right away
 655             # regardless of whether extractor is geo bypassable or not
 656             country = self.get_param('geo_bypass_country', None)
 657
 658             # Otherwise use random country code from geo bypass context but
 659             # only if extractor is known as geo bypassable
 660             if not country:
 661                 countries = geo_bypass_context.get('countries')
 662                 if self._GEO_BYPASS and countries:
 663                     country = random.choice(countries)
 664
 665             if country:
 666                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 667                 self._downloader.write_debug(
 668                     f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
 669
 670     def extract(self, url):
 671         """Extracts URL information and returns it in list of dicts."""
 672         try:
 673             for _ in range(2):
 674                 try:
 675                     self.initialize()
 676                     self.write_debug('Extracting URL: %s' % url)
 677                     ie_result = self._real_extract(url)
 678                     if ie_result is None:
 679                         return None
 680                     if self._x_forwarded_for_ip:
 681                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 682                     subtitles = ie_result.get('subtitles') or {}
 683                     if 'no-live-chat' in self.get_param('compat_opts'):
 684                         for lang in ('live_chat', 'comments', 'danmaku'):
 685                             subtitles.pop(lang, None)
 686                     return ie_result
 687                 except GeoRestrictedError as e:
 688                     if self.__maybe_fake_ip_and_retry(e.countries):
 689                         continue
 690                     raise
 691         except UnsupportedError:
 692             raise
 693         except ExtractorError as e:
 694             kwargs = {
 695                 'video_id': e.video_id or self.get_temp_id(url),
 696                 'ie': self.IE_NAME,
 697                 'tb': e.traceback or sys.exc_info()[2],
 698                 'expected': e.expected,
 699                 'cause': e.cause
 700             }
 701             if hasattr(e, 'countries'):
 702                 kwargs['countries'] = e.countries
 703             raise type(e)(e.orig_msg, **kwargs)
 704         except http.client.IncompleteRead as e:
 705             raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
 706         except (KeyError, StopIteration) as e:
 707             raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 708
 709     def __maybe_fake_ip_and_retry(self, countries):
 710         if (not self.get_param('geo_bypass_country', None)
 711                 and self._GEO_BYPASS
 712                 and self.get_param('geo_bypass', True)
 713                 and not self._x_forwarded_for_ip
 714                 and countries):
 715             country_code = random.choice(countries)
 716             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 717             if self._x_forwarded_for_ip:
 718                 self.report_warning(
 719                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 720                     % (self._x_forwarded_for_ip, country_code.upper()))
 721                 return True
 722         return False
 723
 724     def set_downloader(self, downloader):
 725         """Sets a YoutubeDL instance as the downloader for this IE."""
 726         self._downloader = downloader
 727
 728     @property
 729     def cache(self):
 730         return self._downloader.cache
 731
 732     @property
 733     def cookiejar(self):
 734         return self._downloader.cookiejar
 735
 736     def _initialize_pre_login(self):
 737         """ Initialization before login. Redefine in subclasses."""
 738         pass
 739
 740     def _perform_login(self, username, password):
 741         """ Login with username and password. Redefine in subclasses."""
 742         pass
 743
 744     def _real_initialize(self):
 745         """Real initialization process. Redefine in subclasses."""
 746         pass
 747
 748     def _real_extract(self, url):
 749         """Real extraction process. Redefine in subclasses."""
 750         raise NotImplementedError('This method must be implemented by subclasses')
 751
 752     @classmethod
 753     def ie_key(cls):
 754         """A string for getting the InfoExtractor with get_info_extractor"""
 755         return cls.__name__[:-2]
 756
 757     @classproperty
 758     def IE_NAME(cls):
 759         return cls.__name__[:-2]
 760
 761     @staticmethod
 762     def __can_accept_status_code(err, expected_status):
 763         assert isinstance(err, urllib.error.HTTPError)
 764         if expected_status is None:
 765             return False
 766         elif callable(expected_status):
 767             return expected_status(err.code) is True
 768         else:
 769             return err.code in variadic(expected_status)
 770
 771     def _create_request(self, url_or_request, data=None, headers=None, query=None):
 772         if isinstance(url_or_request, urllib.request.Request):
 773             return update_Request(url_or_request, data=data, headers=headers, query=query)
 774         if query:
 775             url_or_request = update_url_query(url_or_request, query)
 776         return sanitized_Request(url_or_request, data, headers or {})
 777
 778     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
 779         """
 780         Return the response handle.
 781
 782         See _download_webpage docstring for arguments specification.
 783         """
 784         if not self._downloader._first_webpage_request:
 785             sleep_interval = self.get_param('sleep_interval_requests') or 0
 786             if sleep_interval > 0:
 787                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 788                 time.sleep(sleep_interval)
 789         else:
 790             self._downloader._first_webpage_request = False
 791
 792         if note is None:
 793             self.report_download_webpage(video_id)
 794         elif note is not False:
 795             if video_id is None:
 796                 self.to_screen(str(note))
 797             else:
 798                 self.to_screen(f'{video_id}: {note}')
 799
 800         # Some sites check X-Forwarded-For HTTP header in order to figure out
 801         # the origin of the client behind proxy. This allows bypassing geo
 802         # restriction by faking this header's value to IP that belongs to some
 803         # geo unrestricted country. We will do so once we encounter any
 804         # geo restriction error.
 805         if self._x_forwarded_for_ip:
 806             headers = (headers or {}).copy()
 807             headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)
 808
 809         try:
 810             return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
 811         except network_exceptions as err:
 812             if isinstance(err, urllib.error.HTTPError):
 813                 if self.__can_accept_status_code(err, expected_status):
 814                     # Retain reference to error to prevent file object from
 815                     # being closed before it can be read. Works around the
 816                     # effects of <https://bugs.python.org/issue15002>
 817                     # introduced in Python 3.4.1.
 818                     err.fp._error = err
 819                     return err.fp
 820
 821             if errnote is False:
 822                 return False
 823             if errnote is None:
 824                 errnote = 'Unable to download webpage'
 825
 826             errmsg = f'{errnote}: {error_to_compat_str(err)}'
 827             if fatal:
 828                 raise ExtractorError(errmsg, cause=err)
 829             else:
 830                 self.report_warning(errmsg)
 831                 return False
 832
 833     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
 834                                  encoding=None, data=None, headers={}, query={}, expected_status=None):
 835         """
 836         Return a tuple (page content as string, URL handle).
 837
 838         Arguments:
 839         url_or_request -- plain text URL as a string or
 840             a urllib.request.Request object
 841         video_id -- Video/playlist/item identifier (string)
 842
 843         Keyword arguments:
 844         note -- note printed before downloading (string)
 845         errnote -- note printed in case of an error (string)
 846         fatal -- flag denoting whether error should be considered fatal,
 847             i.e. whether it should cause ExtractionError to be raised,
 848             otherwise a warning will be reported and extraction continued
 849         encoding -- encoding for a page content decoding, guessed automatically
 850             when not explicitly specified
 851         data -- POST data (bytes)
 852         headers -- HTTP headers (dict)
 853         query -- URL query (dict)
 854         expected_status -- allows to accept failed HTTP requests (non 2xx
 855             status code) by explicitly specifying a set of accepted status
 856             codes. Can be any of the following entities:
 857                 - an integer type specifying an exact failed status code to
 858                   accept
 859                 - a list or a tuple of integer types specifying a list of
 860                   failed status codes to accept
 861                 - a callable accepting an actual failed status code and
 862                   returning True if it should be accepted
 863             Note that this argument does not affect success status codes (2xx)
 864             which are always accepted.
 865         """
 866
 867         # Strip hashes from the URL (#1038)
 868         if isinstance(url_or_request, str):
 869             url_or_request = url_or_request.partition('#')[0]
 870
 871         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 872         if urlh is False:
 873             assert not fatal
 874             return False
 875         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 876         return (content, urlh)
 877
 878     @staticmethod
 879     def _guess_encoding_from_content(content_type, webpage_bytes):
 880         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 881         if m:
 882             encoding = m.group(1)
 883         else:
 884             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 885                           webpage_bytes[:1024])
 886             if m:
 887                 encoding = m.group(1).decode('ascii')
 888             elif webpage_bytes.startswith(b'\xff\xfe'):
 889                 encoding = 'utf-16'
 890             else:
 891                 encoding = 'utf-8'
 892
 893         return encoding
 894
 895     def __check_blocked(self, content):
 896         first_block = content[:512]
 897         if ('<title>Access to this site is blocked</title>' in content
 898                 and 'Websense' in first_block):
 899             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 900             blocked_iframe = self._html_search_regex(
 901                 r'<iframe src="([^"]+)"', content,
 902                 'Websense information URL', default=None)
 903             if blocked_iframe:
 904                 msg += ' Visit %s for more details' % blocked_iframe
 905             raise ExtractorError(msg, expected=True)
 906         if '<title>The URL you requested has been blocked</title>' in first_block:
 907             msg = (
 908                 'Access to this webpage has been blocked by Indian censorship. '
 909                 'Use a VPN or proxy server (with --proxy) to route around it.')
 910             block_msg = self._html_search_regex(
 911                 r'</h1><p>(.*?)</p>',
 912                 content, 'block message', default=None)
 913             if block_msg:
 914                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 915             raise ExtractorError(msg, expected=True)
 916         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 917                 and 'blocklist.rkn.gov.ru' in content):
 918             raise ExtractorError(
 919                 'Access to this webpage has been blocked by decision of the Russian government. '
 920                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 921                 expected=True)
 922
 923     def _request_dump_filename(self, url, video_id):
 924         basen = f'{video_id}_{url}'
 925         trim_length = self.get_param('trim_file_name') or 240
 926         if len(basen) > trim_length:
 927             h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 928             basen = basen[:trim_length - len(h)] + h
 929         filename = sanitize_filename(f'{basen}.dump', restricted=True)
 930         # Working around MAX_PATH limitation on Windows (see
 931         # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 932         if compat_os_name == 'nt':
 933             absfilepath = os.path.abspath(filename)
 934             if len(absfilepath) > 259:
 935                 filename = fR'\\?\{absfilepath}'
 936         return filename
 937
 938     def __decode_webpage(self, webpage_bytes, encoding, headers):
 939         if not encoding:
 940             encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
 941         try:
 942             return webpage_bytes.decode(encoding, 'replace')
 943         except LookupError:
 944             return webpage_bytes.decode('utf-8', 'replace')
 945
 946     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 947         webpage_bytes = urlh.read()
 948         if prefix is not None:
 949             webpage_bytes = prefix + webpage_bytes
 950         if self.get_param('dump_intermediate_pages', False):
 951             self.to_screen('Dumping request to ' + urlh.geturl())
 952             dump = base64.b64encode(webpage_bytes).decode('ascii')
 953             self._downloader.to_screen(dump)
 954         if self.get_param('write_pages'):
 955             filename = self._request_dump_filename(urlh.geturl(), video_id)
 956             self.to_screen(f'Saving request to {filename}')
 957             with open(filename, 'wb') as outf:
 958                 outf.write(webpage_bytes)
 959
 960         content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
 961         self.__check_blocked(content)
 962
 963         return content
 964
 965     def __print_error(self, errnote, fatal, video_id, err):
 966         if fatal:
 967             raise ExtractorError(f'{video_id}: {errnote}', cause=err)
 968         elif errnote:
 969             self.report_warning(f'{video_id}: {errnote}: {err}')
 970
 971     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
 972         if transform_source:
 973             xml_string = transform_source(xml_string)
 974         try:
 975             return compat_etree_fromstring(xml_string.encode('utf-8'))
 976         except xml.etree.ElementTree.ParseError as ve:
 977             self.__print_error('Failed to parse XML' if errnote is None else errnote, fatal, video_id, ve)
 978
 979     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
 980         try:
 981             return json.loads(
 982                 json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
 983         except ValueError as ve:
 984             self.__print_error('Failed to parse JSON' if errnote is None else errnote, fatal, video_id, ve)
 985
 986     def _parse_socket_response_as_json(self, data, *args, **kwargs):
 987         return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
 988
 989     def __create_download_methods(name, parser, note, errnote, return_value):
 990
 991         def parse(ie, content, *args, errnote=errnote, **kwargs):
 992             if parser is None:
 993                 return content
 994             if errnote is False:
 995                 kwargs['errnote'] = errnote
 996             # parser is fetched by name so subclasses can override it
 997             return getattr(ie, parser)(content, *args, **kwargs)
 998
 999         def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
1000                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
1001             res = self._download_webpage_handle(
1002                 url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
1003                 data=data, headers=headers, query=query, expected_status=expected_status)
1004             if res is False:
1005                 return res
1006             content, urlh = res
1007             return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh
1008
1009         def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
1010                              fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
1011             if self.get_param('load_pages'):
1012                 url_or_request = self._create_request(url_or_request, data, headers, query)
1013                 filename = self._request_dump_filename(url_or_request.full_url, video_id)
1014                 self.to_screen(f'Loading request from {filename}')
1015                 try:
1016                     with open(filename, 'rb') as dumpf:
1017                         webpage_bytes = dumpf.read()
1018                 except OSError as e:
1019                     self.report_warning(f'Unable to load request from disk: {e}')
1020                 else:
1021                     content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
1022                     return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
1023             kwargs = {
1024                 'note': note,
1025                 'errnote': errnote,
1026                 'transform_source': transform_source,
1027                 'fatal': fatal,
1028                 'encoding': encoding,
1029                 'data': data,
1030                 'headers': headers,
1031                 'query': query,
1032                 'expected_status': expected_status,
1033             }
1034             if parser is None:
1035                 kwargs.pop('transform_source')
1036             # The method is fetched by name so subclasses can override _download_..._handle
1037             res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
1038             return res if res is False else res[0]
1039
1040         def impersonate(func, name, return_value):
1041             func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
1042             func.__doc__ = f'''
1043                 @param transform_source     Apply this transformation before parsing
1044                 @returns                    {return_value}
1045
1046                 See _download_webpage_handle docstring for other arguments specification
1047             '''
1048
1049         impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
1050         impersonate(download_content, f'_download_{name}', f'{return_value}')
1051         return download_handle, download_content
1052
1053     _download_xml_handle, _download_xml = __create_download_methods(
1054         'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
1055     _download_json_handle, _download_json = __create_download_methods(
1056         'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
1057     _download_socket_json_handle, _download_socket_json = __create_download_methods(
1058         'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
1059     __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
1060
1061     def _download_webpage(
1062             self, url_or_request, video_id, note=None, errnote=None,
1063             fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
1064         """
1065         Return the data of the page as a string.
1066
1067         Keyword arguments:
1068         tries -- number of tries
1069         timeout -- sleep interval between tries
1070
1071         See _download_webpage_handle docstring for other arguments specification.
1072         """
1073
1074         R''' # NB: These are unused; should they be deprecated?
1075         if tries != 1:
1076             self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
1077         if timeout is NO_DEFAULT:
1078             timeout = 5
1079         else:
1080             self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
1081         '''
1082
1083         try_count = 0
1084         while True:
1085             try:
1086                 return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
1087             except http.client.IncompleteRead as e:
1088                 try_count += 1
1089                 if try_count >= tries:
1090                     raise e
1091                 self._sleep(timeout, video_id)
1092
1093     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1094         idstr = format_field(video_id, None, '%s: ')
1095         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1096         if only_once:
1097             if f'WARNING: {msg}' in self._printed_messages:
1098                 return
1099             self._printed_messages.add(f'WARNING: {msg}')
1100         self._downloader.report_warning(msg, *args, **kwargs)
1101
1102     def to_screen(self, msg, *args, **kwargs):
1103         """Print msg to screen, prefixing it with '[ie_name]'"""
1104         self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1105
1106     def write_debug(self, msg, *args, **kwargs):
1107         self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1108
1109     def get_param(self, name, default=None, *args, **kwargs):
1110         if self._downloader:
1111             return self._downloader.params.get(name, default, *args, **kwargs)
1112         return default
1113
1114     def report_drm(self, video_id, partial=NO_DEFAULT):
1115         if partial is not NO_DEFAULT:
1116             self._downloader.deprecation_warning('InfoExtractor.report_drm no longer accepts the argument partial')
1117         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1118
1119     def report_extraction(self, id_or_name):
1120         """Report information extraction."""
1121         self.to_screen('%s: Extracting information' % id_or_name)
1122
1123     def report_download_webpage(self, video_id):
1124         """Report webpage download."""
1125         self.to_screen('%s: Downloading webpage' % video_id)
1126
1127     def report_age_confirmation(self):
1128         """Report attempt to confirm age."""
1129         self.to_screen('Confirming age')
1130
1131     def report_login(self):
1132         """Report attempt to log in."""
1133         self.to_screen('Logging in')
1134
1135     def raise_login_required(
1136             self, msg='This video is only available for registered users',
1137             metadata_available=False, method=NO_DEFAULT):
1138         if metadata_available and (
1139                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1140             self.report_warning(msg)
1141             return
1142         msg += format_field(self._login_hint(method), None, '. %s')
1143         raise ExtractorError(msg, expected=True)
1144
1145     def raise_geo_restricted(
1146             self, msg='This video is not available from your location due to geo restriction',
1147             countries=None, metadata_available=False):
1148         if metadata_available and (
1149                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1150             self.report_warning(msg)
1151         else:
1152             raise GeoRestrictedError(msg, countries=countries)
1153
1154     def raise_no_formats(self, msg, expected=False, video_id=None):
1155         if expected and (
1156                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1157             self.report_warning(msg, video_id)
1158         elif isinstance(msg, ExtractorError):
1159             raise msg
1160         else:
1161             raise ExtractorError(msg, expected=expected, video_id=video_id)
1162
1163     # Methods for following #608
1164     @staticmethod
1165     def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1166         """Returns a URL that points to a page that should be processed"""
1167         if ie is not None:
1168             kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1169         if video_id is not None:
1170             kwargs['id'] = video_id
1171         if video_title is not None:
1172             kwargs['title'] = video_title
1173         return {
1174             **kwargs,
1175             '_type': 'url_transparent' if url_transparent else 'url',
1176             'url': url,
1177         }
1178
1179     @classmethod
1180     def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
1181                               getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
1182         return cls.playlist_result(
1183             (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
1184             playlist_id, playlist_title, **kwargs)
1185
1186     @staticmethod
1187     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1188         """Returns a playlist"""
1189         if playlist_id:
1190             kwargs['id'] = playlist_id
1191         if playlist_title:
1192             kwargs['title'] = playlist_title
1193         if playlist_description is not None:
1194             kwargs['description'] = playlist_description
1195         return {
1196             **kwargs,
1197             '_type': 'multi_video' if multi_video else 'playlist',
1198             'entries': entries,
1199         }
1200
1201     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1202         """
1203         Perform a regex search on the given string, using a single or a list of
1204         patterns returning the first matching group.
1205         In case of failure return a default value or raise a WARNING or a
1206         RegexNotFoundError, depending on fatal, specifying the field name.
1207         """
1208         if string is None:
1209             mobj = None
1210         elif isinstance(pattern, (str, re.Pattern)):
1211             mobj = re.search(pattern, string, flags)
1212         else:
1213             for p in pattern:
1214                 mobj = re.search(p, string, flags)
1215                 if mobj:
1216                     break
1217
1218         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1219
1220         if mobj:
1221             if group is None:
1222                 # return the first matching group
1223                 return next(g for g in mobj.groups() if g is not None)
1224             elif isinstance(group, (list, tuple)):
1225                 return tuple(mobj.group(g) for g in group)
1226             else:
1227                 return mobj.group(group)
1228         elif default is not NO_DEFAULT:
1229             return default
1230         elif fatal:
1231             raise RegexNotFoundError('Unable to extract %s' % _name)
1232         else:
1233             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1234             return None
1235
1236     def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
1237                      contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs):
1238         """Searches string for the JSON object specified by start_pattern"""
1239         # NB: end_pattern is only used to reduce the size of the initial match
1240         if default is NO_DEFAULT:
1241             default, has_default = {}, False
1242         else:
1243             fatal, has_default = False, True
1244
1245         json_string = self._search_regex(
1246             rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})',
1247             string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
1248         if not json_string:
1249             return default
1250
1251         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1252         try:
1253             return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
1254         except ExtractorError as e:
1255             if fatal:
1256                 raise ExtractorError(
1257                     f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
1258             elif not has_default:
1259                 self.report_warning(
1260                     f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
1261         return default
1262
1263     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1264         """
1265         Like _search_regex, but strips HTML tags and unescapes entities.
1266         """
1267         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1268         if res:
1269             return clean_html(res).strip()
1270         else:
1271             return res
1272
1273     def _get_netrc_login_info(self, netrc_machine=None):
1274         username = None
1275         password = None
1276         netrc_machine = netrc_machine or self._NETRC_MACHINE
1277
1278         if self.get_param('usenetrc', False):
1279             try:
1280                 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1281                 if os.path.isdir(netrc_file):
1282                     netrc_file = os.path.join(netrc_file, '.netrc')
1283                 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1284                 if info is not None:
1285                     username = info[0]
1286                     password = info[2]
1287                 else:
1288                     raise netrc.NetrcParseError(
1289                         'No authenticators for %s' % netrc_machine)
1290             except (OSError, netrc.NetrcParseError) as err:
1291                 self.report_warning(
1292                     'parsing .netrc: %s' % error_to_compat_str(err))
1293
1294         return username, password
1295
1296     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1297         """
1298         Get the login info as (username, password)
1299         First look for the manually specified credentials using username_option
1300         and password_option as keys in params dictionary. If no such credentials
1301         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1302         value.
1303         If there's no info available, return (None, None)
1304         """
1305
1306         # Attempt to use provided username and password or .netrc data
1307         username = self.get_param(username_option)
1308         if username is not None:
1309             password = self.get_param(password_option)
1310         else:
1311             username, password = self._get_netrc_login_info(netrc_machine)
1312
1313         return username, password
1314
1315     def _get_tfa_info(self, note='two-factor verification code'):
1316         """
1317         Get the two-factor authentication info
1318         TODO - asking the user will be required for sms/phone verify
1319         currently just uses the command line option
1320         If there's no info available, return None
1321         """
1322
1323         tfa = self.get_param('twofactor')
1324         if tfa is not None:
1325             return tfa
1326
1327         return getpass.getpass('Type %s and press [Return]: ' % note)
1328
1329     # Helper functions for extracting OpenGraph info
1330     @staticmethod
1331     def _og_regexes(prop):
1332         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1333         property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1334                        % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
1335         template = r'<meta[^>]+?%s[^>]+?%s'
1336         return [
1337             template % (property_re, content_re),
1338             template % (content_re, property_re),
1339         ]
1340
1341     @staticmethod
1342     def _meta_regex(prop):
1343         return r'''(?isx)<meta
1344                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1345                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1346
1347     def _og_search_property(self, prop, html, name=None, **kargs):
1348         prop = variadic(prop)
1349         if name is None:
1350             name = 'OpenGraph %s' % prop[0]
1351         og_regexes = []
1352         for p in prop:
1353             og_regexes.extend(self._og_regexes(p))
1354         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1355         if escaped is None:
1356             return None
1357         return unescapeHTML(escaped)
1358
1359     def _og_search_thumbnail(self, html, **kargs):
1360         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1361
1362     def _og_search_description(self, html, **kargs):
1363         return self._og_search_property('description', html, fatal=False, **kargs)
1364
1365     def _og_search_title(self, html, *, fatal=False, **kargs):
1366         return self._og_search_property('title', html, fatal=fatal, **kargs)
1367
1368     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1369         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1370         if secure:
1371             regexes = self._og_regexes('video:secure_url') + regexes
1372         return self._html_search_regex(regexes, html, name, **kargs)
1373
1374     def _og_search_url(self, html, **kargs):
1375         return self._og_search_property('url', html, **kargs)
1376
1377     def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1378         return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1379
1380     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1381         name = variadic(name)
1382         if display_name is None:
1383             display_name = name[0]
1384         return self._html_search_regex(
1385             [self._meta_regex(n) for n in name],
1386             html, display_name, fatal=fatal, group='content', **kwargs)
1387
1388     def _dc_search_uploader(self, html):
1389         return self._html_search_meta('dc.creator', html, 'uploader')
1390
1391     @staticmethod
1392     def _rta_search(html):
1393         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1394         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1395                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1396                      html):
1397             return 18
1398
1399         # And then there are the jokers who advertise that they use RTA, but actually don't.
1400         AGE_LIMIT_MARKERS = [
1401             r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
1402         ]
1403         if any(re.search(marker, html) for marker in AGE_LIMIT_MARKERS):
1404             return 18
1405         return 0
1406
1407     def _media_rating_search(self, html):
1408         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1409         rating = self._html_search_meta('rating', html)
1410
1411         if not rating:
1412             return None
1413
1414         RATING_TABLE = {
1415             'safe for kids': 0,
1416             'general': 8,
1417             '14 years': 14,
1418             'mature': 17,
1419             'restricted': 19,
1420         }
1421         return RATING_TABLE.get(rating.lower())
1422
1423     def _family_friendly_search(self, html):
1424         # See http://schema.org/VideoObject
1425         family_friendly = self._html_search_meta(
1426             'isFamilyFriendly', html, default=None)
1427
1428         if not family_friendly:
1429             return None
1430
1431         RATING_TABLE = {
1432             '1': 0,
1433             'true': 0,
1434             '0': 18,
1435             'false': 18,
1436         }
1437         return RATING_TABLE.get(family_friendly.lower())
1438
1439     def _twitter_search_player(self, html):
1440         return self._html_search_meta('twitter:player', html,
1441                                       'twitter card player')
1442
1443     def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
1444         """Yield all json ld objects in the html"""
1445         if default is not NO_DEFAULT:
1446             fatal = False
1447         for mobj in re.finditer(JSON_LD_RE, html):
1448             json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
1449             for json_ld in variadic(json_ld_item):
1450                 if isinstance(json_ld, dict):
1451                     yield json_ld
1452
1453     def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
1454         """Search for a video in any json ld in the html"""
1455         if default is not NO_DEFAULT:
1456             fatal = False
1457         info = self._json_ld(
1458             list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
1459             video_id, fatal=fatal, expected_type=expected_type)
1460         if info:
1461             return info
1462         if default is not NO_DEFAULT:
1463             return default
1464         elif fatal:
1465             raise RegexNotFoundError('Unable to extract JSON-LD')
1466         else:
1467             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1468             return {}
1469
1470     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1471         if isinstance(json_ld, str):
1472             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1473         if not json_ld:
1474             return {}
1475         info = {}
1476
1477         INTERACTION_TYPE_MAP = {
1478             'CommentAction': 'comment',
1479             'AgreeAction': 'like',
1480             'DisagreeAction': 'dislike',
1481             'LikeAction': 'like',
1482             'DislikeAction': 'dislike',
1483             'ListenAction': 'view',
1484             'WatchAction': 'view',
1485             'ViewAction': 'view',
1486         }
1487
1488         def is_type(e, *expected_types):
1489             type = variadic(traverse_obj(e, '@type'))
1490             return any(x in type for x in expected_types)
1491
1492         def extract_interaction_type(e):
1493             interaction_type = e.get('interactionType')
1494             if isinstance(interaction_type, dict):
1495                 interaction_type = interaction_type.get('@type')
1496             return str_or_none(interaction_type)
1497
1498         def extract_interaction_statistic(e):
1499             interaction_statistic = e.get('interactionStatistic')
1500             if isinstance(interaction_statistic, dict):
1501                 interaction_statistic = [interaction_statistic]
1502             if not isinstance(interaction_statistic, list):
1503                 return
1504             for is_e in interaction_statistic:
1505                 if not is_type(is_e, 'InteractionCounter'):
1506                     continue
1507                 interaction_type = extract_interaction_type(is_e)
1508                 if not interaction_type:
1509                     continue
1510                 # For interaction count some sites provide string instead of
1511                 # an integer (as per spec) with non digit characters (e.g. ",")
1512                 # so extracting count with more relaxed str_to_int
1513                 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1514                 if interaction_count is None:
1515                     continue
1516                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1517                 if not count_kind:
1518                     continue
1519                 count_key = '%s_count' % count_kind
1520                 if info.get(count_key) is not None:
1521                     continue
1522                 info[count_key] = interaction_count
1523
1524         def extract_chapter_information(e):
1525             chapters = [{
1526                 'title': part.get('name'),
1527                 'start_time': part.get('startOffset'),
1528                 'end_time': part.get('endOffset'),
1529             } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
1530             for idx, (last_c, current_c, next_c) in enumerate(zip(
1531                     [{'end_time': 0}] + chapters, chapters, chapters[1:])):
1532                 current_c['end_time'] = current_c['end_time'] or next_c['start_time']
1533                 current_c['start_time'] = current_c['start_time'] or last_c['end_time']
1534                 if None in current_c.values():
1535                     self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
1536                     return
1537             if chapters:
1538                 chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
1539                 info['chapters'] = chapters
1540
1541         def extract_video_object(e):
1542             author = e.get('author')
1543             info.update({
1544                 'url': url_or_none(e.get('contentUrl')),
1545                 'ext': mimetype2ext(e.get('encodingFormat')),
1546                 'title': unescapeHTML(e.get('name')),
1547                 'description': unescapeHTML(e.get('description')),
1548                 'thumbnails': [{'url': unescapeHTML(url)}
1549                                for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
1550                                if url_or_none(url)],
1551                 'duration': parse_duration(e.get('duration')),
1552                 'timestamp': unified_timestamp(e.get('uploadDate')),
1553                 # author can be an instance of 'Organization' or 'Person' types.
1554                 # both types can have 'name' property(inherited from 'Thing' type). [1]
1555                 # however some websites are using 'Text' type instead.
1556                 # 1. https://schema.org/VideoObject
1557                 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
1558                 'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str),
1559                 'filesize': int_or_none(float_or_none(e.get('contentSize'))),
1560                 'tbr': int_or_none(e.get('bitrate')),
1561                 'width': int_or_none(e.get('width')),
1562                 'height': int_or_none(e.get('height')),
1563                 'view_count': int_or_none(e.get('interactionCount')),
1564                 'tags': try_call(lambda: e.get('keywords').split(',')),
1565             })
1566             if is_type(e, 'AudioObject'):
1567                 info.update({
1568                     'vcodec': 'none',
1569                     'abr': int_or_none(e.get('bitrate')),
1570                 })
1571             extract_interaction_statistic(e)
1572             extract_chapter_information(e)
1573
1574         def traverse_json_ld(json_ld, at_top_level=True):
1575             for e in variadic(json_ld):
1576                 if not isinstance(e, dict):
1577                     continue
1578                 if at_top_level and '@context' not in e:
1579                     continue
1580                 if at_top_level and set(e.keys()) == {'@context', '@graph'}:
1581                     traverse_json_ld(e['@graph'], at_top_level=False)
1582                     continue
1583                 if expected_type is not None and not is_type(e, expected_type):
1584                     continue
1585                 rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
1586                 if rating is not None:
1587                     info['average_rating'] = rating
1588                 if is_type(e, 'TVEpisode', 'Episode'):
1589                     episode_name = unescapeHTML(e.get('name'))
1590                     info.update({
1591                         'episode': episode_name,
1592                         'episode_number': int_or_none(e.get('episodeNumber')),
1593                         'description': unescapeHTML(e.get('description')),
1594                     })
1595                     if not info.get('title') and episode_name:
1596                         info['title'] = episode_name
1597                     part_of_season = e.get('partOfSeason')
1598                     if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
1599                         info.update({
1600                             'season': unescapeHTML(part_of_season.get('name')),
1601                             'season_number': int_or_none(part_of_season.get('seasonNumber')),
1602                         })
1603                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1604                     if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
1605                         info['series'] = unescapeHTML(part_of_series.get('name'))
1606                 elif is_type(e, 'Movie'):
1607                     info.update({
1608                         'title': unescapeHTML(e.get('name')),
1609                         'description': unescapeHTML(e.get('description')),
1610                         'duration': parse_duration(e.get('duration')),
1611                         'timestamp': unified_timestamp(e.get('dateCreated')),
1612                     })
1613                 elif is_type(e, 'Article', 'NewsArticle'):
1614                     info.update({
1615                         'timestamp': parse_iso8601(e.get('datePublished')),
1616                         'title': unescapeHTML(e.get('headline')),
1617                         'description': unescapeHTML(e.get('articleBody') or e.get('description')),
1618                     })
1619                     if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
1620                         extract_video_object(e['video'][0])
1621                     elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
1622                         extract_video_object(e['subjectOf'][0])
1623                 elif is_type(e, 'VideoObject', 'AudioObject'):
1624                     extract_video_object(e)
1625                     if expected_type is None:
1626                         continue
1627                     else:
1628                         break
1629                 video = e.get('video')
1630                 if is_type(video, 'VideoObject'):
1631                     extract_video_object(video)
1632                 if expected_type is None:
1633                     continue
1634                 else:
1635                     break
1636
1637         traverse_json_ld(json_ld)
1638         return filter_dict(info)
1639
1640     def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1641         return self._parse_json(
1642             self._search_regex(
1643                 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1644                 webpage, 'next.js data', fatal=fatal, **kw),
1645             video_id, transform_source=transform_source, fatal=fatal)
1646
1647     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
1648         """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
1649         rectx = re.escape(context_name)
1650         FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
1651         js, arg_keys, arg_vals = self._search_regex(
1652             (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
1653             webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
1654             default=NO_DEFAULT if fatal else (None, None, None))
1655         if js is None:
1656             return {}
1657
1658         args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
1659
1660         for key, val in args.items():
1661             if val in ('undefined', 'void 0'):
1662                 args[key] = 'null'
1663
1664         ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
1665         return traverse_obj(ret, traverse) or {}
1666
1667     @staticmethod
1668     def _hidden_inputs(html):
1669         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1670         hidden_inputs = {}
1671         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1672             attrs = extract_attributes(input)
1673             if not input:
1674                 continue
1675             if attrs.get('type') not in ('hidden', 'submit'):
1676                 continue
1677             name = attrs.get('name') or attrs.get('id')
1678             value = attrs.get('value')
1679             if name and value is not None:
1680                 hidden_inputs[name] = value
1681         return hidden_inputs
1682
1683     def _form_hidden_inputs(self, form_id, html):
1684         form = self._search_regex(
1685             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1686             html, '%s form' % form_id, group='form')
1687         return self._hidden_inputs(form)
1688
1689     class FormatSort:
1690         regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
1691
1692         default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
1693                    'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
1694                    'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
1695         ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
1696                         'height', 'width', 'proto', 'vext', 'abr', 'aext',
1697                         'fps', 'fs_approx', 'source', 'id')
1698
1699         settings = {
1700             'vcodec': {'type': 'ordered', 'regex': True,
1701                        'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1702             'acodec': {'type': 'ordered', 'regex': True,
1703                        'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
1704             'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
1705                     'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
1706             'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1707                       'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
1708             'vext': {'type': 'ordered', 'field': 'video_ext',
1709                      'order': ('mp4', 'webm', 'flv', '', 'none'),
1710                      'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1711             'aext': {'type': 'ordered', 'field': 'audio_ext',
1712                      'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1713                      'order_free': ('ogg', 'opus', 'webm', 'mp3', 'm4a', 'aac', '', 'none')},
1714             'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1715             'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
1716                            'field': ('vcodec', 'acodec'),
1717                            'function': lambda it: int(any(v != 'none' for v in it))},
1718             'ie_pref': {'priority': True, 'type': 'extractor'},
1719             'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1720             'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1721             'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
1722             'quality': {'convert': 'float', 'default': -1},
1723             'filesize': {'convert': 'bytes'},
1724             'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1725             'id': {'convert': 'string', 'field': 'format_id'},
1726             'height': {'convert': 'float_none'},
1727             'width': {'convert': 'float_none'},
1728             'fps': {'convert': 'float_none'},
1729             'channels': {'convert': 'float_none', 'field': 'audio_channels'},
1730             'tbr': {'convert': 'float_none'},
1731             'vbr': {'convert': 'float_none'},
1732             'abr': {'convert': 'float_none'},
1733             'asr': {'convert': 'float_none'},
1734             'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
1735
1736             'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1737             'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1738             'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1739             'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1740             'res': {'type': 'multiple', 'field': ('height', 'width'),
1741                     'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
1742
1743             # Actual field names
1744             'format_id': {'type': 'alias', 'field': 'id'},
1745             'preference': {'type': 'alias', 'field': 'ie_pref'},
1746             'language_preference': {'type': 'alias', 'field': 'lang'},
1747             'source_preference': {'type': 'alias', 'field': 'source'},
1748             'protocol': {'type': 'alias', 'field': 'proto'},
1749             'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1750             'audio_channels': {'type': 'alias', 'field': 'channels'},
1751
1752             # Deprecated
1753             'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
1754             'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
1755             'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
1756             'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
1757             'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
1758             'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
1759             'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
1760             'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
1761             'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
1762             'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
1763             'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
1764             'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
1765             'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
1766             'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
1767             'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
1768             'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
1769             'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
1770             'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
1771             'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
1772             'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
1773         }
1774
1775         def __init__(self, ie, field_preference):
1776             self._order = []
1777             self.ydl = ie._downloader
1778             self.evaluate_params(self.ydl.params, field_preference)
1779             if ie.get_param('verbose'):
1780                 self.print_verbose_info(self.ydl.write_debug)
1781
1782         def _get_field_setting(self, field, key):
1783             if field not in self.settings:
1784                 if key in ('forced', 'priority'):
1785                     return False
1786                 self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
1787                                             'deprecated and may be removed in a future version')
1788                 self.settings[field] = {}
1789             propObj = self.settings[field]
1790             if key not in propObj:
1791                 type = propObj.get('type')
1792                 if key == 'field':
1793                     default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1794                 elif key == 'convert':
1795                     default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1796                 else:
1797                     default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1798                 propObj[key] = default
1799             return propObj[key]
1800
1801         def _resolve_field_value(self, field, value, convertNone=False):
1802             if value is None:
1803                 if not convertNone:
1804                     return None
1805             else:
1806                 value = value.lower()
1807             conversion = self._get_field_setting(field, 'convert')
1808             if conversion == 'ignore':
1809                 return None
1810             if conversion == 'string':
1811                 return value
1812             elif conversion == 'float_none':
1813                 return float_or_none(value)
1814             elif conversion == 'bytes':
1815                 return FileDownloader.parse_bytes(value)
1816             elif conversion == 'order':
1817                 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1818                 use_regex = self._get_field_setting(field, 'regex')
1819                 list_length = len(order_list)
1820                 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1821                 if use_regex and value is not None:
1822                     for i, regex in enumerate(order_list):
1823                         if regex and re.match(regex, value):
1824                             return list_length - i
1825                     return list_length - empty_pos  # not in list
1826                 else:  # not regex or  value = None
1827                     return list_length - (order_list.index(value) if value in order_list else empty_pos)
1828             else:
1829                 if value.isnumeric():
1830                     return float(value)
1831                 else:
1832                     self.settings[field]['convert'] = 'string'
1833                     return value
1834
1835         def evaluate_params(self, params, sort_extractor):
1836             self._use_free_order = params.get('prefer_free_formats', False)
1837             self._sort_user = params.get('format_sort', [])
1838             self._sort_extractor = sort_extractor
1839
1840             def add_item(field, reverse, closest, limit_text):
1841                 field = field.lower()
1842                 if field in self._order:
1843                     return
1844                 self._order.append(field)
1845                 limit = self._resolve_field_value(field, limit_text)
1846                 data = {
1847                     'reverse': reverse,
1848                     'closest': False if limit is None else closest,
1849                     'limit_text': limit_text,
1850                     'limit': limit}
1851                 if field in self.settings:
1852                     self.settings[field].update(data)
1853                 else:
1854                     self.settings[field] = data
1855
1856             sort_list = (
1857                 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1858                 + (tuple() if params.get('format_sort_force', False)
1859                    else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1860                 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1861
1862             for item in sort_list:
1863                 match = re.match(self.regex, item)
1864                 if match is None:
1865                     raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1866                 field = match.group('field')
1867                 if field is None:
1868                     continue
1869                 if self._get_field_setting(field, 'type') == 'alias':
1870                     alias, field = field, self._get_field_setting(field, 'field')
1871                     if self._get_field_setting(alias, 'deprecated'):
1872                         self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
1873                                                     f'be removed in a future version. Please use {field} instead')
1874                 reverse = match.group('reverse') is not None
1875                 closest = match.group('separator') == '~'
1876                 limit_text = match.group('limit')
1877
1878                 has_limit = limit_text is not None
1879                 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1880                 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1881
1882                 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1883                 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
1884                 limit_count = len(limits)
1885                 for (i, f) in enumerate(fields):
1886                     add_item(f, reverse, closest,
1887                              limits[i] if i < limit_count
1888                              else limits[0] if has_limit and not has_multiple_limits
1889                              else None)
1890
1891         def print_verbose_info(self, write_debug):
1892             if self._sort_user:
1893                 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1894             if self._sort_extractor:
1895                 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1896             write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1897                 '+' if self._get_field_setting(field, 'reverse') else '', field,
1898                 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1899                               self._get_field_setting(field, 'limit_text'),
1900                               self._get_field_setting(field, 'limit'))
1901                 if self._get_field_setting(field, 'limit_text') is not None else '')
1902                 for field in self._order if self._get_field_setting(field, 'visible')]))
1903
1904         def _calculate_field_preference_from_value(self, format, field, type, value):
1905             reverse = self._get_field_setting(field, 'reverse')
1906             closest = self._get_field_setting(field, 'closest')
1907             limit = self._get_field_setting(field, 'limit')
1908
1909             if type == 'extractor':
1910                 maximum = self._get_field_setting(field, 'max')
1911                 if value is None or (maximum is not None and value >= maximum):
1912                     value = -1
1913             elif type == 'boolean':
1914                 in_list = self._get_field_setting(field, 'in_list')
1915                 not_in_list = self._get_field_setting(field, 'not_in_list')
1916                 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1917             elif type == 'ordered':
1918                 value = self._resolve_field_value(field, value, True)
1919
1920             # try to convert to number
1921             val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1922             is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1923             if is_num:
1924                 value = val_num
1925
1926             return ((-10, 0) if value is None
1927                     else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
1928                     else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1929                     else (0, value, 0) if not reverse and (limit is None or value <= limit)
1930                     else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1931                     else (-1, value, 0))
1932
1933         def _calculate_field_preference(self, format, field):
1934             type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
1935             get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1936             if type == 'multiple':
1937                 type = 'field'  # Only 'field' is allowed in multiple for now
1938                 actual_fields = self._get_field_setting(field, 'field')
1939
1940                 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1941             else:
1942                 value = get_value(field)
1943             return self._calculate_field_preference_from_value(format, field, type, value)
1944
1945         def calculate_preference(self, format):
1946             # Determine missing protocol
1947             if not format.get('protocol'):
1948                 format['protocol'] = determine_protocol(format)
1949
1950             # Determine missing ext
1951             if not format.get('ext') and 'url' in format:
1952                 format['ext'] = determine_ext(format['url'])
1953             if format.get('vcodec') == 'none':
1954                 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1955                 format['video_ext'] = 'none'
1956             else:
1957                 format['video_ext'] = format['ext']
1958                 format['audio_ext'] = 'none'
1959             # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
1960             #    format['preference'] = -1000
1961
1962             # Determine missing bitrates
1963             if format.get('tbr') is None:
1964                 if format.get('vbr') is not None and format.get('abr') is not None:
1965                     format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1966             else:
1967                 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1968                     format['vbr'] = format.get('tbr') - format.get('abr', 0)
1969                 if format.get('acodec') != 'none' and format.get('abr') is None:
1970                     format['abr'] = format.get('tbr') - format.get('vbr', 0)
1971
1972             return tuple(self._calculate_field_preference(format, field) for field in self._order)
1973
1974     def _sort_formats(self, formats, field_preference=[]):
1975         if not formats:
1976             return
1977         formats.sort(key=self.FormatSort(self, field_preference).calculate_preference)
1978
1979     def _check_formats(self, formats, video_id):
1980         if formats:
1981             formats[:] = filter(
1982                 lambda f: self._is_valid_url(
1983                     f['url'], video_id,
1984                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1985                 formats)
1986
1987     @staticmethod
1988     def _remove_duplicate_formats(formats):
1989         format_urls = set()
1990         unique_formats = []
1991         for f in formats:
1992             if f['url'] not in format_urls:
1993                 format_urls.add(f['url'])
1994                 unique_formats.append(f)
1995         formats[:] = unique_formats
1996
1997     def _is_valid_url(self, url, video_id, item='video', headers={}):
1998         url = self._proto_relative_url(url, scheme='http:')
1999         # For now assume non HTTP(S) URLs always valid
2000         if not (url.startswith('http://') or url.startswith('https://')):
2001             return True
2002         try:
2003             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
2004             return True
2005         except ExtractorError as e:
2006             self.to_screen(
2007                 '%s: %s URL is invalid, skipping: %s'
2008                 % (video_id, item, error_to_compat_str(e.cause)))
2009             return False
2010
2011     def http_scheme(self):
2012         """ Either "http:" or "https:", depending on the user's preferences """
2013         return (
2014             'http:'
2015             if self.get_param('prefer_insecure', False)
2016             else 'https:')
2017
2018     def _proto_relative_url(self, url, scheme=None):
2019         scheme = scheme or self.http_scheme()
2020         assert scheme.endswith(':')
2021         return sanitize_url(url, scheme=scheme[:-1])
2022
2023     def _sleep(self, timeout, video_id, msg_template=None):
2024         if msg_template is None:
2025             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
2026         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
2027         self.to_screen(msg)
2028         time.sleep(timeout)
2029
2030     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
2031                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
2032                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
2033         res = self._download_xml_handle(
2034             manifest_url, video_id, 'Downloading f4m manifest',
2035             'Unable to download f4m manifest',
2036             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
2037             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
2038             transform_source=transform_source,
2039             fatal=fatal, data=data, headers=headers, query=query)
2040         if res is False:
2041             return []
2042
2043         manifest, urlh = res
2044         manifest_url = urlh.geturl()
2045
2046         return self._parse_f4m_formats(
2047             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2048             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
2049
2050     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
2051                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
2052                            fatal=True, m3u8_id=None):
2053         if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
2054             return []
2055
2056         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
2057         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
2058         if akamai_pv is not None and ';' in akamai_pv.text:
2059             playerVerificationChallenge = akamai_pv.text.split(';')[0]
2060             if playerVerificationChallenge.strip() != '':
2061                 return []
2062
2063         formats = []
2064         manifest_version = '1.0'
2065         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
2066         if not media_nodes:
2067             manifest_version = '2.0'
2068             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
2069         # Remove unsupported DRM protected media from final formats
2070         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
2071         media_nodes = remove_encrypted_media(media_nodes)
2072         if not media_nodes:
2073             return formats
2074
2075         manifest_base_url = get_base_url(manifest)
2076
2077         bootstrap_info = xpath_element(
2078             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
2079             'bootstrap info', default=None)
2080
2081         vcodec = None
2082         mime_type = xpath_text(
2083             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
2084             'base URL', default=None)
2085         if mime_type and mime_type.startswith('audio/'):
2086             vcodec = 'none'
2087
2088         for i, media_el in enumerate(media_nodes):
2089             tbr = int_or_none(media_el.attrib.get('bitrate'))
2090             width = int_or_none(media_el.attrib.get('width'))
2091             height = int_or_none(media_el.attrib.get('height'))
2092             format_id = join_nonempty(f4m_id, tbr or i)
2093             # If <bootstrapInfo> is present, the specified f4m is a
2094             # stream-level manifest, and only set-level manifests may refer to
2095             # external resources.  See section 11.4 and section 4 of F4M spec
2096             if bootstrap_info is None:
2097                 media_url = None
2098                 # @href is introduced in 2.0, see section 11.6 of F4M spec
2099                 if manifest_version == '2.0':
2100                     media_url = media_el.attrib.get('href')
2101                 if media_url is None:
2102                     media_url = media_el.attrib.get('url')
2103                 if not media_url:
2104                     continue
2105                 manifest_url = (
2106                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
2107                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
2108                 # If media_url is itself a f4m manifest do the recursive extraction
2109                 # since bitrates in parent manifest (this one) and media_url manifest
2110                 # may differ leading to inability to resolve the format by requested
2111                 # bitrate in f4m downloader
2112                 ext = determine_ext(manifest_url)
2113                 if ext == 'f4m':
2114                     f4m_formats = self._extract_f4m_formats(
2115                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2116                         transform_source=transform_source, fatal=fatal)
2117                     # Sometimes stream-level manifest contains single media entry that
2118                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
2119                     # At the same time parent's media entry in set-level manifest may
2120                     # contain it. We will copy it from parent in such cases.
2121                     if len(f4m_formats) == 1:
2122                         f = f4m_formats[0]
2123                         f.update({
2124                             'tbr': f.get('tbr') or tbr,
2125                             'width': f.get('width') or width,
2126                             'height': f.get('height') or height,
2127                             'format_id': f.get('format_id') if not tbr else format_id,
2128                             'vcodec': vcodec,
2129                         })
2130                     formats.extend(f4m_formats)
2131                     continue
2132                 elif ext == 'm3u8':
2133                     formats.extend(self._extract_m3u8_formats(
2134                         manifest_url, video_id, 'mp4', preference=preference,
2135                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
2136                     continue
2137             formats.append({
2138                 'format_id': format_id,
2139                 'url': manifest_url,
2140                 'manifest_url': manifest_url,
2141                 'ext': 'flv' if bootstrap_info is not None else None,
2142                 'protocol': 'f4m',
2143                 'tbr': tbr,
2144                 'width': width,
2145                 'height': height,
2146                 'vcodec': vcodec,
2147                 'preference': preference,
2148                 'quality': quality,
2149             })
2150         return formats
2151
2152     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
2153         return {
2154             'format_id': join_nonempty(m3u8_id, 'meta'),
2155             'url': m3u8_url,
2156             'ext': ext,
2157             'protocol': 'm3u8',
2158             'preference': preference - 100 if preference else -100,
2159             'quality': quality,
2160             'resolution': 'multiple',
2161             'format_note': 'Quality selection URL',
2162         }
2163
2164     def _report_ignoring_subs(self, name):
2165         self.report_warning(bug_reports_message(
2166             f'Ignoring subtitle tracks found in the {name} manifest; '
2167             'if any subtitle tracks are missing,'
2168         ), only_once=True)
2169
2170     def _extract_m3u8_formats(self, *args, **kwargs):
2171         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2172         if subs:
2173             self._report_ignoring_subs('HLS')
2174         return fmts
2175
2176     def _extract_m3u8_formats_and_subtitles(
2177             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2178             preference=None, quality=None, m3u8_id=None, note=None,
2179             errnote=None, fatal=True, live=False, data=None, headers={},
2180             query={}):
2181
2182         res = self._download_webpage_handle(
2183             m3u8_url, video_id,
2184             note='Downloading m3u8 information' if note is None else note,
2185             errnote='Failed to download m3u8 information' if errnote is None else errnote,
2186             fatal=fatal, data=data, headers=headers, query=query)
2187
2188         if res is False:
2189             return [], {}
2190
2191         m3u8_doc, urlh = res
2192         m3u8_url = urlh.geturl()
2193
2194         return self._parse_m3u8_formats_and_subtitles(
2195             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2196             preference=preference, quality=quality, m3u8_id=m3u8_id,
2197             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2198             headers=headers, query=query, video_id=video_id)
2199
2200     def _parse_m3u8_formats_and_subtitles(
2201             self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
2202             preference=None, quality=None, m3u8_id=None, live=False, note=None,
2203             errnote=None, fatal=True, data=None, headers={}, query={},
2204             video_id=None):
2205         formats, subtitles = [], {}
2206
2207         has_drm = re.search('|'.join([
2208             r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
2209             r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
2210         ]), m3u8_doc)
2211
2212         def format_url(url):
2213             return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)
2214
2215         if self.get_param('hls_split_discontinuity', False):
2216             def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2217                 if not m3u8_doc:
2218                     if not manifest_url:
2219                         return []
2220                     m3u8_doc = self._download_webpage(
2221                         manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2222                         note=False, errnote='Failed to download m3u8 playlist information')
2223                     if m3u8_doc is False:
2224                         return []
2225                 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2226
2227         else:
2228             def _extract_m3u8_playlist_indices(*args, **kwargs):
2229                 return [None]
2230
2231         # References:
2232         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2233         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2234         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2235
2236         # We should try extracting formats only from master playlists [1, 4.3.4],
2237         # i.e. playlists that describe available qualities. On the other hand
2238         # media playlists [1, 4.3.3] should be returned as is since they contain
2239         # just the media without qualities renditions.
2240         # Fortunately, master playlist can be easily distinguished from media
2241         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
2242         # master playlist tags MUST NOT appear in a media playlist and vice versa.
2243         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2244         # media playlist and MUST NOT appear in master playlist thus we can
2245         # clearly detect media playlist with this criterion.
2246
2247         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
2248             formats = [{
2249                 'format_id': join_nonempty(m3u8_id, idx),
2250                 'format_index': idx,
2251                 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
2252                 'ext': ext,
2253                 'protocol': entry_protocol,
2254                 'preference': preference,
2255                 'quality': quality,
2256                 'has_drm': has_drm,
2257             } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2258
2259             return formats, subtitles
2260
2261         groups = {}
2262         last_stream_inf = {}
2263
2264         def extract_media(x_media_line):
2265             media = parse_m3u8_attributes(x_media_line)
2266             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2267             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2268             if not (media_type and group_id and name):
2269                 return
2270             groups.setdefault(group_id, []).append(media)
2271             # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2272             if media_type == 'SUBTITLES':
2273                 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2274                 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2275                 # However, lack of URI has been spotted in the wild.
2276                 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2277                 if not media.get('URI'):
2278                     return
2279                 url = format_url(media['URI'])
2280                 sub_info = {
2281                     'url': url,
2282                     'ext': determine_ext(url),
2283                 }
2284                 if sub_info['ext'] == 'm3u8':
2285                     # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2286                     # files may contain is WebVTT:
2287                     # <https://tools.ietf.org/html/rfc8216#section-3.1>
2288                     sub_info['ext'] = 'vtt'
2289                     sub_info['protocol'] = 'm3u8_native'
2290                 lang = media.get('LANGUAGE') or 'und'
2291                 subtitles.setdefault(lang, []).append(sub_info)
2292             if media_type not in ('VIDEO', 'AUDIO'):
2293                 return
2294             media_url = media.get('URI')
2295             if media_url:
2296                 manifest_url = format_url(media_url)
2297                 formats.extend({
2298                     'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2299                     'format_note': name,
2300                     'format_index': idx,
2301                     'url': manifest_url,
2302                     'manifest_url': m3u8_url,
2303                     'language': media.get('LANGUAGE'),
2304                     'ext': ext,
2305                     'protocol': entry_protocol,
2306                     'preference': preference,
2307                     'quality': quality,
2308                     'vcodec': 'none' if media_type == 'AUDIO' else None,
2309                 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2310
2311         def build_stream_name():
2312             # Despite specification does not mention NAME attribute for
2313             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2314             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2315             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2316             stream_name = last_stream_inf.get('NAME')
2317             if stream_name:
2318                 return stream_name
2319             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2320             # from corresponding rendition group
2321             stream_group_id = last_stream_inf.get('VIDEO')
2322             if not stream_group_id:
2323                 return
2324             stream_group = groups.get(stream_group_id)
2325             if not stream_group:
2326                 return stream_group_id
2327             rendition = stream_group[0]
2328             return rendition.get('NAME') or stream_group_id
2329
2330         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2331         # chance to detect video only formats when EXT-X-STREAM-INF tags
2332         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2333         for line in m3u8_doc.splitlines():
2334             if line.startswith('#EXT-X-MEDIA:'):
2335                 extract_media(line)
2336
2337         for line in m3u8_doc.splitlines():
2338             if line.startswith('#EXT-X-STREAM-INF:'):
2339                 last_stream_inf = parse_m3u8_attributes(line)
2340             elif line.startswith('#') or not line.strip():
2341                 continue
2342             else:
2343                 tbr = float_or_none(
2344                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2345                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2346                 manifest_url = format_url(line.strip())
2347
2348                 for idx in _extract_m3u8_playlist_indices(manifest_url):
2349                     format_id = [m3u8_id, None, idx]
2350                     # Bandwidth of live streams may differ over time thus making
2351                     # format_id unpredictable. So it's better to keep provided
2352                     # format_id intact.
2353                     if not live:
2354                         stream_name = build_stream_name()
2355                         format_id[1] = stream_name or '%d' % (tbr or len(formats))
2356                     f = {
2357                         'format_id': join_nonempty(*format_id),
2358                         'format_index': idx,
2359                         'url': manifest_url,
2360                         'manifest_url': m3u8_url,
2361                         'tbr': tbr,
2362                         'ext': ext,
2363                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2364                         'protocol': entry_protocol,
2365                         'preference': preference,
2366                         'quality': quality,
2367                     }
2368                     resolution = last_stream_inf.get('RESOLUTION')
2369                     if resolution:
2370                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2371                         if mobj:
2372                             f['width'] = int(mobj.group('width'))
2373                             f['height'] = int(mobj.group('height'))
2374                     # Unified Streaming Platform
2375                     mobj = re.search(
2376                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2377                     if mobj:
2378                         abr, vbr = mobj.groups()
2379                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2380                         f.update({
2381                             'vbr': vbr,
2382                             'abr': abr,
2383                         })
2384                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2385                     f.update(codecs)
2386                     audio_group_id = last_stream_inf.get('AUDIO')
2387                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2388                     # references a rendition group MUST have a CODECS attribute.
2389                     # However, this is not always respected. E.g. [2]
2390                     # contains EXT-X-STREAM-INF tag which references AUDIO
2391                     # rendition group but does not have CODECS and despite
2392                     # referencing an audio group it represents a complete
2393                     # (with audio and video) format. So, for such cases we will
2394                     # ignore references to rendition groups and treat them
2395                     # as complete formats.
2396                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2397                         audio_group = groups.get(audio_group_id)
2398                         if audio_group and audio_group[0].get('URI'):
2399                             # TODO: update acodec for audio only formats with
2400                             # the same GROUP-ID
2401                             f['acodec'] = 'none'
2402                     if not f.get('ext'):
2403                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2404                     formats.append(f)
2405
2406                     # for DailyMotion
2407                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2408                     if progressive_uri:
2409                         http_f = f.copy()
2410                         del http_f['manifest_url']
2411                         http_f.update({
2412                             'format_id': f['format_id'].replace('hls-', 'http-'),
2413                             'protocol': 'http',
2414                             'url': progressive_uri,
2415                         })
2416                         formats.append(http_f)
2417
2418                 last_stream_inf = {}
2419         return formats, subtitles
2420
2421     def _extract_m3u8_vod_duration(
2422             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2423
2424         m3u8_vod = self._download_webpage(
2425             m3u8_vod_url, video_id,
2426             note='Downloading m3u8 VOD manifest' if note is None else note,
2427             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2428             fatal=False, data=data, headers=headers, query=query)
2429
2430         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2431
2432     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2433         if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2434             return None
2435
2436         return int(sum(
2437             float(line[len('#EXTINF:'):].split(',')[0])
2438             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2439
2440     @staticmethod
2441     def _xpath_ns(path, namespace=None):
2442         if not namespace:
2443             return path
2444         out = []
2445         for c in path.split('/'):
2446             if not c or c == '.':
2447                 out.append(c)
2448             else:
2449                 out.append('{%s}%s' % (namespace, c))
2450         return '/'.join(out)
2451
2452     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2453         res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2454         if res is False:
2455             assert not fatal
2456             return [], {}
2457
2458         smil, urlh = res
2459         smil_url = urlh.geturl()
2460
2461         namespace = self._parse_smil_namespace(smil)
2462
2463         fmts = self._parse_smil_formats(
2464             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2465         subs = self._parse_smil_subtitles(
2466             smil, namespace=namespace)
2467
2468         return fmts, subs
2469
2470     def _extract_smil_formats(self, *args, **kwargs):
2471         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2472         if subs:
2473             self._report_ignoring_subs('SMIL')
2474         return fmts
2475
2476     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2477         res = self._download_smil(smil_url, video_id, fatal=fatal)
2478         if res is False:
2479             return {}
2480
2481         smil, urlh = res
2482         smil_url = urlh.geturl()
2483
2484         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2485
2486     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2487         return self._download_xml_handle(
2488             smil_url, video_id, 'Downloading SMIL file',
2489             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2490
2491     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2492         namespace = self._parse_smil_namespace(smil)
2493
2494         formats = self._parse_smil_formats(
2495             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2496         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2497
2498         video_id = os.path.splitext(url_basename(smil_url))[0]
2499         title = None
2500         description = None
2501         upload_date = None
2502         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2503             name = meta.attrib.get('name')
2504             content = meta.attrib.get('content')
2505             if not name or not content:
2506                 continue
2507             if not title and name == 'title':
2508                 title = content
2509             elif not description and name in ('description', 'abstract'):
2510                 description = content
2511             elif not upload_date and name == 'date':
2512                 upload_date = unified_strdate(content)
2513
2514         thumbnails = [{
2515             'id': image.get('type'),
2516             'url': image.get('src'),
2517             'width': int_or_none(image.get('width')),
2518             'height': int_or_none(image.get('height')),
2519         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2520
2521         return {
2522             'id': video_id,
2523             'title': title or video_id,
2524             'description': description,
2525             'upload_date': upload_date,
2526             'thumbnails': thumbnails,
2527             'formats': formats,
2528             'subtitles': subtitles,
2529         }
2530
2531     def _parse_smil_namespace(self, smil):
2532         return self._search_regex(
2533             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2534
2535     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2536         base = smil_url
2537         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2538             b = meta.get('base') or meta.get('httpBase')
2539             if b:
2540                 base = b
2541                 break
2542
2543         formats = []
2544         rtmp_count = 0
2545         http_count = 0
2546         m3u8_count = 0
2547         imgs_count = 0
2548
2549         srcs = set()
2550         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2551         for medium in media:
2552             src = medium.get('src')
2553             if not src or src in srcs:
2554                 continue
2555             srcs.add(src)
2556
2557             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2558             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2559             width = int_or_none(medium.get('width'))
2560             height = int_or_none(medium.get('height'))
2561             proto = medium.get('proto')
2562             ext = medium.get('ext')
2563             src_ext = determine_ext(src)
2564             streamer = medium.get('streamer') or base
2565
2566             if proto == 'rtmp' or streamer.startswith('rtmp'):
2567                 rtmp_count += 1
2568                 formats.append({
2569                     'url': streamer,
2570                     'play_path': src,
2571                     'ext': 'flv',
2572                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2573                     'tbr': bitrate,
2574                     'filesize': filesize,
2575                     'width': width,
2576                     'height': height,
2577                 })
2578                 if transform_rtmp_url:
2579                     streamer, src = transform_rtmp_url(streamer, src)
2580                     formats[-1].update({
2581                         'url': streamer,
2582                         'play_path': src,
2583                     })
2584                 continue
2585
2586             src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
2587             src_url = src_url.strip()
2588
2589             if proto == 'm3u8' or src_ext == 'm3u8':
2590                 m3u8_formats = self._extract_m3u8_formats(
2591                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2592                 if len(m3u8_formats) == 1:
2593                     m3u8_count += 1
2594                     m3u8_formats[0].update({
2595                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2596                         'tbr': bitrate,
2597                         'width': width,
2598                         'height': height,
2599                     })
2600                 formats.extend(m3u8_formats)
2601             elif src_ext == 'f4m':
2602                 f4m_url = src_url
2603                 if not f4m_params:
2604                     f4m_params = {
2605                         'hdcore': '3.2.0',
2606                         'plugin': 'flowplayer-3.2.0.1',
2607                     }
2608                 f4m_url += '&' if '?' in f4m_url else '?'
2609                 f4m_url += urllib.parse.urlencode(f4m_params)
2610                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2611             elif src_ext == 'mpd':
2612                 formats.extend(self._extract_mpd_formats(
2613                     src_url, video_id, mpd_id='dash', fatal=False))
2614             elif re.search(r'\.ism/[Mm]anifest', src_url):
2615                 formats.extend(self._extract_ism_formats(
2616                     src_url, video_id, ism_id='mss', fatal=False))
2617             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2618                 http_count += 1
2619                 formats.append({
2620                     'url': src_url,
2621                     'ext': ext or src_ext or 'flv',
2622                     'format_id': 'http-%d' % (bitrate or http_count),
2623                     'tbr': bitrate,
2624                     'filesize': filesize,
2625                     'width': width,
2626                     'height': height,
2627                 })
2628
2629         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2630             src = medium.get('src')
2631             if not src or src in srcs:
2632                 continue
2633             srcs.add(src)
2634
2635             imgs_count += 1
2636             formats.append({
2637                 'format_id': 'imagestream-%d' % (imgs_count),
2638                 'url': src,
2639                 'ext': mimetype2ext(medium.get('type')),
2640                 'acodec': 'none',
2641                 'vcodec': 'none',
2642                 'width': int_or_none(medium.get('width')),
2643                 'height': int_or_none(medium.get('height')),
2644                 'format_note': 'SMIL storyboards',
2645             })
2646
2647         return formats
2648
2649     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2650         urls = []
2651         subtitles = {}
2652         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2653             src = textstream.get('src')
2654             if not src or src in urls:
2655                 continue
2656             urls.append(src)
2657             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2658             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2659             subtitles.setdefault(lang, []).append({
2660                 'url': src,
2661                 'ext': ext,
2662             })
2663         return subtitles
2664
2665     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2666         res = self._download_xml_handle(
2667             xspf_url, playlist_id, 'Downloading xpsf playlist',
2668             'Unable to download xspf manifest', fatal=fatal)
2669         if res is False:
2670             return []
2671
2672         xspf, urlh = res
2673         xspf_url = urlh.geturl()
2674
2675         return self._parse_xspf(
2676             xspf, playlist_id, xspf_url=xspf_url,
2677             xspf_base_url=base_url(xspf_url))
2678
2679     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2680         NS_MAP = {
2681             'xspf': 'http://xspf.org/ns/0/',
2682             's1': 'http://static.streamone.nl/player/ns/0',
2683         }
2684
2685         entries = []
2686         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2687             title = xpath_text(
2688                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2689             description = xpath_text(
2690                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2691             thumbnail = xpath_text(
2692                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2693             duration = float_or_none(
2694                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2695
2696             formats = []
2697             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2698                 format_url = urljoin(xspf_base_url, location.text)
2699                 if not format_url:
2700                     continue
2701                 formats.append({
2702                     'url': format_url,
2703                     'manifest_url': xspf_url,
2704                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2705                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2706                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2707                 })
2708             self._sort_formats(formats)
2709
2710             entries.append({
2711                 'id': playlist_id,
2712                 'title': title,
2713                 'description': description,
2714                 'thumbnail': thumbnail,
2715                 'duration': duration,
2716                 'formats': formats,
2717             })
2718         return entries
2719
2720     def _extract_mpd_formats(self, *args, **kwargs):
2721         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2722         if subs:
2723             self._report_ignoring_subs('DASH')
2724         return fmts
2725
2726     def _extract_mpd_formats_and_subtitles(
2727             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2728             fatal=True, data=None, headers={}, query={}):
2729         res = self._download_xml_handle(
2730             mpd_url, video_id,
2731             note='Downloading MPD manifest' if note is None else note,
2732             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2733             fatal=fatal, data=data, headers=headers, query=query)
2734         if res is False:
2735             return [], {}
2736         mpd_doc, urlh = res
2737         if mpd_doc is None:
2738             return [], {}
2739
2740         # We could have been redirected to a new url when we retrieved our mpd file.
2741         mpd_url = urlh.geturl()
2742         mpd_base_url = base_url(mpd_url)
2743
2744         return self._parse_mpd_formats_and_subtitles(
2745             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2746
2747     def _parse_mpd_formats(self, *args, **kwargs):
2748         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2749         if subs:
2750             self._report_ignoring_subs('DASH')
2751         return fmts
2752
2753     def _parse_mpd_formats_and_subtitles(
2754             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2755         """
2756         Parse formats from MPD manifest.
2757         References:
2758          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2759             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2760          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2761         """
2762         if not self.get_param('dynamic_mpd', True):
2763             if mpd_doc.get('type') == 'dynamic':
2764                 return [], {}
2765
2766         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2767
2768         def _add_ns(path):
2769             return self._xpath_ns(path, namespace)
2770
2771         def is_drm_protected(element):
2772             return element.find(_add_ns('ContentProtection')) is not None
2773
2774         def extract_multisegment_info(element, ms_parent_info):
2775             ms_info = ms_parent_info.copy()
2776
2777             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2778             # common attributes and elements.  We will only extract relevant
2779             # for us.
2780             def extract_common(source):
2781                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2782                 if segment_timeline is not None:
2783                     s_e = segment_timeline.findall(_add_ns('S'))
2784                     if s_e:
2785                         ms_info['total_number'] = 0
2786                         ms_info['s'] = []
2787                         for s in s_e:
2788                             r = int(s.get('r', 0))
2789                             ms_info['total_number'] += 1 + r
2790                             ms_info['s'].append({
2791                                 't': int(s.get('t', 0)),
2792                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2793                                 'd': int(s.attrib['d']),
2794                                 'r': r,
2795                             })
2796                 start_number = source.get('startNumber')
2797                 if start_number:
2798                     ms_info['start_number'] = int(start_number)
2799                 timescale = source.get('timescale')
2800                 if timescale:
2801                     ms_info['timescale'] = int(timescale)
2802                 segment_duration = source.get('duration')
2803                 if segment_duration:
2804                     ms_info['segment_duration'] = float(segment_duration)
2805
2806             def extract_Initialization(source):
2807                 initialization = source.find(_add_ns('Initialization'))
2808                 if initialization is not None:
2809                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2810
2811             segment_list = element.find(_add_ns('SegmentList'))
2812             if segment_list is not None:
2813                 extract_common(segment_list)
2814                 extract_Initialization(segment_list)
2815                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2816                 if segment_urls_e:
2817                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2818             else:
2819                 segment_template = element.find(_add_ns('SegmentTemplate'))
2820                 if segment_template is not None:
2821                     extract_common(segment_template)
2822                     media = segment_template.get('media')
2823                     if media:
2824                         ms_info['media'] = media
2825                     initialization = segment_template.get('initialization')
2826                     if initialization:
2827                         ms_info['initialization'] = initialization
2828                     else:
2829                         extract_Initialization(segment_template)
2830             return ms_info
2831
2832         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2833         formats, subtitles = [], {}
2834         stream_numbers = collections.defaultdict(int)
2835         for period in mpd_doc.findall(_add_ns('Period')):
2836             period_duration = parse_duration(period.get('duration')) or mpd_duration
2837             period_ms_info = extract_multisegment_info(period, {
2838                 'start_number': 1,
2839                 'timescale': 1,
2840             })
2841             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2842                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2843                 for representation in adaptation_set.findall(_add_ns('Representation')):
2844                     representation_attrib = adaptation_set.attrib.copy()
2845                     representation_attrib.update(representation.attrib)
2846                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2847                     mime_type = representation_attrib['mimeType']
2848                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2849
2850                     codec_str = representation_attrib.get('codecs', '')
2851                     # Some kind of binary subtitle found in some youtube livestreams
2852                     if mime_type == 'application/x-rawcc':
2853                         codecs = {'scodec': codec_str}
2854                     else:
2855                         codecs = parse_codecs(codec_str)
2856                     if content_type not in ('video', 'audio', 'text'):
2857                         if mime_type == 'image/jpeg':
2858                             content_type = mime_type
2859                         elif codecs.get('vcodec', 'none') != 'none':
2860                             content_type = 'video'
2861                         elif codecs.get('acodec', 'none') != 'none':
2862                             content_type = 'audio'
2863                         elif codecs.get('scodec', 'none') != 'none':
2864                             content_type = 'text'
2865                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2866                             content_type = 'text'
2867                         else:
2868                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2869                             continue
2870
2871                     base_url = ''
2872                     for element in (representation, adaptation_set, period, mpd_doc):
2873                         base_url_e = element.find(_add_ns('BaseURL'))
2874                         if try_call(lambda: base_url_e.text) is not None:
2875                             base_url = base_url_e.text + base_url
2876                             if re.match(r'^https?://', base_url):
2877                                 break
2878                     if mpd_base_url and base_url.startswith('/'):
2879                         base_url = urllib.parse.urljoin(mpd_base_url, base_url)
2880                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2881                         if not mpd_base_url.endswith('/'):
2882                             mpd_base_url += '/'
2883                         base_url = mpd_base_url + base_url
2884                     representation_id = representation_attrib.get('id')
2885                     lang = representation_attrib.get('lang')
2886                     url_el = representation.find(_add_ns('BaseURL'))
2887                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2888                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2889                     if representation_id is not None:
2890                         format_id = representation_id
2891                     else:
2892                         format_id = content_type
2893                     if mpd_id:
2894                         format_id = mpd_id + '-' + format_id
2895                     if content_type in ('video', 'audio'):
2896                         f = {
2897                             'format_id': format_id,
2898                             'manifest_url': mpd_url,
2899                             'ext': mimetype2ext(mime_type),
2900                             'width': int_or_none(representation_attrib.get('width')),
2901                             'height': int_or_none(representation_attrib.get('height')),
2902                             'tbr': float_or_none(bandwidth, 1000),
2903                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2904                             'fps': int_or_none(representation_attrib.get('frameRate')),
2905                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2906                             'format_note': 'DASH %s' % content_type,
2907                             'filesize': filesize,
2908                             'container': mimetype2ext(mime_type) + '_dash',
2909                             **codecs
2910                         }
2911                     elif content_type == 'text':
2912                         f = {
2913                             'ext': mimetype2ext(mime_type),
2914                             'manifest_url': mpd_url,
2915                             'filesize': filesize,
2916                         }
2917                     elif content_type == 'image/jpeg':
2918                         # See test case in VikiIE
2919                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2920                         f = {
2921                             'format_id': format_id,
2922                             'ext': 'mhtml',
2923                             'manifest_url': mpd_url,
2924                             'format_note': 'DASH storyboards (jpeg)',
2925                             'acodec': 'none',
2926                             'vcodec': 'none',
2927                         }
2928                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2929                         f['has_drm'] = True
2930                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2931
2932                     def prepare_template(template_name, identifiers):
2933                         tmpl = representation_ms_info[template_name]
2934                         if representation_id is not None:
2935                             tmpl = tmpl.replace('$RepresentationID$', representation_id)
2936                         # First of, % characters outside $...$ templates
2937                         # must be escaped by doubling for proper processing
2938                         # by % operator string formatting used further (see
2939                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2940                         t = ''
2941                         in_template = False
2942                         for c in tmpl:
2943                             t += c
2944                             if c == '$':
2945                                 in_template = not in_template
2946                             elif c == '%' and not in_template:
2947                                 t += c
2948                         # Next, $...$ templates are translated to their
2949                         # %(...) counterparts to be used with % operator
2950                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2951                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2952                         t.replace('$$', '$')
2953                         return t
2954
2955                     # @initialization is a regular template like @media one
2956                     # so it should be handled just the same way (see
2957                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2958                     if 'initialization' in representation_ms_info:
2959                         initialization_template = prepare_template(
2960                             'initialization',
2961                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2962                             # $Time$ shall not be included for @initialization thus
2963                             # only $Bandwidth$ remains
2964                             ('Bandwidth', ))
2965                         representation_ms_info['initialization_url'] = initialization_template % {
2966                             'Bandwidth': bandwidth,
2967                         }
2968
2969                     def location_key(location):
2970                         return 'url' if re.match(r'^https?://', location) else 'path'
2971
2972                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2973
2974                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2975                         media_location_key = location_key(media_template)
2976
2977                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2978                         # can't be used at the same time
2979                         if '%(Number' in media_template and 's' not in representation_ms_info:
2980                             segment_duration = None
2981                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2982                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2983                                 representation_ms_info['total_number'] = int(math.ceil(
2984                                     float_or_none(period_duration, segment_duration, default=0)))
2985                             representation_ms_info['fragments'] = [{
2986                                 media_location_key: media_template % {
2987                                     'Number': segment_number,
2988                                     'Bandwidth': bandwidth,
2989                                 },
2990                                 'duration': segment_duration,
2991                             } for segment_number in range(
2992                                 representation_ms_info['start_number'],
2993                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2994                         else:
2995                             # $Number*$ or $Time$ in media template with S list available
2996                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2997                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2998                             representation_ms_info['fragments'] = []
2999                             segment_time = 0
3000                             segment_d = None
3001                             segment_number = representation_ms_info['start_number']
3002
3003                             def add_segment_url():
3004                                 segment_url = media_template % {
3005                                     'Time': segment_time,
3006                                     'Bandwidth': bandwidth,
3007                                     'Number': segment_number,
3008                                 }
3009                                 representation_ms_info['fragments'].append({
3010                                     media_location_key: segment_url,
3011                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
3012                                 })
3013
3014                             for num, s in enumerate(representation_ms_info['s']):
3015                                 segment_time = s.get('t') or segment_time
3016                                 segment_d = s['d']
3017                                 add_segment_url()
3018                                 segment_number += 1
3019                                 for r in range(s.get('r', 0)):
3020                                     segment_time += segment_d
3021                                     add_segment_url()
3022                                     segment_number += 1
3023                                 segment_time += segment_d
3024                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
3025                         # No media template,
3026                         # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
3027                         # or any YouTube dashsegments video
3028                         fragments = []
3029                         segment_index = 0
3030                         timescale = representation_ms_info['timescale']
3031                         for s in representation_ms_info['s']:
3032                             duration = float_or_none(s['d'], timescale)
3033                             for r in range(s.get('r', 0) + 1):
3034                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
3035                                 fragments.append({
3036                                     location_key(segment_uri): segment_uri,
3037                                     'duration': duration,
3038                                 })
3039                                 segment_index += 1
3040                         representation_ms_info['fragments'] = fragments
3041                     elif 'segment_urls' in representation_ms_info:
3042                         # Segment URLs with no SegmentTimeline
3043                         # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
3044                         # https://github.com/ytdl-org/youtube-dl/pull/14844
3045                         fragments = []
3046                         segment_duration = float_or_none(
3047                             representation_ms_info['segment_duration'],
3048                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
3049                         for segment_url in representation_ms_info['segment_urls']:
3050                             fragment = {
3051                                 location_key(segment_url): segment_url,
3052                             }
3053                             if segment_duration:
3054                                 fragment['duration'] = segment_duration
3055                             fragments.append(fragment)
3056                         representation_ms_info['fragments'] = fragments
3057                     # If there is a fragments key available then we correctly recognized fragmented media.
3058                     # Otherwise we will assume unfragmented media with direct access. Technically, such
3059                     # assumption is not necessarily correct since we may simply have no support for
3060                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
3061                     if 'fragments' in representation_ms_info:
3062                         f.update({
3063                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
3064                             'url': mpd_url or base_url,
3065                             'fragment_base_url': base_url,
3066                             'fragments': [],
3067                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
3068                         })
3069                         if 'initialization_url' in representation_ms_info:
3070                             initialization_url = representation_ms_info['initialization_url']
3071                             if not f.get('url'):
3072                                 f['url'] = initialization_url
3073                             f['fragments'].append({location_key(initialization_url): initialization_url})
3074                         f['fragments'].extend(representation_ms_info['fragments'])
3075                         if not period_duration:
3076                             period_duration = try_get(
3077                                 representation_ms_info,
3078                                 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
3079                     else:
3080                         # Assuming direct URL to unfragmented media.
3081                         f['url'] = base_url
3082                     if content_type in ('video', 'audio', 'image/jpeg'):
3083                         f['manifest_stream_number'] = stream_numbers[f['url']]
3084                         stream_numbers[f['url']] += 1
3085                         formats.append(f)
3086                     elif content_type == 'text':
3087                         subtitles.setdefault(lang or 'und', []).append(f)
3088
3089         return formats, subtitles
3090
3091     def _extract_ism_formats(self, *args, **kwargs):
3092         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
3093         if subs:
3094             self._report_ignoring_subs('ISM')
3095         return fmts
3096
3097     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
3098         res = self._download_xml_handle(
3099             ism_url, video_id,
3100             note='Downloading ISM manifest' if note is None else note,
3101             errnote='Failed to download ISM manifest' if errnote is None else errnote,
3102             fatal=fatal, data=data, headers=headers, query=query)
3103         if res is False:
3104             return [], {}
3105         ism_doc, urlh = res
3106         if ism_doc is None:
3107             return [], {}
3108
3109         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
3110
3111     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
3112         """
3113         Parse formats from ISM manifest.
3114         References:
3115          1. [MS-SSTR]: Smooth Streaming Protocol,
3116             https://msdn.microsoft.com/en-us/library/ff469518.aspx
3117         """
3118         if ism_doc.get('IsLive') == 'TRUE':
3119             return [], {}
3120
3121         duration = int(ism_doc.attrib['Duration'])
3122         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
3123
3124         formats = []
3125         subtitles = {}
3126         for stream in ism_doc.findall('StreamIndex'):
3127             stream_type = stream.get('Type')
3128             if stream_type not in ('video', 'audio', 'text'):
3129                 continue
3130             url_pattern = stream.attrib['Url']
3131             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
3132             stream_name = stream.get('Name')
3133             stream_language = stream.get('Language', 'und')
3134             for track in stream.findall('QualityLevel'):
3135                 KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}
3136                 fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
3137                 # TODO: add support for WVC1 and WMAP
3138                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'):
3139                     self.report_warning('%s is not a supported codec' % fourcc)
3140                     continue
3141                 tbr = int(track.attrib['Bitrate']) // 1000
3142                 # [1] does not mention Width and Height attributes. However,
3143                 # they're often present while MaxWidth and MaxHeight are
3144                 # missing, so should be used as fallbacks
3145                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3146                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3147                 sampling_rate = int_or_none(track.get('SamplingRate'))
3148
3149                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3150                 track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)
3151
3152                 fragments = []
3153                 fragment_ctx = {
3154                     'time': 0,
3155                 }
3156                 stream_fragments = stream.findall('c')
3157                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3158                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3159                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3160                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3161                     if not fragment_ctx['duration']:
3162                         try:
3163                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
3164                         except IndexError:
3165                             next_fragment_time = duration
3166                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3167                     for _ in range(fragment_repeat):
3168                         fragments.append({
3169                             'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
3170                             'duration': fragment_ctx['duration'] / stream_timescale,
3171                         })
3172                         fragment_ctx['time'] += fragment_ctx['duration']
3173
3174                 if stream_type == 'text':
3175                     subtitles.setdefault(stream_language, []).append({
3176                         'ext': 'ismt',
3177                         'protocol': 'ism',
3178                         'url': ism_url,
3179                         'manifest_url': ism_url,
3180                         'fragments': fragments,
3181                         '_download_params': {
3182                             'stream_type': stream_type,
3183                             'duration': duration,
3184                             'timescale': stream_timescale,
3185                             'fourcc': fourcc,
3186                             'language': stream_language,
3187                             'codec_private_data': track.get('CodecPrivateData'),
3188                         }
3189                     })
3190                 elif stream_type in ('video', 'audio'):
3191                     formats.append({
3192                         'format_id': join_nonempty(ism_id, stream_name, tbr),
3193                         'url': ism_url,
3194                         'manifest_url': ism_url,
3195                         'ext': 'ismv' if stream_type == 'video' else 'isma',
3196                         'width': width,
3197                         'height': height,
3198                         'tbr': tbr,
3199                         'asr': sampling_rate,
3200                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
3201                         'acodec': 'none' if stream_type == 'video' else fourcc,
3202                         'protocol': 'ism',
3203                         'fragments': fragments,
3204                         'has_drm': ism_doc.find('Protection') is not None,
3205                         '_download_params': {
3206                             'stream_type': stream_type,
3207                             'duration': duration,
3208                             'timescale': stream_timescale,
3209                             'width': width or 0,
3210                             'height': height or 0,
3211                             'fourcc': fourcc,
3212                             'language': stream_language,
3213                             'codec_private_data': track.get('CodecPrivateData'),
3214                             'sampling_rate': sampling_rate,
3215                             'channels': int_or_none(track.get('Channels', 2)),
3216                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3217                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3218                         },
3219                     })
3220         return formats, subtitles
3221
3222     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
3223         def absolute_url(item_url):
3224             return urljoin(base_url, item_url)
3225
3226         def parse_content_type(content_type):
3227             if not content_type:
3228                 return {}
3229             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3230             if ctr:
3231                 mimetype, codecs = ctr.groups()
3232                 f = parse_codecs(codecs)
3233                 f['ext'] = mimetype2ext(mimetype)
3234                 return f
3235             return {}
3236
3237         def _media_formats(src, cur_media_type, type_info=None):
3238             type_info = type_info or {}
3239             full_url = absolute_url(src)
3240             ext = type_info.get('ext') or determine_ext(full_url)
3241             if ext == 'm3u8':
3242                 is_plain_url = False
3243                 formats = self._extract_m3u8_formats(
3244                     full_url, video_id, ext='mp4',
3245                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3246                     preference=preference, quality=quality, fatal=False)
3247             elif ext == 'mpd':
3248                 is_plain_url = False
3249                 formats = self._extract_mpd_formats(
3250                     full_url, video_id, mpd_id=mpd_id, fatal=False)
3251             else:
3252                 is_plain_url = True
3253                 formats = [{
3254                     'url': full_url,
3255                     'vcodec': 'none' if cur_media_type == 'audio' else None,
3256                     'ext': ext,
3257                 }]
3258             return is_plain_url, formats
3259
3260         entries = []
3261         # amp-video and amp-audio are very similar to their HTML5 counterparts
3262         # so we will include them right here (see
3263         # https://www.ampproject.org/docs/reference/components/amp-video)
3264         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3265         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3266         media_tags = [(media_tag, media_tag_name, media_type, '')
3267                       for media_tag, media_tag_name, media_type
3268                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3269         media_tags.extend(re.findall(
3270             # We only allow video|audio followed by a whitespace or '>'.
3271             # Allowing more characters may end up in significant slow down (see
3272             # https://github.com/ytdl-org/youtube-dl/issues/11979,
3273             # e.g. http://www.porntrex.com/maps/videositemap.xml).
3274             r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3275         for media_tag, _, media_type, media_content in media_tags:
3276             media_info = {
3277                 'formats': [],
3278                 'subtitles': {},
3279             }
3280             media_attributes = extract_attributes(media_tag)
3281             src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source')))
3282             if src:
3283                 f = parse_content_type(media_attributes.get('type'))
3284                 _, formats = _media_formats(src, media_type, f)
3285                 media_info['formats'].extend(formats)
3286             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3287             if media_content:
3288                 for source_tag in re.findall(r'<source[^>]+>', media_content):
3289                     s_attr = extract_attributes(source_tag)
3290                     # data-video-src and data-src are non standard but seen
3291                     # several times in the wild
3292                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source')))
3293                     if not src:
3294                         continue
3295                     f = parse_content_type(s_attr.get('type'))
3296                     is_plain_url, formats = _media_formats(src, media_type, f)
3297                     if is_plain_url:
3298                         # width, height, res, label and title attributes are
3299                         # all not standard but seen several times in the wild
3300                         labels = [
3301                             s_attr.get(lbl)
3302                             for lbl in ('label', 'title')
3303                             if str_or_none(s_attr.get(lbl))
3304                         ]
3305                         width = int_or_none(s_attr.get('width'))
3306                         height = (int_or_none(s_attr.get('height'))
3307                                   or int_or_none(s_attr.get('res')))
3308                         if not width or not height:
3309                             for lbl in labels:
3310                                 resolution = parse_resolution(lbl)
3311                                 if not resolution:
3312                                     continue
3313                                 width = width or resolution.get('width')
3314                                 height = height or resolution.get('height')
3315                         for lbl in labels:
3316                             tbr = parse_bitrate(lbl)
3317                             if tbr:
3318                                 break
3319                         else:
3320                             tbr = None
3321                         f.update({
3322                             'width': width,
3323                             'height': height,
3324                             'tbr': tbr,
3325                             'format_id': s_attr.get('label') or s_attr.get('title'),
3326                         })
3327                         f.update(formats[0])
3328                         media_info['formats'].append(f)
3329                     else:
3330                         media_info['formats'].extend(formats)
3331                 for track_tag in re.findall(r'<track[^>]+>', media_content):
3332                     track_attributes = extract_attributes(track_tag)
3333                     kind = track_attributes.get('kind')
3334                     if not kind or kind in ('subtitles', 'captions'):
3335                         src = strip_or_none(track_attributes.get('src'))
3336                         if not src:
3337                             continue
3338                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3339                         media_info['subtitles'].setdefault(lang, []).append({
3340                             'url': absolute_url(src),
3341                         })
3342             for f in media_info['formats']:
3343                 f.setdefault('http_headers', {})['Referer'] = base_url
3344             if media_info['formats'] or media_info['subtitles']:
3345                 entries.append(media_info)
3346         return entries
3347
3348     def _extract_akamai_formats(self, *args, **kwargs):
3349         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3350         if subs:
3351             self._report_ignoring_subs('akamai')
3352         return fmts
3353
3354     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3355         signed = 'hdnea=' in manifest_url
3356         if not signed:
3357             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3358             manifest_url = re.sub(
3359                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3360                 '', manifest_url).strip('?')
3361
3362         formats = []
3363         subtitles = {}
3364
3365         hdcore_sign = 'hdcore=3.7.0'
3366         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3367         hds_host = hosts.get('hds')
3368         if hds_host:
3369             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3370         if 'hdcore=' not in f4m_url:
3371             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3372         f4m_formats = self._extract_f4m_formats(
3373             f4m_url, video_id, f4m_id='hds', fatal=False)
3374         for entry in f4m_formats:
3375             entry.update({'extra_param_to_segment_url': hdcore_sign})
3376         formats.extend(f4m_formats)
3377
3378         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3379         hls_host = hosts.get('hls')
3380         if hls_host:
3381             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3382         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3383             m3u8_url, video_id, 'mp4', 'm3u8_native',
3384             m3u8_id='hls', fatal=False)
3385         formats.extend(m3u8_formats)
3386         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3387
3388         http_host = hosts.get('http')
3389         if http_host and m3u8_formats and not signed:
3390             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3391             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3392             qualities_length = len(qualities)
3393             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3394                 i = 0
3395                 for f in m3u8_formats:
3396                     if f['vcodec'] != 'none':
3397                         for protocol in ('http', 'https'):
3398                             http_f = f.copy()
3399                             del http_f['manifest_url']
3400                             http_url = re.sub(
3401                                 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3402                             http_f.update({
3403                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3404                                 'url': http_url,
3405                                 'protocol': protocol,
3406                             })
3407                             formats.append(http_f)
3408                         i += 1
3409
3410         return formats, subtitles
3411
3412     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3413         query = urllib.parse.urlparse(url).query
3414         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3415         mobj = re.search(
3416             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3417         url_base = mobj.group('url')
3418         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3419         formats = []
3420
3421         def manifest_url(manifest):
3422             m_url = f'{http_base_url}/{manifest}'
3423             if query:
3424                 m_url += '?%s' % query
3425             return m_url
3426
3427         if 'm3u8' not in skip_protocols:
3428             formats.extend(self._extract_m3u8_formats(
3429                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3430                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3431         if 'f4m' not in skip_protocols:
3432             formats.extend(self._extract_f4m_formats(
3433                 manifest_url('manifest.f4m'),
3434                 video_id, f4m_id='hds', fatal=False))
3435         if 'dash' not in skip_protocols:
3436             formats.extend(self._extract_mpd_formats(
3437                 manifest_url('manifest.mpd'),
3438                 video_id, mpd_id='dash', fatal=False))
3439         if re.search(r'(?:/smil:|\.smil)', url_base):
3440             if 'smil' not in skip_protocols:
3441                 rtmp_formats = self._extract_smil_formats(
3442                     manifest_url('jwplayer.smil'),
3443                     video_id, fatal=False)
3444                 for rtmp_format in rtmp_formats:
3445                     rtsp_format = rtmp_format.copy()
3446                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3447                     del rtsp_format['play_path']
3448                     del rtsp_format['ext']
3449                     rtsp_format.update({
3450                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3451                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3452                         'protocol': 'rtsp',
3453                     })
3454                     formats.extend([rtmp_format, rtsp_format])
3455         else:
3456             for protocol in ('rtmp', 'rtsp'):
3457                 if protocol not in skip_protocols:
3458                     formats.append({
3459                         'url': f'{protocol}:{url_base}',
3460                         'format_id': protocol,
3461                         'protocol': protocol,
3462                     })
3463         return formats
3464
3465     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3466         mobj = re.search(
3467             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3468             webpage)
3469         if mobj:
3470             try:
3471                 jwplayer_data = self._parse_json(mobj.group('options'),
3472                                                  video_id=video_id,
3473                                                  transform_source=transform_source)
3474             except ExtractorError:
3475                 pass
3476             else:
3477                 if isinstance(jwplayer_data, dict):
3478                     return jwplayer_data
3479
3480     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3481         jwplayer_data = self._find_jwplayer_data(
3482             webpage, video_id, transform_source=js_to_json)
3483         return self._parse_jwplayer_data(
3484             jwplayer_data, video_id, *args, **kwargs)
3485
3486     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3487                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3488         # JWPlayer backward compatibility: flattened playlists
3489         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3490         if 'playlist' not in jwplayer_data:
3491             jwplayer_data = {'playlist': [jwplayer_data]}
3492
3493         entries = []
3494
3495         # JWPlayer backward compatibility: single playlist item
3496         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3497         if not isinstance(jwplayer_data['playlist'], list):
3498             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3499
3500         for video_data in jwplayer_data['playlist']:
3501             # JWPlayer backward compatibility: flattened sources
3502             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3503             if 'sources' not in video_data:
3504                 video_data['sources'] = [video_data]
3505
3506             this_video_id = video_id or video_data['mediaid']
3507
3508             formats = self._parse_jwplayer_formats(
3509                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3510                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3511
3512             subtitles = {}
3513             tracks = video_data.get('tracks')
3514             if tracks and isinstance(tracks, list):
3515                 for track in tracks:
3516                     if not isinstance(track, dict):
3517                         continue
3518                     track_kind = track.get('kind')
3519                     if not track_kind or not isinstance(track_kind, str):
3520                         continue
3521                     if track_kind.lower() not in ('captions', 'subtitles'):
3522                         continue
3523                     track_url = urljoin(base_url, track.get('file'))
3524                     if not track_url:
3525                         continue
3526                     subtitles.setdefault(track.get('label') or 'en', []).append({
3527                         'url': self._proto_relative_url(track_url)
3528                     })
3529
3530             entry = {
3531                 'id': this_video_id,
3532                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3533                 'description': clean_html(video_data.get('description')),
3534                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3535                 'timestamp': int_or_none(video_data.get('pubdate')),
3536                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3537                 'subtitles': subtitles,
3538             }
3539             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3540             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3541                 entry.update({
3542                     '_type': 'url_transparent',
3543                     'url': formats[0]['url'],
3544                 })
3545             else:
3546                 self._sort_formats(formats)
3547                 entry['formats'] = formats
3548             entries.append(entry)
3549         if len(entries) == 1:
3550             return entries[0]
3551         else:
3552             return self.playlist_result(entries)
3553
3554     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3555                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3556         urls = []
3557         formats = []
3558         for source in jwplayer_sources_data:
3559             if not isinstance(source, dict):
3560                 continue
3561             source_url = urljoin(
3562                 base_url, self._proto_relative_url(source.get('file')))
3563             if not source_url or source_url in urls:
3564                 continue
3565             urls.append(source_url)
3566             source_type = source.get('type') or ''
3567             ext = mimetype2ext(source_type) or determine_ext(source_url)
3568             if source_type == 'hls' or ext == 'm3u8':
3569                 formats.extend(self._extract_m3u8_formats(
3570                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3571                     m3u8_id=m3u8_id, fatal=False))
3572             elif source_type == 'dash' or ext == 'mpd':
3573                 formats.extend(self._extract_mpd_formats(
3574                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3575             elif ext == 'smil':
3576                 formats.extend(self._extract_smil_formats(
3577                     source_url, video_id, fatal=False))
3578             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3579             elif source_type.startswith('audio') or ext in (
3580                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3581                 formats.append({
3582                     'url': source_url,
3583                     'vcodec': 'none',
3584                     'ext': ext,
3585                 })
3586             else:
3587                 height = int_or_none(source.get('height'))
3588                 if height is None:
3589                     # Often no height is provided but there is a label in
3590                     # format like "1080p", "720p SD", or 1080.
3591                     height = int_or_none(self._search_regex(
3592                         r'^(\d{3,4})[pP]?(?:\b|$)', str(source.get('label') or ''),
3593                         'height', default=None))
3594                 a_format = {
3595                     'url': source_url,
3596                     'width': int_or_none(source.get('width')),
3597                     'height': height,
3598                     'tbr': int_or_none(source.get('bitrate'), scale=1000),
3599                     'filesize': int_or_none(source.get('filesize')),
3600                     'ext': ext,
3601                 }
3602                 if source_url.startswith('rtmp'):
3603                     a_format['ext'] = 'flv'
3604                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3605                     # of jwplayer.flash.swf
3606                     rtmp_url_parts = re.split(
3607                         r'((?:mp4|mp3|flv):)', source_url, 1)
3608                     if len(rtmp_url_parts) == 3:
3609                         rtmp_url, prefix, play_path = rtmp_url_parts
3610                         a_format.update({
3611                             'url': rtmp_url,
3612                             'play_path': prefix + play_path,
3613                         })
3614                     if rtmp_params:
3615                         a_format.update(rtmp_params)
3616                 formats.append(a_format)
3617         return formats
3618
3619     def _live_title(self, name):
3620         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3621         return name
3622
3623     def _int(self, v, name, fatal=False, **kwargs):
3624         res = int_or_none(v, **kwargs)
3625         if res is None:
3626             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3627             if fatal:
3628                 raise ExtractorError(msg)
3629             else:
3630                 self.report_warning(msg)
3631         return res
3632
3633     def _float(self, v, name, fatal=False, **kwargs):
3634         res = float_or_none(v, **kwargs)
3635         if res is None:
3636             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3637             if fatal:
3638                 raise ExtractorError(msg)
3639             else:
3640                 self.report_warning(msg)
3641         return res
3642
3643     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3644                     path='/', secure=False, discard=False, rest={}, **kwargs):
3645         cookie = http.cookiejar.Cookie(
3646             0, name, value, port, port is not None, domain, True,
3647             domain.startswith('.'), path, True, secure, expire_time,
3648             discard, None, None, rest)
3649         self.cookiejar.set_cookie(cookie)
3650
3651     def _get_cookies(self, url):
3652         """ Return a http.cookies.SimpleCookie with the cookies for the url """
3653         return LenientSimpleCookie(self._downloader._calc_cookies(url))
3654
3655     def _apply_first_set_cookie_header(self, url_handle, cookie):
3656         """
3657         Apply first Set-Cookie header instead of the last. Experimental.
3658
3659         Some sites (e.g. [1-3]) may serve two cookies under the same name
3660         in Set-Cookie header and expect the first (old) one to be set rather
3661         than second (new). However, as of RFC6265 the newer one cookie
3662         should be set into cookie store what actually happens.
3663         We will workaround this issue by resetting the cookie to
3664         the first one manually.
3665         1. https://new.vk.com/
3666         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3667         3. https://learning.oreilly.com/
3668         """
3669         for header, cookies in url_handle.headers.items():
3670             if header.lower() != 'set-cookie':
3671                 continue
3672             cookies = cookies.encode('iso-8859-1').decode('utf-8')
3673             cookie_value = re.search(
3674                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3675             if cookie_value:
3676                 value, domain = cookie_value.groups()
3677                 self._set_cookie(domain, cookie, value)
3678                 break
3679
3680     @classmethod
3681     def get_testcases(cls, include_onlymatching=False):
3682         # Do not look in super classes
3683         t = vars(cls).get('_TEST')
3684         if t:
3685             assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
3686             tests = [t]
3687         else:
3688             tests = vars(cls).get('_TESTS', [])
3689         for t in tests:
3690             if not include_onlymatching and t.get('only_matching', False):
3691                 continue
3692             t['name'] = cls.ie_key()
3693             yield t
3694
3695     @classmethod
3696     def get_webpage_testcases(cls):
3697         tests = vars(cls).get('_WEBPAGE_TESTS', [])
3698         for t in tests:
3699             t['name'] = cls.ie_key()
3700         return tests
3701
3702     @classproperty(cache=True)
3703     def age_limit(cls):
3704         """Get age limit from the testcases"""
3705         return max(traverse_obj(
3706             (*cls.get_testcases(include_onlymatching=False), *cls.get_webpage_testcases()),
3707             (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3708
3709     @classproperty(cache=True)
3710     def _RETURN_TYPE(cls):
3711         """What the extractor returns: "video", "playlist", "any", or None (Unknown)"""
3712         tests = tuple(cls.get_testcases(include_onlymatching=False))
3713         if not tests:
3714             return None
3715         elif not any(k.startswith('playlist') for test in tests for k in test):
3716             return 'video'
3717         elif all(any(k.startswith('playlist') for k in test) for test in tests):
3718             return 'playlist'
3719         return 'any'
3720
3721     @classmethod
3722     def is_single_video(cls, url):
3723         """Returns whether the URL is of a single video, None if unknown"""
3724         assert cls.suitable(url), 'The URL must be suitable for the extractor'
3725         return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE)
3726
3727     @classmethod
3728     def is_suitable(cls, age_limit):
3729         """Test whether the extractor is generally suitable for the given age limit"""
3730         return not age_restricted(cls.age_limit, age_limit)
3731
3732     @classmethod
3733     def description(cls, *, markdown=True, search_examples=None):
3734         """Description of the extractor"""
3735         desc = ''
3736         if cls._NETRC_MACHINE:
3737             if markdown:
3738                 desc += f' [<abbr title="netrc machine"><em>{cls._NETRC_MACHINE}</em></abbr>]'
3739             else:
3740                 desc += f' [{cls._NETRC_MACHINE}]'
3741         if cls.IE_DESC is False:
3742             desc += ' [HIDDEN]'
3743         elif cls.IE_DESC:
3744             desc += f' {cls.IE_DESC}'
3745         if cls.SEARCH_KEY:
3746             desc += f'; "{cls.SEARCH_KEY}:" prefix'
3747             if search_examples:
3748                 _COUNTS = ('', '5', '10', 'all')
3749                 desc += f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
3750         if not cls.working():
3751             desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3752
3753         # Escape emojis. Ref: https://github.com/github/markup/issues/1153
3754         name = (' - **%s**' % re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME)) if markdown else cls.IE_NAME
3755         return f'{name}:{desc}' if desc else name
3756
3757     def extract_subtitles(self, *args, **kwargs):
3758         if (self.get_param('writesubtitles', False)
3759                 or self.get_param('listsubtitles')):
3760             return self._get_subtitles(*args, **kwargs)
3761         return {}
3762
3763     def _get_subtitles(self, *args, **kwargs):
3764         raise NotImplementedError('This method must be implemented by subclasses')
3765
3766     class CommentsDisabled(Exception):
3767         """Raise in _get_comments if comments are disabled for the video"""
3768
3769     def extract_comments(self, *args, **kwargs):
3770         if not self.get_param('getcomments'):
3771             return None
3772         generator = self._get_comments(*args, **kwargs)
3773
3774         def extractor():
3775             comments = []
3776             interrupted = True
3777             try:
3778                 while True:
3779                     comments.append(next(generator))
3780             except StopIteration:
3781                 interrupted = False
3782             except KeyboardInterrupt:
3783                 self.to_screen('Interrupted by user')
3784             except self.CommentsDisabled:
3785                 return {'comments': None, 'comment_count': None}
3786             except Exception as e:
3787                 if self.get_param('ignoreerrors') is not True:
3788                     raise
3789                 self._downloader.report_error(e)
3790             comment_count = len(comments)
3791             self.to_screen(f'Extracted {comment_count} comments')
3792             return {
3793                 'comments': comments,
3794                 'comment_count': None if interrupted else comment_count
3795             }
3796         return extractor
3797
3798     def _get_comments(self, *args, **kwargs):
3799         raise NotImplementedError('This method must be implemented by subclasses')
3800
3801     @staticmethod
3802     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3803         """ Merge subtitle items for one language. Items with duplicated URLs/data
3804         will be dropped. """
3805         list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3806         ret = list(subtitle_list1)
3807         ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3808         return ret
3809
3810     @classmethod
3811     def _merge_subtitles(cls, *dicts, target=None):
3812         """ Merge subtitle dictionaries, language by language. """
3813         if target is None:
3814             target = {}
3815         for d in dicts:
3816             for lang, subs in d.items():
3817                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3818         return target
3819
3820     def extract_automatic_captions(self, *args, **kwargs):
3821         if (self.get_param('writeautomaticsub', False)
3822                 or self.get_param('listsubtitles')):
3823             return self._get_automatic_captions(*args, **kwargs)
3824         return {}
3825
3826     def _get_automatic_captions(self, *args, **kwargs):
3827         raise NotImplementedError('This method must be implemented by subclasses')
3828
3829     @functools.cached_property
3830     def _cookies_passed(self):
3831         """Whether cookies have been passed to YoutubeDL"""
3832         return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3833
3834     def mark_watched(self, *args, **kwargs):
3835         if not self.get_param('mark_watched', False):
3836             return
3837         if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
3838             self._mark_watched(*args, **kwargs)
3839
3840     def _mark_watched(self, *args, **kwargs):
3841         raise NotImplementedError('This method must be implemented by subclasses')
3842
3843     def geo_verification_headers(self):
3844         headers = {}
3845         geo_verification_proxy = self.get_param('geo_verification_proxy')
3846         if geo_verification_proxy:
3847             headers['Ytdl-request-proxy'] = geo_verification_proxy
3848         return headers
3849
3850     @staticmethod
3851     def _generic_id(url):
3852         return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3853
3854     def _generic_title(self, url='', webpage='', *, default=None):
3855         return (self._og_search_title(webpage, default=None)
3856                 or self._html_extract_title(webpage, default=None)
3857                 or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
3858                 or default)
3859
3860     @staticmethod
3861     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3862         all_known = all(map(
3863             lambda x: x is not None,
3864             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3865         return (
3866             'private' if is_private
3867             else 'premium_only' if needs_premium
3868             else 'subscriber_only' if needs_subscription
3869             else 'needs_auth' if needs_auth
3870             else 'unlisted' if is_unlisted
3871             else 'public' if all_known
3872             else None)
3873
3874     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3875         '''
3876         @returns            A list of values for the extractor argument given by "key"
3877                             or "default" if no such key is present
3878         @param default      The default value to return when the key is not present (default: [])
3879         @param casesense    When false, the values are converted to lower case
3880         '''
3881         ie_key = ie_key if isinstance(ie_key, str) else (ie_key or self).ie_key()
3882         val = traverse_obj(self._downloader.params, ('extractor_args', ie_key.lower(), key))
3883         if val is None:
3884             return [] if default is NO_DEFAULT else default
3885         return list(val) if casesense else [x.lower() for x in val]
3886
3887     def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3888         if not playlist_id or not video_id:
3889             return not video_id
3890
3891         no_playlist = (smuggled_data or {}).get('force_noplaylist')
3892         if no_playlist is not None:
3893             return not no_playlist
3894
3895         video_id = '' if video_id is True else f' {video_id}'
3896         playlist_id = '' if playlist_id is True else f' {playlist_id}'
3897         if self.get_param('noplaylist'):
3898             self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3899             return False
3900         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3901         return True
3902
3903     def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
3904         RetryManager.report_retry(
3905             err, _count or int(fatal), _retries,
3906             info=self.to_screen, warn=self.report_warning, error=None if fatal else self.report_warning,
3907             sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor'))
3908
3909     def RetryManager(self, **kwargs):
3910         return RetryManager(self.get_param('extractor_retries', 3), self._error_or_warning, **kwargs)
3911
3912     def _extract_generic_embeds(self, url, *args, info_dict={}, note='Extracting generic embeds', **kwargs):
3913         display_id = traverse_obj(info_dict, 'display_id', 'id')
3914         self.to_screen(f'{format_field(display_id, None, "%s: ")}{note}')
3915         return self._downloader.get_info_extractor('Generic')._extract_embeds(
3916             smuggle_url(url, {'block_ies': [self.ie_key()]}), *args, **kwargs)
3917
3918     @classmethod
3919     def extract_from_webpage(cls, ydl, url, webpage):
3920         ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType)
3921               else ydl.get_info_extractor(cls.ie_key()))
3922         for info in ie._extract_from_webpage(url, webpage) or []:
3923             # url = None since we do not want to set (webpage/original)_url
3924             ydl.add_default_extra_info(info, ie, None)
3925             yield info
3926
3927     @classmethod
3928     def _extract_from_webpage(cls, url, webpage):
3929         for embed_url in orderedSet(
3930                 cls._extract_embed_urls(url, webpage) or [], lazy=True):
3931             yield cls.url_result(embed_url, None if cls._VALID_URL is False else cls)
3932
3933     @classmethod
3934     def _extract_embed_urls(cls, url, webpage):
3935         """@returns all the embed urls on the webpage"""
3936         if '_EMBED_URL_RE' not in cls.__dict__:
3937             assert isinstance(cls._EMBED_REGEX, (list, tuple))
3938             for idx, regex in enumerate(cls._EMBED_REGEX):
3939                 assert regex.count('(?P<url>') == 1, \
3940                     f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
3941             cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))
3942
3943         for regex in cls._EMBED_URL_RE:
3944             for mobj in regex.finditer(webpage):
3945                 embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
3946                 if cls._VALID_URL is False or cls.suitable(embed_url):
3947                     yield embed_url
3948
3949     class StopExtraction(Exception):
3950         pass
3951
3952     @classmethod
3953     def _extract_url(cls, webpage):  # TODO: Remove
3954         """Only for compatibility with some older extractors"""
3955         return next(iter(cls._extract_embed_urls(None, webpage) or []), None)
3956
3957     @classmethod
3958     def __init_subclass__(cls, *, plugin_name=None, **kwargs):
3959         if plugin_name:
3960             mro = inspect.getmro(cls)
3961             super_class = cls.__wrapped__ = mro[mro.index(cls) + 1]
3962             cls.IE_NAME, cls.ie_key = f'{super_class.IE_NAME}+{plugin_name}', super_class.ie_key
3963             while getattr(super_class, '__wrapped__', None):
3964                 super_class = super_class.__wrapped__
3965             setattr(sys.modules[super_class.__module__], super_class.__name__, cls)
3966
3967         return super().__init_subclass__(**kwargs)
3968
3969
3970 class SearchInfoExtractor(InfoExtractor):
3971     """
3972     Base class for paged search queries extractors.
3973     They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
3974     Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
3975     """
3976
3977     _MAX_RESULTS = float('inf')
3978     _RETURN_TYPE = 'playlist'
3979
3980     @classproperty
3981     def _VALID_URL(cls):
3982         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3983
3984     def _real_extract(self, query):
3985         prefix, query = self._match_valid_url(query).group('prefix', 'query')
3986         if prefix == '':
3987             return self._get_n_results(query, 1)
3988         elif prefix == 'all':
3989             return self._get_n_results(query, self._MAX_RESULTS)
3990         else:
3991             n = int(prefix)
3992             if n <= 0:
3993                 raise ExtractorError(f'invalid download number {n} for query "{query}"')
3994             elif n > self._MAX_RESULTS:
3995                 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3996                 n = self._MAX_RESULTS
3997             return self._get_n_results(query, n)
3998
3999     def _get_n_results(self, query, n):
4000         """Get a specified number of results for a query.
4001         Either this function or _search_results must be overridden by subclasses """
4002         return self.playlist_result(
4003             itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
4004             query, query)
4005
4006     def _search_results(self, query):
4007         """Returns an iterator of search results"""
4008         raise NotImplementedError('This method must be implemented by subclasses')
4009
4010     @classproperty
4011     def SEARCH_KEY(cls):
4012         return cls._SEARCH_KEY
4013
4014
4015 class UnsupportedURLIE(InfoExtractor):
4016     _VALID_URL = '.*'
4017     _ENABLED = False
4018     IE_DESC = False
4019
4020     def _real_extract(self, url):
4021         raise UnsupportedError(url)