yt_dlp/extractor/common.py

   1 import base64
   2 import collections
   3 import getpass
   4 import hashlib
   5 import http.client
   6 import http.cookiejar
   7 import http.cookies
   8 import itertools
   9 import json
  10 import math
  11 import netrc
  12 import os
  13 import random
  14 import re
  15 import sys
  16 import time
  17 import types
  18 import urllib.parse
  19 import urllib.request
  20 import xml.etree.ElementTree
  21
  22 from ..compat import functools  # isort: split
  23 from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name
  24 from ..downloader import FileDownloader
  25 from ..downloader.f4m import get_base_url, remove_encrypted_media
  26 from ..utils import (
  27     IDENTITY,
  28     JSON_LD_RE,
  29     NO_DEFAULT,
  30     ExtractorError,
  31     GeoRestrictedError,
  32     GeoUtils,
  33     LenientJSONDecoder,
  34     RegexNotFoundError,
  35     RetryManager,
  36     UnsupportedError,
  37     age_restricted,
  38     base_url,
  39     bug_reports_message,
  40     classproperty,
  41     clean_html,
  42     determine_ext,
  43     determine_protocol,
  44     dict_get,
  45     encode_data_uri,
  46     error_to_compat_str,
  47     extract_attributes,
  48     filter_dict,
  49     fix_xml_ampersands,
  50     float_or_none,
  51     format_field,
  52     int_or_none,
  53     join_nonempty,
  54     js_to_json,
  55     mimetype2ext,
  56     network_exceptions,
  57     orderedSet,
  58     parse_bitrate,
  59     parse_codecs,
  60     parse_duration,
  61     parse_iso8601,
  62     parse_m3u8_attributes,
  63     parse_resolution,
  64     sanitize_filename,
  65     sanitize_url,
  66     sanitized_Request,
  67     str_or_none,
  68     str_to_int,
  69     strip_or_none,
  70     traverse_obj,
  71     try_call,
  72     try_get,
  73     unescapeHTML,
  74     unified_strdate,
  75     unified_timestamp,
  76     update_Request,
  77     update_url_query,
  78     url_basename,
  79     url_or_none,
  80     urljoin,
  81     variadic,
  82     xpath_element,
  83     xpath_text,
  84     xpath_with_ns,
  85 )
  86
  87
  88 class InfoExtractor:
  89     """Information Extractor class.
  90
  91     Information extractors are the classes that, given a URL, extract
  92     information about the video (or videos) the URL refers to. This
  93     information includes the real video URL, the video title, author and
  94     others. The information is stored in a dictionary which is then
  95     passed to the YoutubeDL. The YoutubeDL processes this
  96     information possibly downloading the video to the file system, among
  97     other possible outcomes.
  98
  99     The type field determines the type of the result.
 100     By far the most common value (and the default if _type is missing) is
 101     "video", which indicates a single video.
 102
 103     For a video, the dictionaries must include the following fields:
 104
 105     id:             Video identifier.
 106     title:          Video title, unescaped. Set to an empty string if video has
 107                     no title as opposed to "None" which signifies that the
 108                     extractor failed to obtain a title
 109
 110     Additionally, it must contain either a formats entry or a url one:
 111
 112     formats:        A list of dictionaries for each format available, ordered
 113                     from worst to best quality.
 114
 115                     Potential fields:
 116                     * url        The mandatory URL representing the media:
 117                                    for plain file media - HTTP URL of this file,
 118                                    for RTMP - RTMP URL,
 119                                    for HLS - URL of the M3U8 media playlist,
 120                                    for HDS - URL of the F4M manifest,
 121                                    for DASH
 122                                      - HTTP URL to plain file media (in case of
 123                                        unfragmented media)
 124                                      - URL of the MPD manifest or base URL
 125                                        representing the media if MPD manifest
 126                                        is parsed from a string (in case of
 127                                        fragmented media)
 128                                    for MSS - URL of the ISM manifest.
 129                     * manifest_url
 130                                  The URL of the manifest file in case of
 131                                  fragmented media:
 132                                    for HLS - URL of the M3U8 master playlist,
 133                                    for HDS - URL of the F4M manifest,
 134                                    for DASH - URL of the MPD manifest,
 135                                    for MSS - URL of the ISM manifest.
 136                     * manifest_stream_number  (For internal use only)
 137                                  The index of the stream in the manifest file
 138                     * ext        Will be calculated from URL if missing
 139                     * format     A human-readable description of the format
 140                                  ("mp4 container with h264/opus").
  141                                  Calculated from the format_id, width, height
 142                                  and format_note fields if missing.
 143                     * format_id  A short description of the format
 144                                  ("mp4_h264_opus" or "19").
 145                                 Technically optional, but strongly recommended.
 146                     * format_note Additional info about the format
 147                                  ("3D" or "DASH video")
 148                     * width      Width of the video, if known
 149                     * height     Height of the video, if known
 150                     * resolution Textual description of width and height
 151                     * dynamic_range The dynamic range of the video. One of:
  152                                  "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
 153                     * tbr        Average bitrate of audio and video in KBit/s
 154                     * abr        Average audio bitrate in KBit/s
 155                     * acodec     Name of the audio codec in use
 156                     * asr        Audio sampling rate in Hertz
 157                     * audio_channels  Number of audio channels
 158                     * vbr        Average video bitrate in KBit/s
 159                     * fps        Frame rate
 160                     * vcodec     Name of the video codec in use
 161                     * container  Name of the container format
 162                     * filesize   The number of bytes, if known in advance
 163                     * filesize_approx  An estimate for the number of bytes
 164                     * player_url SWF Player URL (used for rtmpdump).
 165                     * protocol   The protocol that will be used for the actual
 166                                  download, lower-case. One of "http", "https" or
 167                                  one of the protocols defined in downloader.PROTOCOL_MAP
 168                     * fragment_base_url
 169                                  Base URL for fragments. Each fragment's path
 170                                  value (if present) will be relative to
 171                                  this URL.
 172                     * fragments  A list of fragments of a fragmented media.
 173                                  Each fragment entry must contain either an url
 174                                  or a path. If an url is present it should be
 175                                  considered by a client. Otherwise both path and
 176                                  fragment_base_url must be present. Here is
 177                                  the list of all potential fields:
 178                                  * "url" - fragment's URL
 179                                  * "path" - fragment's path relative to
 180                                             fragment_base_url
 181                                  * "duration" (optional, int or float)
 182                                  * "filesize" (optional, int)
 183                     * is_from_start  Is a live format that can be downloaded
 184                                 from the start. Boolean
 185                     * preference Order number of this format. If this field is
 186                                  present and not None, the formats get sorted
 187                                  by this field, regardless of all other values.
 188                                  -1 for default (order by other properties),
 189                                  -2 or smaller for less than default.
 190                                  < -1000 to hide the format (if there is
 191                                     another one which is strictly better)
 192                     * language   Language code, e.g. "de" or "en-US".
 193                     * language_preference  Is this in the language mentioned in
 194                                  the URL?
 195                                  10 if it's what the URL is about,
 196                                  -1 for default (don't know),
 197                                  -10 otherwise, other values reserved for now.
 198                     * quality    Order number of the video quality of this
 199                                  format, irrespective of the file format.
 200                                  -1 for default (order by other properties),
 201                                  -2 or smaller for less than default.
 202                     * source_preference  Order number for this video source
 203                                   (quality takes higher priority)
 204                                  -1 for default (order by other properties),
 205                                  -2 or smaller for less than default.
 206                     * http_headers  A dictionary of additional HTTP headers
 207                                  to add to the request.
 208                     * stretched_ratio  If given and not 1, indicates that the
 209                                  video's pixels are not square.
 210                                  width : height ratio as float.
 211                     * no_resume  The server does not support resuming the
 212                                  (HTTP or RTMP) download. Boolean.
 213                     * has_drm    The format has DRM and cannot be downloaded. Boolean
 214                     * downloader_options  A dictionary of downloader options
 215                                  (For internal use only)
 216                                  * http_chunk_size Chunk size for HTTP downloads
 217                                  * ffmpeg_args     Extra arguments for ffmpeg downloader
 218                     RTMP formats can also have the additional fields: page_url,
 219                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 220                     rtmp_protocol, rtmp_real_time
 221
 222     url:            Final video URL.
 223     ext:            Video filename extension.
 224     format:         The video format, defaults to ext (used for --get-format)
 225     player_url:     SWF Player URL (used for rtmpdump).
 226
 227     The following fields are optional:
 228
 229     direct:         True if a direct video file was given (must only be set by GenericIE)
 230     alt_title:      A secondary title of the video.
 231     display_id      An alternative identifier for the video, not necessarily
 232                     unique, but available before title. Typically, id is
 233                     something like "4234987", title "Dancing naked mole rats",
 234                     and display_id "dancing-naked-mole-rats"
 235     thumbnails:     A list of dictionaries, with the following entries:
 236                         * "id" (optional, string) - Thumbnail format ID
 237                         * "url"
 238                         * "preference" (optional, int) - quality of the image
 239                         * "width" (optional, int)
 240                         * "height" (optional, int)
 241                         * "resolution" (optional, string "{width}x{height}",
 242                                         deprecated)
 243                         * "filesize" (optional, int)
 244                         * "http_headers" (dict) - HTTP headers for the request
 245     thumbnail:      Full URL to a video thumbnail image.
 246     description:    Full video description.
 247     uploader:       Full name of the video uploader.
 248     license:        License name the video is licensed under.
 249     creator:        The creator of the video.
 250     timestamp:      UNIX timestamp of the moment the video was uploaded
 251     upload_date:    Video upload date in UTC (YYYYMMDD).
 252                     If not explicitly set, calculated from timestamp
 253     release_timestamp: UNIX timestamp of the moment the video was released.
 254                     If it is not clear whether to use timestamp or this, use the former
 255     release_date:   The date (YYYYMMDD) when the video was released in UTC.
 256                     If not explicitly set, calculated from release_timestamp
 257     modified_timestamp: UNIX timestamp of the moment the video was last modified.
 258     modified_date:   The date (YYYYMMDD) when the video was last modified in UTC.
 259                     If not explicitly set, calculated from modified_timestamp
 260     uploader_id:    Nickname or id of the video uploader.
 261     uploader_url:   Full URL to a personal webpage of the video uploader.
 262     channel:        Full name of the channel the video is uploaded on.
 263                     Note that channel fields may or may not repeat uploader
 264                     fields. This depends on a particular extractor.
 265     channel_id:     Id of the channel.
 266     channel_url:    Full URL to a channel webpage.
 267     channel_follower_count: Number of followers of the channel.
 268     location:       Physical location where the video was filmed.
 269     subtitles:      The available subtitles as a dictionary in the format
 270                     {tag: subformats}. "tag" is usually a language code, and
 271                     "subformats" is a list sorted from lower to higher
 272                     preference, each element is a dictionary with the "ext"
 273                     entry and one of:
 274                         * "data": The subtitles file contents
 275                         * "url": A URL pointing to the subtitles file
 276                     It can optionally also have:
 277                         * "name": Name or description of the subtitles
 278                         * "http_headers": A dictionary of additional HTTP headers
 279                                   to add to the request.
 280                     "ext" will be calculated from URL if missing
 281     automatic_captions: Like 'subtitles'; contains automatically generated
 282                     captions instead of normal subtitles
 283     duration:       Length of the video in seconds, as an integer or float.
 284     view_count:     How many users have watched the video on the platform.
 285     like_count:     Number of positive ratings of the video
 286     dislike_count:  Number of negative ratings of the video
 287     repost_count:   Number of reposts of the video
  288     average_rating: Average rating given by users, the scale used depends on the webpage
 289     comment_count:  Number of comments on the video
 290     comments:       A list of comments, each with one or more of the following
 291                     properties (all but one of text or html optional):
 292                         * "author" - human-readable name of the comment author
 293                         * "author_id" - user ID of the comment author
 294                         * "author_thumbnail" - The thumbnail of the comment author
 295                         * "id" - Comment ID
 296                         * "html" - Comment as HTML
 297                         * "text" - Plain text of the comment
 298                         * "timestamp" - UNIX timestamp of comment
 299                         * "parent" - ID of the comment this one is replying to.
 300                                      Set to "root" to indicate that this is a
 301                                      comment to the original video.
 302                         * "like_count" - Number of positive ratings of the comment
 303                         * "dislike_count" - Number of negative ratings of the comment
 304                         * "is_favorited" - Whether the comment is marked as
 305                                            favorite by the video uploader
 306                         * "author_is_uploader" - Whether the comment is made by
 307                                                  the video uploader
 308     age_limit:      Age restriction for the video, as an integer (years)
 309     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 310                     should allow to get the same result again. (It will be set
 311                     by YoutubeDL if it's missing)
 312     categories:     A list of categories that the video falls in, for example
 313                     ["Sports", "Berlin"]
 314     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 315     cast:           A list of the video cast
 316     is_live:        True, False, or None (=unknown). Whether this video is a
 317                     live stream that goes on instead of a fixed-length video.
 318     was_live:       True, False, or None (=unknown). Whether this video was
 319                     originally a live stream.
 320     live_status:    None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
 321                     or 'post_live' (was live, but VOD is not yet processed)
 322                     If absent, automatically set from is_live, was_live
 323     start_time:     Time in seconds where the reproduction should start, as
 324                     specified in the URL.
 325     end_time:       Time in seconds where the reproduction should end, as
 326                     specified in the URL.
 327     chapters:       A list of dictionaries, with the following entries:
 328                         * "start_time" - The start time of the chapter in seconds
 329                         * "end_time" - The end time of the chapter in seconds
 330                         * "title" (optional, string)
 331     playable_in_embed: Whether this video is allowed to play in embedded
 332                     players on other sites. Can be True (=always allowed),
 333                     False (=never allowed), None (=unknown), or a string
 334                     specifying the criteria for embedability; e.g. 'whitelist'
 335     availability:   Under what condition the video is available. One of
 336                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 337                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 338                     to set it
 339     _old_archive_ids: A list of old archive ids needed for backward compatibility
 340     __post_extractor: A function to be called just before the metadata is
 341                     written to either disk, logger or console. The function
 342                     must return a dict which will be added to the info_dict.
  343                     This is useful for additional information that is
 344                     time-consuming to extract. Note that the fields thus
 345                     extracted will not be available to output template and
 346                     match_filter. So, only "comments" and "comment_count" are
 347                     currently allowed to be extracted via this method.
 348
 349     The following fields should only be used when the video belongs to some logical
 350     chapter or section:
 351
 352     chapter:        Name or title of the chapter the video belongs to.
 353     chapter_number: Number of the chapter the video belongs to, as an integer.
 354     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 355
 356     The following fields should only be used when the video is an episode of some
 357     series, programme or podcast:
 358
 359     series:         Title of the series or programme the video episode belongs to.
 360     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 361     season:         Title of the season the video episode belongs to.
 362     season_number:  Number of the season the video episode belongs to, as an integer.
 363     season_id:      Id of the season the video episode belongs to, as a unicode string.
 364     episode:        Title of the video episode. Unlike mandatory video title field,
 365                     this field should denote the exact title of the video episode
 366                     without any kind of decoration.
 367     episode_number: Number of the video episode within a season, as an integer.
 368     episode_id:     Id of the video episode, as a unicode string.
 369
 370     The following fields should only be used when the media is a track or a part of
 371     a music album:
 372
 373     track:          Title of the track.
 374     track_number:   Number of the track within an album or a disc, as an integer.
 375     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 376                     as a unicode string.
 377     artist:         Artist(s) of the track.
 378     genre:          Genre(s) of the track.
 379     album:          Title of the album the track belongs to.
 380     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 381     album_artist:   List of all artists appeared on the album (e.g.
 382                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 383                     and compilations).
 384     disc_number:    Number of the disc or other physical medium the track belongs to,
 385                     as an integer.
 386     release_year:   Year (YYYY) when the album was released.
 387     composer:       Composer of the piece
 388
 389     The following fields should only be set for clips that should be cut from the original video:
 390
 391     section_start:  Start time of the section in seconds
 392     section_end:    End time of the section in seconds
 393
 394     The following fields should only be set for storyboards:
 395     rows:           Number of rows in each storyboard fragment, as an integer
 396     columns:        Number of columns in each storyboard fragment, as an integer
 397
 398     Unless mentioned otherwise, the fields should be Unicode strings.
 399
 400     Unless mentioned otherwise, None is equivalent to absence of information.
 401
 402
 403     _type "playlist" indicates multiple videos.
 404     There must be a key "entries", which is a list, an iterable, or a PagedList
 405     object, each element of which is a valid dictionary by this specification.
 406
 407     Additionally, playlists can have "id", "title", and any other relevant
 408     attributes with the same semantics as videos (see above).
 409
 410     It can also have the following optional fields:
 411
 412     playlist_count: The total number of videos in a playlist. If not given,
 413                     YoutubeDL tries to calculate it from "entries"
 414
 415
 416     _type "multi_video" indicates that there are multiple videos that
  417     form a single show, for example multiple acts of an opera or TV episode.
 418     It must have an entries key like a playlist and contain all the keys
 419     required for a video at the same time.
 420
 421
 422     _type "url" indicates that the video must be extracted from another
 423     location, possibly by a different extractor. Its only required key is:
 424     "url" - the next URL to extract.
 425     The key "ie_key" can be set to the class name (minus the trailing "IE",
 426     e.g. "Youtube") if the extractor class is known in advance.
 427     Additionally, the dictionary may have any properties of the resolved entity
 428     known in advance, for example "title" if the title of the referred video is
 429     known ahead of time.
 430
 431
 432     _type "url_transparent" entities have the same specification as "url", but
 433     indicate that the given additional information is more precise than the one
 434     associated with the resolved URL.
 435     This is useful when a site employs a video service that hosts the video and
 436     its technical metadata, but that video service does not embed a useful
 437     title, description etc.
 438
 439
 440     Subclasses of this should also be added to the list of extractors and
 441     should define a _VALID_URL regexp and, re-define the _real_extract() and
 442     (optionally) _real_initialize() methods.
 443
 444     Subclasses may also override suitable() if necessary, but ensure the function
 445     signature is preserved and that this function imports everything it needs
 446     (except other extractors), so that lazy_extractors works correctly.
 447
 448     Subclasses can define a list of _EMBED_REGEX, which will be searched for in
 449     the HTML of Generic webpages. It may also override _extract_embed_urls
 450     or _extract_from_webpage as necessary. While these are normally classmethods,
 451     _extract_from_webpage is allowed to be an instance method.
 452
 453     _extract_from_webpage may raise self.StopExtraction() to stop further
 454     processing of the webpage and obtain exclusive rights to it. This is useful
 455     when the extractor cannot reliably be matched using just the URL,
 456     e.g. invidious/peertube instances
 457
 458     Embed-only extractors can be defined by setting _VALID_URL = False.
 459
 460     To support username + password (or netrc) login, the extractor must define a
 461     _NETRC_MACHINE and re-define _perform_login(username, password) and
 462     (optionally) _initialize_pre_login() methods. The _perform_login method will
 463     be called between _initialize_pre_login and _real_initialize if credentials
 464     are passed by the user. In cases where it is necessary to have the login
 465     process as part of the extraction rather than initialization, _perform_login
 466     can be left undefined.
 467
 468     _GEO_BYPASS attribute may be set to False in order to disable
 469     geo restriction bypass mechanisms for a particular extractor.
 470     Though it won't disable explicit geo restriction bypass based on
 471     country code provided with geo_bypass_country.
 472
 473     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 474     countries for this extractor. One of these countries will be used by
 475     geo restriction bypass mechanism right away in order to bypass
 476     geo restriction, of course, if the mechanism is not disabled.
 477
 478     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 479     IP blocks in CIDR notation for this extractor. One of these IP blocks
 480     will be used by geo restriction bypass mechanism similarly
 481     to _GEO_COUNTRIES.
 482
 483     The _ENABLED attribute should be set to False for IEs that
 484     are disabled by default and must be explicitly enabled.
 485
 486     The _WORKING attribute should be set to False for broken IEs
 487     in order to warn the users and skip the tests.
 488     """
 489
    # Class-level defaults; subclasses override the upper-case/underscore
    # configuration attributes, instances overwrite the state attributes.

    # Becomes True once initialize() has finished its one-time setup.
    _ready = False
    # NOTE(review): presumably assigned by set_downloader() (a YoutubeDL
    # instance per the constructor docstring) - confirm against that method.
    _downloader = None
    # Geo bypass is only (re)initialized while this is unset; the geo bypass
    # docstring describes it as the faked X-Forwarded-For IP.
    _x_forwarded_for_ip = None
    # Set to False to disable the geo restriction bypass mechanisms for an
    # extractor (an explicit geo_bypass_country still applies).
    _GEO_BYPASS = True
    # Optional list of presumably geo unrestricted countries.
    _GEO_COUNTRIES = None
    # Optional list of presumably geo unrestricted IP blocks (CIDR notation).
    _GEO_IP_BLOCKS = None
    # Set to False for broken IEs to warn the users and skip the tests.
    _WORKING = True
    # Set to False for IEs that are disabled by default and must be
    # explicitly enabled.
    _ENABLED = True
    # Must be defined to support username + password (or netrc) login;
    # supports_login() is simply its truthiness.
    _NETRC_MACHINE = None
    # NOTE(review): presumably a human-readable description of the extractor;
    # initialize() skips its "login not supported" warning when this is False.
    IE_DESC = None
    # NOTE(review): not used in the visible part of the class - presumably
    # only meaningful for search extractors; confirm.
    SEARCH_KEY = None
    # Regexp matched against URLs by _match_valid_url(); False marks an
    # embed-only extractor that never matches a URL directly.
    _VALID_URL = None
    # Regexes that are searched for in the HTML of Generic webpages.
    _EMBED_REGEX = []
 503
 504     def _login_hint(self, method=NO_DEFAULT, netrc=None):
 505         password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
 506         return {
 507             None: '',
 508             'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
 509             'password': f'Use {password_hint}',
 510             'cookies': (
 511                 'Use --cookies-from-browser or --cookies for the authentication. '
 512                 'See  https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl  for how to manually pass cookies'),
 513         }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']
 514
 515     def __init__(self, downloader=None):
 516         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 517         If a downloader is not passed during initialization,
 518         it must be set using "set_downloader()" before "extract()" is called"""
 519         self._ready = False
 520         self._x_forwarded_for_ip = None
 521         self._printed_messages = set()
 522         self.set_downloader(downloader)
 523
 524     @classmethod
 525     def _match_valid_url(cls, url):
 526         if cls._VALID_URL is False:
 527             return None
 528         # This does not use has/getattr intentionally - we want to know whether
 529         # we have cached the regexp for *this* class, whereas getattr would also
 530         # match the superclass
 531         if '_VALID_URL_RE' not in cls.__dict__:
 532             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 533         return cls._VALID_URL_RE.match(url)
 534
 535     @classmethod
 536     def suitable(cls, url):
 537         """Receives a URL and returns True if suitable for this IE."""
 538         # This function must import everything it needs (except other extractors),
 539         # so that lazy_extractors works correctly
 540         return cls._match_valid_url(url) is not None
 541
 542     @classmethod
 543     def _match_id(cls, url):
 544         return cls._match_valid_url(url).group('id')
 545
 546     @classmethod
 547     def get_temp_id(cls, url):
 548         try:
 549             return cls._match_id(url)
 550         except (IndexError, AttributeError):
 551             return None
 552
 553     @classmethod
 554     def working(cls):
 555         """Getter method for _WORKING."""
 556         return cls._WORKING
 557
 558     @classmethod
 559     def supports_login(cls):
 560         return bool(cls._NETRC_MACHINE)
 561
 562     def initialize(self):
 563         """Initializes an instance (authentication, etc)."""
 564         self._printed_messages = set()
 565         self._initialize_geo_bypass({
 566             'countries': self._GEO_COUNTRIES,
 567             'ip_blocks': self._GEO_IP_BLOCKS,
 568         })
 569         if not self._ready:
 570             self._initialize_pre_login()
 571             if self.supports_login():
 572                 username, password = self._get_login_info()
 573                 if username:
 574                     self._perform_login(username, password)
 575             elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
 576                 self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
 577             self._real_initialize()
 578             self._ready = True
 579
 580     def _initialize_geo_bypass(self, geo_bypass_context):
 581         """
 582         Initialize geo restriction bypass mechanism.
 583
 584         This method is used to initialize geo bypass mechanism based on faking
 585         X-Forwarded-For HTTP header. A random country from provided country list
 586         is selected and a random IP belonging to this country is generated. This
 587         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 588         HTTP requests.
 589
 590         This method will be used for initial geo bypass mechanism initialization
 591         during the instance initialization with _GEO_COUNTRIES and
 592         _GEO_IP_BLOCKS.
 593
 594         You may also manually call it from extractor's code if geo bypass
 595         information is not available beforehand (e.g. obtained during
 596         extraction) or due to some other reason. In this case you should pass
 597         this information in geo bypass context passed as first argument. It may
 598         contain following fields:
 599
 600         countries:  List of geo unrestricted countries (similar
 601                     to _GEO_COUNTRIES)
 602         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 603                     (similar to _GEO_IP_BLOCKS)
 604
 605         """
 606         if not self._x_forwarded_for_ip:
 607
 608             # Geo bypass mechanism is explicitly disabled by user
 609             if not self.get_param('geo_bypass', True):
 610                 return
 611
 612             if not geo_bypass_context:
 613                 geo_bypass_context = {}
 614
 615             # Backward compatibility: previously _initialize_geo_bypass
 616             # expected a list of countries, some 3rd party code may still use
 617             # it this way
 618             if isinstance(geo_bypass_context, (list, tuple)):
 619                 geo_bypass_context = {
 620                     'countries': geo_bypass_context,
 621                 }
 622
 623             # The whole point of geo bypass mechanism is to fake IP
 624             # as X-Forwarded-For HTTP header based on some IP block or
 625             # country code.
 626
 627             # Path 1: bypassing based on IP block in CIDR notation
 628
 629             # Explicit IP block specified by user, use it right away
 630             # regardless of whether extractor is geo bypassable or not
 631             ip_block = self.get_param('geo_bypass_ip_block', None)
 632
 633             # Otherwise use random IP block from geo bypass context but only
 634             # if extractor is known as geo bypassable
 635             if not ip_block:
 636                 ip_blocks = geo_bypass_context.get('ip_blocks')
 637                 if self._GEO_BYPASS and ip_blocks:
 638                     ip_block = random.choice(ip_blocks)
 639
 640             if ip_block:
 641                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 642                 self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
 643                 return
 644
 645             # Path 2: bypassing based on country code
 646
 647             # Explicit country code specified by user, use it right away
 648             # regardless of whether extractor is geo bypassable or not
 649             country = self.get_param('geo_bypass_country', None)
 650
 651             # Otherwise use random country code from geo bypass context but
 652             # only if extractor is known as geo bypassable
 653             if not country:
 654                 countries = geo_bypass_context.get('countries')
 655                 if self._GEO_BYPASS and countries:
 656                     country = random.choice(countries)
 657
 658             if country:
 659                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 660                 self._downloader.write_debug(
 661                     f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
 662
    def extract(self, url):
        """
        Extract the information for url and return the result of _real_extract.

        The instance is initialized before each attempt. When extraction fails
        with a GeoRestrictedError, it is retried once if a fake
        X-Forwarded-For IP could be set up for one of the reported countries.

        Returns None if _real_extract returned None, otherwise the (dict-like)
        result, annotated with the fake IP in use (if any).

        Raises the ExtractorError (sub)type that occurred, re-created with the
        video id, IE name and traceback filled in. UnsupportedError is
        propagated untouched. IncompleteRead, KeyError and StopIteration are
        wrapped into ExtractorError.
        """
        try:
            # At most two attempts: the second one only happens after a geo
            # restriction error for which a fake IP has just been chosen
            for _ in range(2):
                try:
                    self.initialize()
                    self.write_debug('Extracting URL: %s' % url)
                    ie_result = self._real_extract(url)
                    if ie_result is None:
                        return None
                    if self._x_forwarded_for_ip:
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    # The pops below act on the dict stored in ie_result (if any),
                    # i.e. the returned result is modified in place
                    subtitles = ie_result.get('subtitles') or {}
                    if 'no-live-chat' in self.get_param('compat_opts'):
                        for lang in ('live_chat', 'comments', 'danmaku'):
                            subtitles.pop(lang, None)
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except UnsupportedError:
            raise
        except ExtractorError as e:
            # Re-create the error as the same type so that missing context
            # (video id, IE name, traceback) is filled in
            kwargs = {
                'video_id': e.video_id or self.get_temp_id(url),
                'ie': self.IE_NAME,
                'tb': e.traceback or sys.exc_info()[2],
                'expected': e.expected,
                'cause': e.cause
            }
            # Only some error types carry a country list (e.g. geo restriction)
            if hasattr(e, 'countries'):
                kwargs['countries'] = e.countries
            raise type(e)(e.orig_msg, **kwargs)
        except http.client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 701
 702     def __maybe_fake_ip_and_retry(self, countries):
 703         if (not self.get_param('geo_bypass_country', None)
 704                 and self._GEO_BYPASS
 705                 and self.get_param('geo_bypass', True)
 706                 and not self._x_forwarded_for_ip
 707                 and countries):
 708             country_code = random.choice(countries)
 709             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 710             if self._x_forwarded_for_ip:
 711                 self.report_warning(
 712                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 713                     % (self._x_forwarded_for_ip, country_code.upper()))
 714                 return True
 715         return False
 716
 717     def set_downloader(self, downloader):
 718         """Sets a YoutubeDL instance as the downloader for this IE."""
 719         self._downloader = downloader
 720
 721     @property
 722     def cache(self):
 723         return self._downloader.cache
 724
 725     @property
 726     def cookiejar(self):
 727         return self._downloader.cookiejar
 728
 729     def _initialize_pre_login(self):
 730         """ Initialization before login. Redefine in subclasses."""
 731         pass
 732
 733     def _perform_login(self, username, password):
 734         """ Login with username and password. Redefine in subclasses."""
 735         pass
 736
 737     def _real_initialize(self):
 738         """Real initialization process. Redefine in subclasses."""
 739         pass
 740
 741     def _real_extract(self, url):
 742         """Real extraction process. Redefine in subclasses."""
 743         raise NotImplementedError('This method must be implemented by subclasses')
 744
 745     @classmethod
 746     def ie_key(cls):
 747         """A string for getting the InfoExtractor with get_info_extractor"""
 748         return cls.__name__[:-2]
 749
 750     @classproperty
 751     def IE_NAME(cls):
 752         return cls.__name__[:-2]
 753
 754     @staticmethod
 755     def __can_accept_status_code(err, expected_status):
 756         assert isinstance(err, urllib.error.HTTPError)
 757         if expected_status is None:
 758             return False
 759         elif callable(expected_status):
 760             return expected_status(err.code) is True
 761         else:
 762             return err.code in variadic(expected_status)
 763
 764     def _create_request(self, url_or_request, data=None, headers=None, query=None):
 765         if isinstance(url_or_request, urllib.request.Request):
 766             return update_Request(url_or_request, data=data, headers=headers, query=query)
 767         if query:
 768             url_or_request = update_url_query(url_or_request, query)
 769         return sanitized_Request(url_or_request, data, headers or {})
 770
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
        """
        Return the response handle.

        See _download_webpage_handle docstring for arguments specification.

        Besides the handle returned by the downloader, the return value may be:
        - the file object of an HTTP error whose status code is accepted by
          expected_status
        - False if the request failed and either errnote is False (regardless
          of fatal) or fatal is false

        Raises ExtractorError if the request failed, fatal is true and errnote
        is not False.
        """
        # Sleep before every request except the very first one made through
        # this downloader; the flag is cleared by that first request
        if not self._downloader._first_webpage_request:
            sleep_interval = self.get_param('sleep_interval_requests') or 0
            if sleep_interval > 0:
                self.to_screen('Sleeping %s seconds ...' % sleep_interval)
                time.sleep(sleep_interval)
        else:
            self._downloader._first_webpage_request = False

        # note=None prints the default message, note=False prints nothing
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen(str(note))
            else:
                self.to_screen(f'{video_id}: {note}')

        # Some sites check X-Forwarded-For HTTP header in order to figure out
        # the origin of the client behind proxy. This allows bypassing geo
        # restriction by faking this header's value to IP that belongs to some
        # geo unrestricted country. We will do so once we encounter any
        # geo restriction error.
        if self._x_forwarded_for_ip:
            # Work on a copy so that the caller's dict is not modified; an
            # explicitly passed X-Forwarded-For header takes precedence
            headers = (headers or {}).copy()
            headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)

        try:
            return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
        except network_exceptions as err:
            if isinstance(err, urllib.error.HTTPError):
                if self.__can_accept_status_code(err, expected_status):
                    # Retain reference to error to prevent file object from
                    # being closed before it can be read. Works around the
                    # effects of <https://bugs.python.org/issue15002>
                    # introduced in Python 3.4.1.
                    err.fp._error = err
                    return err.fp

            # errnote=False silences the failure completely, even if fatal
            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = f'{errnote}: {error_to_compat_str(err)}'
            if fatal:
                raise ExtractorError(errmsg, cause=err)
            else:
                self.report_warning(errmsg)
                return False
 825
 826     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
 827                                  encoding=None, data=None, headers={}, query={}, expected_status=None):
 828         """
 829         Return a tuple (page content as string, URL handle).
 830
 831         Arguments:
 832         url_or_request -- plain text URL as a string or
 833             a urllib.request.Request object
 834         video_id -- Video/playlist/item identifier (string)
 835
 836         Keyword arguments:
 837         note -- note printed before downloading (string)
 838         errnote -- note printed in case of an error (string)
 839         fatal -- flag denoting whether error should be considered fatal,
 840             i.e. whether it should cause ExtractionError to be raised,
 841             otherwise a warning will be reported and extraction continued
 842         encoding -- encoding for a page content decoding, guessed automatically
 843             when not explicitly specified
 844         data -- POST data (bytes)
 845         headers -- HTTP headers (dict)
 846         query -- URL query (dict)
 847         expected_status -- allows to accept failed HTTP requests (non 2xx
 848             status code) by explicitly specifying a set of accepted status
 849             codes. Can be any of the following entities:
 850                 - an integer type specifying an exact failed status code to
 851                   accept
 852                 - a list or a tuple of integer types specifying a list of
 853                   failed status codes to accept
 854                 - a callable accepting an actual failed status code and
 855                   returning True if it should be accepted
 856             Note that this argument does not affect success status codes (2xx)
 857             which are always accepted.
 858         """
 859
 860         # Strip hashes from the URL (#1038)
 861         if isinstance(url_or_request, str):
 862             url_or_request = url_or_request.partition('#')[0]
 863
 864         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 865         if urlh is False:
 866             assert not fatal
 867             return False
 868         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 869         return (content, urlh)
 870
 871     @staticmethod
 872     def _guess_encoding_from_content(content_type, webpage_bytes):
 873         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 874         if m:
 875             encoding = m.group(1)
 876         else:
 877             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 878                           webpage_bytes[:1024])
 879             if m:
 880                 encoding = m.group(1).decode('ascii')
 881             elif webpage_bytes.startswith(b'\xff\xfe'):
 882                 encoding = 'utf-16'
 883             else:
 884                 encoding = 'utf-8'
 885
 886         return encoding
 887
 888     def __check_blocked(self, content):
 889         first_block = content[:512]
 890         if ('<title>Access to this site is blocked</title>' in content
 891                 and 'Websense' in first_block):
 892             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 893             blocked_iframe = self._html_search_regex(
 894                 r'<iframe src="([^"]+)"', content,
 895                 'Websense information URL', default=None)
 896             if blocked_iframe:
 897                 msg += ' Visit %s for more details' % blocked_iframe
 898             raise ExtractorError(msg, expected=True)
 899         if '<title>The URL you requested has been blocked</title>' in first_block:
 900             msg = (
 901                 'Access to this webpage has been blocked by Indian censorship. '
 902                 'Use a VPN or proxy server (with --proxy) to route around it.')
 903             block_msg = self._html_search_regex(
 904                 r'</h1><p>(.*?)</p>',
 905                 content, 'block message', default=None)
 906             if block_msg:
 907                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 908             raise ExtractorError(msg, expected=True)
 909         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 910                 and 'blocklist.rkn.gov.ru' in content):
 911             raise ExtractorError(
 912                 'Access to this webpage has been blocked by decision of the Russian government. '
 913                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 914                 expected=True)
 915
 916     def _request_dump_filename(self, url, video_id):
 917         basen = f'{video_id}_{url}'
 918         trim_length = self.get_param('trim_file_name') or 240
 919         if len(basen) > trim_length:
 920             h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 921             basen = basen[:trim_length - len(h)] + h
 922         filename = sanitize_filename(f'{basen}.dump', restricted=True)
 923         # Working around MAX_PATH limitation on Windows (see
 924         # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 925         if compat_os_name == 'nt':
 926             absfilepath = os.path.abspath(filename)
 927             if len(absfilepath) > 259:
 928                 filename = fR'\\?\{absfilepath}'
 929         return filename
 930
 931     def __decode_webpage(self, webpage_bytes, encoding, headers):
 932         if not encoding:
 933             encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
 934         try:
 935             return webpage_bytes.decode(encoding, 'replace')
 936         except LookupError:
 937             return webpage_bytes.decode('utf-8', 'replace')
 938
 939     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 940         webpage_bytes = urlh.read()
 941         if prefix is not None:
 942             webpage_bytes = prefix + webpage_bytes
 943         if self.get_param('dump_intermediate_pages', False):
 944             self.to_screen('Dumping request to ' + urlh.geturl())
 945             dump = base64.b64encode(webpage_bytes).decode('ascii')
 946             self._downloader.to_screen(dump)
 947         if self.get_param('write_pages'):
 948             filename = self._request_dump_filename(urlh.geturl(), video_id)
 949             self.to_screen(f'Saving request to {filename}')
 950             with open(filename, 'wb') as outf:
 951                 outf.write(webpage_bytes)
 952
 953         content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
 954         self.__check_blocked(content)
 955
 956         return content
 957
 958     def __print_error(self, errnote, fatal, video_id, err):
 959         if fatal:
 960             raise ExtractorError(f'{video_id}: {errnote}', cause=err)
 961         elif errnote:
 962             self.report_warning(f'{video_id}: {errnote}: {err}')
 963
 964     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
 965         if transform_source:
 966             xml_string = transform_source(xml_string)
 967         try:
 968             return compat_etree_fromstring(xml_string.encode('utf-8'))
 969         except xml.etree.ElementTree.ParseError as ve:
 970             self.__print_error('Failed to parse XML' if errnote is None else errnote, fatal, video_id, ve)
 971
 972     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
 973         try:
 974             return json.loads(
 975                 json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
 976         except ValueError as ve:
 977             self.__print_error('Failed to parse JSON' if errnote is None else errnote, fatal, video_id, ve)
 978
 979     def _parse_socket_response_as_json(self, data, *args, **kwargs):
 980         return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
 981
    def __create_download_methods(name, parser, note, errnote, return_value):
        """
        Create the pair of methods (_download_<name>_handle, _download_<name>).

        This is a plain function evaluated in the class body, not a method.

        name -- infix used in the names of the generated methods
        parser -- name of the method used to parse the downloaded content, or
            None to return the content unparsed
        note, errnote -- default values of the generated methods' arguments
        return_value -- description of the return value, used in the
            generated docstrings only
        """

        def parse(ie, content, *args, errnote=errnote, **kwargs):
            if parser is None:
                return content
            # errnote is only forwarded to the parser when it is False
            if errnote is False:
                kwargs['errnote'] = errnote
            # parser is fetched by name so subclasses can override it
            return getattr(ie, parser)(content, *args, **kwargs)

        # Becomes _download_<name>_handle: returns (parsed content, URL handle),
        # or False if the download failed non-fatally
        def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                            fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            res = self._download_webpage_handle(
                url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
                data=data, headers=headers, query=query, expected_status=expected_status)
            if res is False:
                return res
            content, urlh = res
            return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh

        # Becomes _download_<name>: returns the parsed content only, or False
        # if the download failed non-fatally
        def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            # With the 'load_pages' param, a previously dumped response is
            # used instead of the network if its dump file can be read
            if self.get_param('load_pages'):
                url_or_request = self._create_request(url_or_request, data, headers, query)
                filename = self._request_dump_filename(url_or_request.full_url, video_id)
                self.to_screen(f'Loading request from {filename}')
                try:
                    with open(filename, 'rb') as dumpf:
                        webpage_bytes = dumpf.read()
                except OSError as e:
                    # Fall through to a regular download
                    self.report_warning(f'Unable to load request from disk: {e}')
                else:
                    content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
                    return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
            kwargs = {
                'note': note,
                'errnote': errnote,
                'transform_source': transform_source,
                'fatal': fatal,
                'encoding': encoding,
                'data': data,
                'headers': headers,
                'query': query,
                'expected_status': expected_status,
            }
            # Without a parser, transform_source is not passed on to the
            # handle method
            if parser is None:
                kwargs.pop('transform_source')
            # The method is fetched by name so subclasses can override _download_..._handle
            res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
            return res if res is False else res[0]

        # Give the generated function its final name, qualified name and docstring
        def impersonate(func, name, return_value):
            func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
            func.__doc__ = f'''
                @param transform_source     Apply this transformation before parsing
                @returns                    {return_value}

                See _download_webpage_handle docstring for other arguments specification
            '''

        impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
        impersonate(download_content, f'_download_{name}', f'{return_value}')
        return download_handle, download_content
1045
    # Generated (handle, content) download method pairs, one pair per response
    # format; see __create_download_methods for the arguments
    _download_xml_handle, _download_xml = __create_download_methods(
        'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
    _download_json_handle, _download_json = __create_download_methods(
        'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
    _download_socket_json_handle, _download_socket_json = __create_download_methods(
        'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
    # Plain webpages have no parser; only the content variant ([1]) is kept,
    # as a private helper used by _download_webpage
    __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
1053
1054     def _download_webpage(
1055             self, url_or_request, video_id, note=None, errnote=None,
1056             fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
1057         """
1058         Return the data of the page as a string.
1059
1060         Keyword arguments:
1061         tries -- number of tries
1062         timeout -- sleep interval between tries
1063
1064         See _download_webpage_handle docstring for other arguments specification.
1065         """
1066
1067         R''' # NB: These are unused; should they be deprecated?
1068         if tries != 1:
1069             self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
1070         if timeout is NO_DEFAULT:
1071             timeout = 5
1072         else:
1073             self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
1074         '''
1075
1076         try_count = 0
1077         while True:
1078             try:
1079                 return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
1080             except http.client.IncompleteRead as e:
1081                 try_count += 1
1082                 if try_count >= tries:
1083                     raise e
1084                 self._sleep(timeout, video_id)
1085
1086     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1087         idstr = format_field(video_id, None, '%s: ')
1088         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1089         if only_once:
1090             if f'WARNING: {msg}' in self._printed_messages:
1091                 return
1092             self._printed_messages.add(f'WARNING: {msg}')
1093         self._downloader.report_warning(msg, *args, **kwargs)
1094
1095     def to_screen(self, msg, *args, **kwargs):
1096         """Print msg to screen, prefixing it with '[ie_name]'"""
1097         self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1098
1099     def write_debug(self, msg, *args, **kwargs):
1100         self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1101
1102     def get_param(self, name, default=None, *args, **kwargs):
1103         if self._downloader:
1104             return self._downloader.params.get(name, default, *args, **kwargs)
1105         return default
1106
1107     def report_drm(self, video_id, partial=False):
1108         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1109
1110     def report_extraction(self, id_or_name):
1111         """Report information extraction."""
1112         self.to_screen('%s: Extracting information' % id_or_name)
1113
1114     def report_download_webpage(self, video_id):
1115         """Report webpage download."""
1116         self.to_screen('%s: Downloading webpage' % video_id)
1117
1118     def report_age_confirmation(self):
1119         """Report attempt to confirm age."""
1120         self.to_screen('Confirming age')
1121
1122     def report_login(self):
1123         """Report attempt to log in."""
1124         self.to_screen('Logging in')
1125
1126     def raise_login_required(
1127             self, msg='This video is only available for registered users',
1128             metadata_available=False, method=NO_DEFAULT):
1129         if metadata_available and (
1130                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1131             self.report_warning(msg)
1132             return
1133         msg += format_field(self._login_hint(method), None, '. %s')
1134         raise ExtractorError(msg, expected=True)
1135
1136     def raise_geo_restricted(
1137             self, msg='This video is not available from your location due to geo restriction',
1138             countries=None, metadata_available=False):
1139         if metadata_available and (
1140                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1141             self.report_warning(msg)
1142         else:
1143             raise GeoRestrictedError(msg, countries=countries)
1144
1145     def raise_no_formats(self, msg, expected=False, video_id=None):
1146         if expected and (
1147                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1148             self.report_warning(msg, video_id)
1149         elif isinstance(msg, ExtractorError):
1150             raise msg
1151         else:
1152             raise ExtractorError(msg, expected=expected, video_id=video_id)
1153
1154     # Methods for following #608
1155     @staticmethod
1156     def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1157         """Returns a URL that points to a page that should be processed"""
1158         if ie is not None:
1159             kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1160         if video_id is not None:
1161             kwargs['id'] = video_id
1162         if video_title is not None:
1163             kwargs['title'] = video_title
1164         return {
1165             **kwargs,
1166             '_type': 'url_transparent' if url_transparent else 'url',
1167             'url': url,
1168         }
1169
1170     @classmethod
1171     def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
1172                               getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
1173         return cls.playlist_result(
1174             (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
1175             playlist_id, playlist_title, **kwargs)
1176
1177     @staticmethod
1178     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1179         """Returns a playlist"""
1180         if playlist_id:
1181             kwargs['id'] = playlist_id
1182         if playlist_title:
1183             kwargs['title'] = playlist_title
1184         if playlist_description is not None:
1185             kwargs['description'] = playlist_description
1186         return {
1187             **kwargs,
1188             '_type': 'multi_video' if multi_video else 'playlist',
1189             'entries': entries,
1190         }
1191
1192     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1193         """
1194         Perform a regex search on the given string, using a single or a list of
1195         patterns returning the first matching group.
1196         In case of failure return a default value or raise a WARNING or a
1197         RegexNotFoundError, depending on fatal, specifying the field name.
1198         """
1199         if string is None:
1200             mobj = None
1201         elif isinstance(pattern, (str, re.Pattern)):
1202             mobj = re.search(pattern, string, flags)
1203         else:
1204             for p in pattern:
1205                 mobj = re.search(p, string, flags)
1206                 if mobj:
1207                     break
1208
1209         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1210
1211         if mobj:
1212             if group is None:
1213                 # return the first matching group
1214                 return next(g for g in mobj.groups() if g is not None)
1215             elif isinstance(group, (list, tuple)):
1216                 return tuple(mobj.group(g) for g in group)
1217             else:
1218                 return mobj.group(group)
1219         elif default is not NO_DEFAULT:
1220             return default
1221         elif fatal:
1222             raise RegexNotFoundError('Unable to extract %s' % _name)
1223         else:
1224             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1225             return None
1226
1227     def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
1228                      contains_pattern='(?s:.+)', fatal=True, default=NO_DEFAULT, **kwargs):
1229         """Searches string for the JSON object specified by start_pattern"""
1230         # NB: end_pattern is only used to reduce the size of the initial match
1231         if default is NO_DEFAULT:
1232             default, has_default = {}, False
1233         else:
1234             fatal, has_default = False, True
1235
1236         json_string = self._search_regex(
1237             rf'{start_pattern}\s*(?P<json>{{\s*{contains_pattern}\s*}})\s*{end_pattern}',
1238             string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
1239         if not json_string:
1240             return default
1241
1242         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1243         try:
1244             return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
1245         except ExtractorError as e:
1246             if fatal:
1247                 raise ExtractorError(
1248                     f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
1249             elif not has_default:
1250                 self.report_warning(
1251                     f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
1252         return default
1253
1254     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1255         """
1256         Like _search_regex, but strips HTML tags and unescapes entities.
1257         """
1258         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1259         if res:
1260             return clean_html(res).strip()
1261         else:
1262             return res
1263
1264     def _get_netrc_login_info(self, netrc_machine=None):
1265         username = None
1266         password = None
1267         netrc_machine = netrc_machine or self._NETRC_MACHINE
1268
1269         if self.get_param('usenetrc', False):
1270             try:
1271                 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1272                 if os.path.isdir(netrc_file):
1273                     netrc_file = os.path.join(netrc_file, '.netrc')
1274                 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1275                 if info is not None:
1276                     username = info[0]
1277                     password = info[2]
1278                 else:
1279                     raise netrc.NetrcParseError(
1280                         'No authenticators for %s' % netrc_machine)
1281             except (OSError, netrc.NetrcParseError) as err:
1282                 self.report_warning(
1283                     'parsing .netrc: %s' % error_to_compat_str(err))
1284
1285         return username, password
1286
1287     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1288         """
1289         Get the login info as (username, password)
1290         First look for the manually specified credentials using username_option
1291         and password_option as keys in params dictionary. If no such credentials
1292         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1293         value.
1294         If there's no info available, return (None, None)
1295         """
1296
1297         # Attempt to use provided username and password or .netrc data
1298         username = self.get_param(username_option)
1299         if username is not None:
1300             password = self.get_param(password_option)
1301         else:
1302             username, password = self._get_netrc_login_info(netrc_machine)
1303
1304         return username, password
1305
1306     def _get_tfa_info(self, note='two-factor verification code'):
1307         """
1308         Get the two-factor authentication info
1309         TODO - asking the user will be required for sms/phone verify
1310         currently just uses the command line option
1311         If there's no info available, return None
1312         """
1313
1314         tfa = self.get_param('twofactor')
1315         if tfa is not None:
1316             return tfa
1317
1318         return getpass.getpass('Type %s and press [Return]: ' % note)
1319
1320     # Helper functions for extracting OpenGraph info
1321     @staticmethod
1322     def _og_regexes(prop):
1323         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1324         property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1325                        % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
1326         template = r'<meta[^>]+?%s[^>]+?%s'
1327         return [
1328             template % (property_re, content_re),
1329             template % (content_re, property_re),
1330         ]
1331
1332     @staticmethod
1333     def _meta_regex(prop):
1334         return r'''(?isx)<meta
1335                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1336                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1337
1338     def _og_search_property(self, prop, html, name=None, **kargs):
1339         prop = variadic(prop)
1340         if name is None:
1341             name = 'OpenGraph %s' % prop[0]
1342         og_regexes = []
1343         for p in prop:
1344             og_regexes.extend(self._og_regexes(p))
1345         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1346         if escaped is None:
1347             return None
1348         return unescapeHTML(escaped)
1349
1350     def _og_search_thumbnail(self, html, **kargs):
1351         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1352
1353     def _og_search_description(self, html, **kargs):
1354         return self._og_search_property('description', html, fatal=False, **kargs)
1355
1356     def _og_search_title(self, html, *, fatal=False, **kargs):
1357         return self._og_search_property('title', html, fatal=fatal, **kargs)
1358
1359     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1360         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1361         if secure:
1362             regexes = self._og_regexes('video:secure_url') + regexes
1363         return self._html_search_regex(regexes, html, name, **kargs)
1364
1365     def _og_search_url(self, html, **kargs):
1366         return self._og_search_property('url', html, **kargs)
1367
1368     def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1369         return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1370
1371     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1372         name = variadic(name)
1373         if display_name is None:
1374             display_name = name[0]
1375         return self._html_search_regex(
1376             [self._meta_regex(n) for n in name],
1377             html, display_name, fatal=fatal, group='content', **kwargs)
1378
1379     def _dc_search_uploader(self, html):
1380         return self._html_search_meta('dc.creator', html, 'uploader')
1381
1382     @staticmethod
1383     def _rta_search(html):
1384         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1385         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1386                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1387                      html):
1388             return 18
1389
1390         # And then there are the jokers who advertise that they use RTA, but actually don't.
1391         AGE_LIMIT_MARKERS = [
1392             r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
1393         ]
1394         if any(re.search(marker, html) for marker in AGE_LIMIT_MARKERS):
1395             return 18
1396         return 0
1397
1398     def _media_rating_search(self, html):
1399         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1400         rating = self._html_search_meta('rating', html)
1401
1402         if not rating:
1403             return None
1404
1405         RATING_TABLE = {
1406             'safe for kids': 0,
1407             'general': 8,
1408             '14 years': 14,
1409             'mature': 17,
1410             'restricted': 19,
1411         }
1412         return RATING_TABLE.get(rating.lower())
1413
1414     def _family_friendly_search(self, html):
1415         # See http://schema.org/VideoObject
1416         family_friendly = self._html_search_meta(
1417             'isFamilyFriendly', html, default=None)
1418
1419         if not family_friendly:
1420             return None
1421
1422         RATING_TABLE = {
1423             '1': 0,
1424             'true': 0,
1425             '0': 18,
1426             'false': 18,
1427         }
1428         return RATING_TABLE.get(family_friendly.lower())
1429
1430     def _twitter_search_player(self, html):
1431         return self._html_search_meta('twitter:player', html,
1432                                       'twitter card player')
1433
1434     def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
1435         """Yield all json ld objects in the html"""
1436         if default is not NO_DEFAULT:
1437             fatal = False
1438         for mobj in re.finditer(JSON_LD_RE, html):
1439             json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
1440             for json_ld in variadic(json_ld_item):
1441                 if isinstance(json_ld, dict):
1442                     yield json_ld
1443
1444     def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
1445         """Search for a video in any json ld in the html"""
1446         if default is not NO_DEFAULT:
1447             fatal = False
1448         info = self._json_ld(
1449             list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
1450             video_id, fatal=fatal, expected_type=expected_type)
1451         if info:
1452             return info
1453         if default is not NO_DEFAULT:
1454             return default
1455         elif fatal:
1456             raise RegexNotFoundError('Unable to extract JSON-LD')
1457         else:
1458             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1459             return {}
1460
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """
        Build an info dict from JSON-LD data.

        json_ld may be a JSON string (parsed with _parse_json), a single dict or
        a list/tuple of entries. Empty input or any other type yields {}.
        If expected_type is given, entries whose '@type' does not contain it are
        skipped and traversal stops after the first entry that was processed.
        The collected info is passed through filter_dict before being returned.
        """
        if isinstance(json_ld, str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            json_ld = [json_ld]

        # Last path component of an interactionType -> name of the "<kind>_count" field
        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def is_type(e, *expected_types):
            # '@type' may be a single value or a collection; true if any expected type is among them
            type = variadic(traverse_obj(e, '@type'))
            return any(x in type for x in expected_types)

        def extract_interaction_type(e):
            # interactionType is either given directly or as an object carrying '@type'
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            # Fills info['<kind>_count'] from the InteractionCounter items of e;
            # a count that is already set in info is never overwritten
            interaction_statistic = e.get('interactionStatistic')
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not is_type(is_e, 'InteractionCounter'):
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For interaction count some sites provide string instead of
                # an integer (as per spec) with non digit characters (e.g. ",")
                # so extracting count with more relaxed str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                # Only the part after the last '/' is looked up in the map
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_chapter_information(e):
            # Chapters are the 'hasPart' items whose '@type' is exactly 'Clip'
            chapters = [{
                'title': part.get('name'),
                'start_time': part.get('startOffset'),
                'end_time': part.get('endOffset'),
            } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
            # Fill gaps from the neighbours: a falsy end_time is taken from the next chapter's
            # start_time and a falsy start_time from the previous chapter's end_time (0 for the first).
            # zip() stops at the shortest input (chapters[1:]), so the last chapter is not visited here
            for idx, (last_c, current_c, next_c) in enumerate(zip(
                    [{'end_time': 0}] + chapters, chapters, chapters[1:])):
                current_c['end_time'] = current_c['end_time'] or next_c['start_time']
                current_c['start_time'] = current_c['start_time'] or last_c['end_time']
                if None in current_c.values():
                    self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
                    return
            if chapters:
                # The last chapter falls back to the video duration; info['duration'] has been
                # set (possibly to None) by extract_video_object before this function is called
                chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
                info['chapters'] = chapters

        def extract_video_object(e):
            # Overwrites the listed keys of info with the values of the VideoObject e
            assert is_type(e, 'VideoObject')
            author = e.get('author')
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnails': [{'url': unescapeHTML(url)}
                               for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
                               if url_or_none(url)],
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of 'Organization' or 'Person' types.
                # both types can have 'name' property(inherited from 'Thing' type). [1]
                # however some websites are using 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
                'filesize': int_or_none(float_or_none(e.get('contentSize'))),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })
            extract_interaction_statistic(e)
            extract_chapter_information(e)

        def traverse_json_ld(json_ld, at_top_level=True):
            for e in json_ld:
                # Top-level entries must carry an '@context'
                if at_top_level and '@context' not in e:
                    continue
                # An entry made up solely of '@context' and '@graph' is a container: process the
                # items of its graph instead and stop looking at the remaining top-level entries
                if at_top_level and set(e.keys()) == {'@context', '@graph'}:
                    traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
                    break
                if expected_type is not None and not is_type(e, expected_type):
                    continue
                rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
                if rating is not None:
                    info['average_rating'] = rating
                if is_type(e, 'TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    # The episode name is only used as title when no title was found so far
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif is_type(e, 'Movie'):
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif is_type(e, 'Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody') or e.get('description')),
                    })
                    # Only the first item of 'video' (or else of 'subjectOf') is considered here
                    if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
                        extract_video_object(e['video'][0])
                    elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
                        extract_video_object(e['subjectOf'][0])
                elif is_type(e, 'VideoObject'):
                    extract_video_object(e)
                    # Without an expected_type every entry is visited; with one, the first hit ends the search
                    if expected_type is None:
                        continue
                    else:
                        break
                # A VideoObject embedded directly under 'video' (not in a list) is extracted for any entry type
                video = e.get('video')
                if is_type(video, 'VideoObject'):
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break
        traverse_json_ld(json_ld)

        return filter_dict(info)
1625
1626     def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1627         return self._parse_json(
1628             self._search_regex(
1629                 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1630                 webpage, 'next.js data', fatal=fatal, **kw),
1631             video_id, transform_source=transform_source, fatal=fatal)
1632
1633     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
1634         """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
1635         rectx = re.escape(context_name)
1636         FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
1637         js, arg_keys, arg_vals = self._search_regex(
1638             (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
1639             webpage, context_name, group=('js', 'arg_keys', 'arg_vals'), fatal=fatal)
1640
1641         args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
1642
1643         for key, val in args.items():
1644             if val in ('undefined', 'void 0'):
1645                 args[key] = 'null'
1646
1647         ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
1648         return traverse_obj(ret, traverse) or {}
1649
1650     @staticmethod
1651     def _hidden_inputs(html):
1652         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1653         hidden_inputs = {}
1654         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1655             attrs = extract_attributes(input)
1656             if not input:
1657                 continue
1658             if attrs.get('type') not in ('hidden', 'submit'):
1659                 continue
1660             name = attrs.get('name') or attrs.get('id')
1661             value = attrs.get('value')
1662             if name and value is not None:
1663                 hidden_inputs[name] = value
1664         return hidden_inputs
1665
1666     def _form_hidden_inputs(self, form_id, html):
1667         form = self._search_regex(
1668             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1669             html, '%s form' % form_id, group='form')
1670         return self._hidden_inputs(form)
1671
1672     class FormatSort:
        # Pattern for a single sort item: an optional '+' (group 'reverse'), the field name,
        # and optionally a separator ('~' or ':') followed by a limit
        regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

        # Sort order that evaluate_params appends after the user's and the extractor's own items
        default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
                   'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
                   'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
        # NOTE(review): presumably the youtube-dl compatible order; it is not referenced
        # in the part of the class visible here - confirm against its users
        ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                        'height', 'width', 'proto', 'vext', 'abr', 'aext',
                        'fps', 'fs_approx', 'source', 'id')

        # Per-field configuration, read through _get_field_setting, which also stores the
        # default for any key that is missing (i.e. this dict is modified at runtime).
        # 'convert' selects how _resolve_field_value turns a limit into a comparable value;
        # 'order' / 'order_free' / 'regex' are used by the 'order' conversion
        settings = {
            'vcodec': {'type': 'ordered', 'regex': True,
                       'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
            'acodec': {'type': 'ordered', 'regex': True,
                       'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
            'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                    'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
            'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                      'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
            'vext': {'type': 'ordered', 'field': 'video_ext',
                     'order': ('mp4', 'webm', 'flv', '', 'none'),
                     'order_free': ('webm', 'mp4', 'flv', '', 'none')},
            'aext': {'type': 'ordered', 'field': 'audio_ext',
                     'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
                     'order_free': ('ogg', 'opus', 'webm', 'mp3', 'm4a', 'aac', '', 'none')},
            'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
            'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                           'field': ('vcodec', 'acodec'),
                           'function': lambda it: int(any(v != 'none' for v in it))},
            'ie_pref': {'priority': True, 'type': 'extractor'},
            'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
            'quality': {'convert': 'float', 'default': -1},
            'filesize': {'convert': 'bytes'},
            'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
            'id': {'convert': 'string', 'field': 'format_id'},
            'height': {'convert': 'float_none'},
            'width': {'convert': 'float_none'},
            'fps': {'convert': 'float_none'},
            'channels': {'convert': 'float_none', 'field': 'audio_channels'},
            'tbr': {'convert': 'float_none'},
            'vbr': {'convert': 'float_none'},
            'abr': {'convert': 'float_none'},
            'asr': {'convert': 'float_none'},
            'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

            'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
            'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
            'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
            'ext': {'type': 'combined', 'field': ('vext', 'aext')},
            'res': {'type': 'multiple', 'field': ('height', 'width'),
                    'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

            # Actual field names
            'format_id': {'type': 'alias', 'field': 'id'},
            'preference': {'type': 'alias', 'field': 'ie_pref'},
            'language_preference': {'type': 'alias', 'field': 'lang'},
            'source_preference': {'type': 'alias', 'field': 'source'},
            'protocol': {'type': 'alias', 'field': 'proto'},
            'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
            'audio_channels': {'type': 'alias', 'field': 'channels'},

            # Deprecated
            'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
            'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
            'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
            'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
            'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
            'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
            'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
            'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
            'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
            'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
            'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
            'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
            'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
            'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
            'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
            'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
            'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
            'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
            'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
            'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
        }
1757
1758         def __init__(self, ie, field_preference):
1759             self._order = []
1760             self.ydl = ie._downloader
1761             self.evaluate_params(self.ydl.params, field_preference)
1762             if ie.get_param('verbose'):
1763                 self.print_verbose_info(self.ydl.write_debug)
1764
1765         def _get_field_setting(self, field, key):
1766             if field not in self.settings:
1767                 if key in ('forced', 'priority'):
1768                     return False
1769                 self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
1770                                             'deprecated and may be removed in a future version')
1771                 self.settings[field] = {}
1772             propObj = self.settings[field]
1773             if key not in propObj:
1774                 type = propObj.get('type')
1775                 if key == 'field':
1776                     default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1777                 elif key == 'convert':
1778                     default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1779                 else:
1780                     default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1781                 propObj[key] = default
1782             return propObj[key]
1783
1784         def _resolve_field_value(self, field, value, convertNone=False):
1785             if value is None:
1786                 if not convertNone:
1787                     return None
1788             else:
1789                 value = value.lower()
1790             conversion = self._get_field_setting(field, 'convert')
1791             if conversion == 'ignore':
1792                 return None
1793             if conversion == 'string':
1794                 return value
1795             elif conversion == 'float_none':
1796                 return float_or_none(value)
1797             elif conversion == 'bytes':
1798                 return FileDownloader.parse_bytes(value)
1799             elif conversion == 'order':
1800                 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1801                 use_regex = self._get_field_setting(field, 'regex')
1802                 list_length = len(order_list)
1803                 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1804                 if use_regex and value is not None:
1805                     for i, regex in enumerate(order_list):
1806                         if regex and re.match(regex, value):
1807                             return list_length - i
1808                     return list_length - empty_pos  # not in list
1809                 else:  # not regex or  value = None
1810                     return list_length - (order_list.index(value) if value in order_list else empty_pos)
1811             else:
1812                 if value.isnumeric():
1813                     return float(value)
1814                 else:
1815                     self.settings[field]['convert'] = 'string'
1816                     return value
1817
1818         def evaluate_params(self, params, sort_extractor):
1819             self._use_free_order = params.get('prefer_free_formats', False)
1820             self._sort_user = params.get('format_sort', [])
1821             self._sort_extractor = sort_extractor
1822
1823             def add_item(field, reverse, closest, limit_text):
1824                 field = field.lower()
1825                 if field in self._order:
1826                     return
1827                 self._order.append(field)
1828                 limit = self._resolve_field_value(field, limit_text)
1829                 data = {
1830                     'reverse': reverse,
1831                     'closest': False if limit is None else closest,
1832                     'limit_text': limit_text,
1833                     'limit': limit}
1834                 if field in self.settings:
1835                     self.settings[field].update(data)
1836                 else:
1837                     self.settings[field] = data
1838
1839             sort_list = (
1840                 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1841                 + (tuple() if params.get('format_sort_force', False)
1842                    else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1843                 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1844
1845             for item in sort_list:
1846                 match = re.match(self.regex, item)
1847                 if match is None:
1848                     raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1849                 field = match.group('field')
1850                 if field is None:
1851                     continue
1852                 if self._get_field_setting(field, 'type') == 'alias':
1853                     alias, field = field, self._get_field_setting(field, 'field')
1854                     if self._get_field_setting(alias, 'deprecated'):
1855                         self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
1856                                                     'be removed in a future version. Please use {field} instead')
1857                 reverse = match.group('reverse') is not None
1858                 closest = match.group('separator') == '~'
1859                 limit_text = match.group('limit')
1860
1861                 has_limit = limit_text is not None
1862                 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1863                 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1864
1865                 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1866                 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
1867                 limit_count = len(limits)
1868                 for (i, f) in enumerate(fields):
1869                     add_item(f, reverse, closest,
1870                              limits[i] if i < limit_count
1871                              else limits[0] if has_limit and not has_multiple_limits
1872                              else None)
1873
1874         def print_verbose_info(self, write_debug):
1875             if self._sort_user:
1876                 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1877             if self._sort_extractor:
1878                 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1879             write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1880                 '+' if self._get_field_setting(field, 'reverse') else '', field,
1881                 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1882                               self._get_field_setting(field, 'limit_text'),
1883                               self._get_field_setting(field, 'limit'))
1884                 if self._get_field_setting(field, 'limit_text') is not None else '')
1885                 for field in self._order if self._get_field_setting(field, 'visible')]))
1886
1887         def _calculate_field_preference_from_value(self, format, field, type, value):
1888             reverse = self._get_field_setting(field, 'reverse')
1889             closest = self._get_field_setting(field, 'closest')
1890             limit = self._get_field_setting(field, 'limit')
1891
1892             if type == 'extractor':
1893                 maximum = self._get_field_setting(field, 'max')
1894                 if value is None or (maximum is not None and value >= maximum):
1895                     value = -1
1896             elif type == 'boolean':
1897                 in_list = self._get_field_setting(field, 'in_list')
1898                 not_in_list = self._get_field_setting(field, 'not_in_list')
1899                 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1900             elif type == 'ordered':
1901                 value = self._resolve_field_value(field, value, True)
1902
1903             # try to convert to number
1904             val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1905             is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1906             if is_num:
1907                 value = val_num
1908
1909             return ((-10, 0) if value is None
1910                     else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
1911                     else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1912                     else (0, value, 0) if not reverse and (limit is None or value <= limit)
1913                     else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1914                     else (-1, value, 0))
1915
1916         def _calculate_field_preference(self, format, field):
1917             type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
1918             get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1919             if type == 'multiple':
1920                 type = 'field'  # Only 'field' is allowed in multiple for now
1921                 actual_fields = self._get_field_setting(field, 'field')
1922
1923                 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1924             else:
1925                 value = get_value(field)
1926             return self._calculate_field_preference_from_value(format, field, type, value)
1927
1928         def calculate_preference(self, format):
1929             # Determine missing protocol
1930             if not format.get('protocol'):
1931                 format['protocol'] = determine_protocol(format)
1932
1933             # Determine missing ext
1934             if not format.get('ext') and 'url' in format:
1935                 format['ext'] = determine_ext(format['url'])
1936             if format.get('vcodec') == 'none':
1937                 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1938                 format['video_ext'] = 'none'
1939             else:
1940                 format['video_ext'] = format['ext']
1941                 format['audio_ext'] = 'none'
1942             # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
1943             #    format['preference'] = -1000
1944
1945             # Determine missing bitrates
1946             if format.get('tbr') is None:
1947                 if format.get('vbr') is not None and format.get('abr') is not None:
1948                     format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1949             else:
1950                 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1951                     format['vbr'] = format.get('tbr') - format.get('abr', 0)
1952                 if format.get('acodec') != 'none' and format.get('abr') is None:
1953                     format['abr'] = format.get('tbr') - format.get('vbr', 0)
1954
1955             return tuple(self._calculate_field_preference(format, field) for field in self._order)
1956
1957     def _sort_formats(self, formats, field_preference=[]):
1958         if not formats:
1959             return
1960         formats.sort(key=self.FormatSort(self, field_preference).calculate_preference)
1961
1962     def _check_formats(self, formats, video_id):
1963         if formats:
1964             formats[:] = filter(
1965                 lambda f: self._is_valid_url(
1966                     f['url'], video_id,
1967                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1968                 formats)
1969
1970     @staticmethod
1971     def _remove_duplicate_formats(formats):
1972         format_urls = set()
1973         unique_formats = []
1974         for f in formats:
1975             if f['url'] not in format_urls:
1976                 format_urls.add(f['url'])
1977                 unique_formats.append(f)
1978         formats[:] = unique_formats
1979
1980     def _is_valid_url(self, url, video_id, item='video', headers={}):
1981         url = self._proto_relative_url(url, scheme='http:')
1982         # For now assume non HTTP(S) URLs always valid
1983         if not (url.startswith('http://') or url.startswith('https://')):
1984             return True
1985         try:
1986             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1987             return True
1988         except ExtractorError as e:
1989             self.to_screen(
1990                 '%s: %s URL is invalid, skipping: %s'
1991                 % (video_id, item, error_to_compat_str(e.cause)))
1992             return False
1993
1994     def http_scheme(self):
1995         """ Either "http:" or "https:", depending on the user's preferences """
1996         return (
1997             'http:'
1998             if self.get_param('prefer_insecure', False)
1999             else 'https:')
2000
2001     def _proto_relative_url(self, url, scheme=None):
2002         scheme = scheme or self.http_scheme()
2003         assert scheme.endswith(':')
2004         return sanitize_url(url, scheme=scheme[:-1])
2005
2006     def _sleep(self, timeout, video_id, msg_template=None):
2007         if msg_template is None:
2008             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
2009         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
2010         self.to_screen(msg)
2011         time.sleep(timeout)
2012
2013     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
2014                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
2015                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
2016         res = self._download_xml_handle(
2017             manifest_url, video_id, 'Downloading f4m manifest',
2018             'Unable to download f4m manifest',
2019             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
2020             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
2021             transform_source=transform_source,
2022             fatal=fatal, data=data, headers=headers, query=query)
2023         if res is False:
2024             return []
2025
2026         manifest, urlh = res
2027         manifest_url = urlh.geturl()
2028
2029         return self._parse_f4m_formats(
2030             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2031             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
2032
2033     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
2034                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
2035                            fatal=True, m3u8_id=None):
2036         if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
2037             return []
2038
2039         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
2040         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
2041         if akamai_pv is not None and ';' in akamai_pv.text:
2042             playerVerificationChallenge = akamai_pv.text.split(';')[0]
2043             if playerVerificationChallenge.strip() != '':
2044                 return []
2045
2046         formats = []
2047         manifest_version = '1.0'
2048         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
2049         if not media_nodes:
2050             manifest_version = '2.0'
2051             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
2052         # Remove unsupported DRM protected media from final formats
2053         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
2054         media_nodes = remove_encrypted_media(media_nodes)
2055         if not media_nodes:
2056             return formats
2057
2058         manifest_base_url = get_base_url(manifest)
2059
2060         bootstrap_info = xpath_element(
2061             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
2062             'bootstrap info', default=None)
2063
2064         vcodec = None
2065         mime_type = xpath_text(
2066             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
2067             'base URL', default=None)
2068         if mime_type and mime_type.startswith('audio/'):
2069             vcodec = 'none'
2070
2071         for i, media_el in enumerate(media_nodes):
2072             tbr = int_or_none(media_el.attrib.get('bitrate'))
2073             width = int_or_none(media_el.attrib.get('width'))
2074             height = int_or_none(media_el.attrib.get('height'))
2075             format_id = join_nonempty(f4m_id, tbr or i)
2076             # If <bootstrapInfo> is present, the specified f4m is a
2077             # stream-level manifest, and only set-level manifests may refer to
2078             # external resources.  See section 11.4 and section 4 of F4M spec
2079             if bootstrap_info is None:
2080                 media_url = None
2081                 # @href is introduced in 2.0, see section 11.6 of F4M spec
2082                 if manifest_version == '2.0':
2083                     media_url = media_el.attrib.get('href')
2084                 if media_url is None:
2085                     media_url = media_el.attrib.get('url')
2086                 if not media_url:
2087                     continue
2088                 manifest_url = (
2089                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
2090                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
2091                 # If media_url is itself a f4m manifest do the recursive extraction
2092                 # since bitrates in parent manifest (this one) and media_url manifest
2093                 # may differ leading to inability to resolve the format by requested
2094                 # bitrate in f4m downloader
2095                 ext = determine_ext(manifest_url)
2096                 if ext == 'f4m':
2097                     f4m_formats = self._extract_f4m_formats(
2098                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2099                         transform_source=transform_source, fatal=fatal)
2100                     # Sometimes stream-level manifest contains single media entry that
2101                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
2102                     # At the same time parent's media entry in set-level manifest may
2103                     # contain it. We will copy it from parent in such cases.
2104                     if len(f4m_formats) == 1:
2105                         f = f4m_formats[0]
2106                         f.update({
2107                             'tbr': f.get('tbr') or tbr,
2108                             'width': f.get('width') or width,
2109                             'height': f.get('height') or height,
2110                             'format_id': f.get('format_id') if not tbr else format_id,
2111                             'vcodec': vcodec,
2112                         })
2113                     formats.extend(f4m_formats)
2114                     continue
2115                 elif ext == 'm3u8':
2116                     formats.extend(self._extract_m3u8_formats(
2117                         manifest_url, video_id, 'mp4', preference=preference,
2118                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
2119                     continue
2120             formats.append({
2121                 'format_id': format_id,
2122                 'url': manifest_url,
2123                 'manifest_url': manifest_url,
2124                 'ext': 'flv' if bootstrap_info is not None else None,
2125                 'protocol': 'f4m',
2126                 'tbr': tbr,
2127                 'width': width,
2128                 'height': height,
2129                 'vcodec': vcodec,
2130                 'preference': preference,
2131                 'quality': quality,
2132             })
2133         return formats
2134
2135     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
2136         return {
2137             'format_id': join_nonempty(m3u8_id, 'meta'),
2138             'url': m3u8_url,
2139             'ext': ext,
2140             'protocol': 'm3u8',
2141             'preference': preference - 100 if preference else -100,
2142             'quality': quality,
2143             'resolution': 'multiple',
2144             'format_note': 'Quality selection URL',
2145         }
2146
2147     def _report_ignoring_subs(self, name):
2148         self.report_warning(bug_reports_message(
2149             f'Ignoring subtitle tracks found in the {name} manifest; '
2150             'if any subtitle tracks are missing,'
2151         ), only_once=True)
2152
2153     def _extract_m3u8_formats(self, *args, **kwargs):
2154         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2155         if subs:
2156             self._report_ignoring_subs('HLS')
2157         return fmts
2158
2159     def _extract_m3u8_formats_and_subtitles(
2160             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2161             preference=None, quality=None, m3u8_id=None, note=None,
2162             errnote=None, fatal=True, live=False, data=None, headers={},
2163             query={}):
2164
2165         res = self._download_webpage_handle(
2166             m3u8_url, video_id,
2167             note='Downloading m3u8 information' if note is None else note,
2168             errnote='Failed to download m3u8 information' if errnote is None else errnote,
2169             fatal=fatal, data=data, headers=headers, query=query)
2170
2171         if res is False:
2172             return [], {}
2173
2174         m3u8_doc, urlh = res
2175         m3u8_url = urlh.geturl()
2176
2177         return self._parse_m3u8_formats_and_subtitles(
2178             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2179             preference=preference, quality=quality, m3u8_id=m3u8_id,
2180             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2181             headers=headers, query=query, video_id=video_id)
2182
2183     def _parse_m3u8_formats_and_subtitles(
2184             self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
2185             preference=None, quality=None, m3u8_id=None, live=False, note=None,
2186             errnote=None, fatal=True, data=None, headers={}, query={},
2187             video_id=None):
2188         formats, subtitles = [], {}
2189
2190         has_drm = re.search('|'.join([
2191             r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
2192             r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
2193         ]), m3u8_doc)
2194
2195         def format_url(url):
2196             return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)
2197
2198         if self.get_param('hls_split_discontinuity', False):
2199             def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2200                 if not m3u8_doc:
2201                     if not manifest_url:
2202                         return []
2203                     m3u8_doc = self._download_webpage(
2204                         manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2205                         note=False, errnote='Failed to download m3u8 playlist information')
2206                     if m3u8_doc is False:
2207                         return []
2208                 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2209
2210         else:
2211             def _extract_m3u8_playlist_indices(*args, **kwargs):
2212                 return [None]
2213
2214         # References:
2215         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2216         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2217         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2218
2219         # We should try extracting formats only from master playlists [1, 4.3.4],
2220         # i.e. playlists that describe available qualities. On the other hand
2221         # media playlists [1, 4.3.3] should be returned as is since they contain
2222         # just the media without qualities renditions.
2223         # Fortunately, master playlist can be easily distinguished from media
2224         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
2225         # master playlist tags MUST NOT appear in a media playlist and vice versa.
2226         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2227         # media playlist and MUST NOT appear in master playlist thus we can
2228         # clearly detect media playlist with this criterion.
2229
2230         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
2231             formats = [{
2232                 'format_id': join_nonempty(m3u8_id, idx),
2233                 'format_index': idx,
2234                 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
2235                 'ext': ext,
2236                 'protocol': entry_protocol,
2237                 'preference': preference,
2238                 'quality': quality,
2239                 'has_drm': has_drm,
2240             } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2241
2242             return formats, subtitles
2243
2244         groups = {}
2245         last_stream_inf = {}
2246
2247         def extract_media(x_media_line):
2248             media = parse_m3u8_attributes(x_media_line)
2249             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2250             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2251             if not (media_type and group_id and name):
2252                 return
2253             groups.setdefault(group_id, []).append(media)
2254             # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2255             if media_type == 'SUBTITLES':
2256                 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2257                 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2258                 # However, lack of URI has been spotted in the wild.
2259                 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2260                 if not media.get('URI'):
2261                     return
2262                 url = format_url(media['URI'])
2263                 sub_info = {
2264                     'url': url,
2265                     'ext': determine_ext(url),
2266                 }
2267                 if sub_info['ext'] == 'm3u8':
2268                     # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2269                     # files may contain is WebVTT:
2270                     # <https://tools.ietf.org/html/rfc8216#section-3.1>
2271                     sub_info['ext'] = 'vtt'
2272                     sub_info['protocol'] = 'm3u8_native'
2273                 lang = media.get('LANGUAGE') or 'und'
2274                 subtitles.setdefault(lang, []).append(sub_info)
2275             if media_type not in ('VIDEO', 'AUDIO'):
2276                 return
2277             media_url = media.get('URI')
2278             if media_url:
2279                 manifest_url = format_url(media_url)
2280                 formats.extend({
2281                     'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2282                     'format_note': name,
2283                     'format_index': idx,
2284                     'url': manifest_url,
2285                     'manifest_url': m3u8_url,
2286                     'language': media.get('LANGUAGE'),
2287                     'ext': ext,
2288                     'protocol': entry_protocol,
2289                     'preference': preference,
2290                     'quality': quality,
2291                     'vcodec': 'none' if media_type == 'AUDIO' else None,
2292                 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2293
2294         def build_stream_name():
2295             # Despite specification does not mention NAME attribute for
2296             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2297             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2298             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2299             stream_name = last_stream_inf.get('NAME')
2300             if stream_name:
2301                 return stream_name
2302             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2303             # from corresponding rendition group
2304             stream_group_id = last_stream_inf.get('VIDEO')
2305             if not stream_group_id:
2306                 return
2307             stream_group = groups.get(stream_group_id)
2308             if not stream_group:
2309                 return stream_group_id
2310             rendition = stream_group[0]
2311             return rendition.get('NAME') or stream_group_id
2312
2313         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2314         # chance to detect video only formats when EXT-X-STREAM-INF tags
2315         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2316         for line in m3u8_doc.splitlines():
2317             if line.startswith('#EXT-X-MEDIA:'):
2318                 extract_media(line)
2319
2320         for line in m3u8_doc.splitlines():
2321             if line.startswith('#EXT-X-STREAM-INF:'):
2322                 last_stream_inf = parse_m3u8_attributes(line)
2323             elif line.startswith('#') or not line.strip():
2324                 continue
2325             else:
2326                 tbr = float_or_none(
2327                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2328                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2329                 manifest_url = format_url(line.strip())
2330
2331                 for idx in _extract_m3u8_playlist_indices(manifest_url):
2332                     format_id = [m3u8_id, None, idx]
2333                     # Bandwidth of live streams may differ over time thus making
2334                     # format_id unpredictable. So it's better to keep provided
2335                     # format_id intact.
2336                     if not live:
2337                         stream_name = build_stream_name()
2338                         format_id[1] = stream_name or '%d' % (tbr or len(formats))
2339                     f = {
2340                         'format_id': join_nonempty(*format_id),
2341                         'format_index': idx,
2342                         'url': manifest_url,
2343                         'manifest_url': m3u8_url,
2344                         'tbr': tbr,
2345                         'ext': ext,
2346                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2347                         'protocol': entry_protocol,
2348                         'preference': preference,
2349                         'quality': quality,
2350                     }
2351                     resolution = last_stream_inf.get('RESOLUTION')
2352                     if resolution:
2353                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2354                         if mobj:
2355                             f['width'] = int(mobj.group('width'))
2356                             f['height'] = int(mobj.group('height'))
2357                     # Unified Streaming Platform
2358                     mobj = re.search(
2359                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2360                     if mobj:
2361                         abr, vbr = mobj.groups()
2362                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2363                         f.update({
2364                             'vbr': vbr,
2365                             'abr': abr,
2366                         })
2367                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2368                     f.update(codecs)
2369                     audio_group_id = last_stream_inf.get('AUDIO')
2370                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2371                     # references a rendition group MUST have a CODECS attribute.
2372                     # However, this is not always respected. E.g. [2]
2373                     # contains EXT-X-STREAM-INF tag which references AUDIO
2374                     # rendition group but does not have CODECS and despite
2375                     # referencing an audio group it represents a complete
2376                     # (with audio and video) format. So, for such cases we will
2377                     # ignore references to rendition groups and treat them
2378                     # as complete formats.
2379                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2380                         audio_group = groups.get(audio_group_id)
2381                         if audio_group and audio_group[0].get('URI'):
2382                             # TODO: update acodec for audio only formats with
2383                             # the same GROUP-ID
2384                             f['acodec'] = 'none'
2385                     if not f.get('ext'):
2386                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2387                     formats.append(f)
2388
2389                     # for DailyMotion
2390                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2391                     if progressive_uri:
2392                         http_f = f.copy()
2393                         del http_f['manifest_url']
2394                         http_f.update({
2395                             'format_id': f['format_id'].replace('hls-', 'http-'),
2396                             'protocol': 'http',
2397                             'url': progressive_uri,
2398                         })
2399                         formats.append(http_f)
2400
2401                 last_stream_inf = {}
2402         return formats, subtitles
2403
2404     def _extract_m3u8_vod_duration(
2405             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2406
2407         m3u8_vod = self._download_webpage(
2408             m3u8_vod_url, video_id,
2409             note='Downloading m3u8 VOD manifest' if note is None else note,
2410             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2411             fatal=False, data=data, headers=headers, query=query)
2412
2413         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2414
2415     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2416         if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2417             return None
2418
2419         return int(sum(
2420             float(line[len('#EXTINF:'):].split(',')[0])
2421             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2422
2423     @staticmethod
2424     def _xpath_ns(path, namespace=None):
2425         if not namespace:
2426             return path
2427         out = []
2428         for c in path.split('/'):
2429             if not c or c == '.':
2430                 out.append(c)
2431             else:
2432                 out.append('{%s}%s' % (namespace, c))
2433         return '/'.join(out)
2434
2435     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2436         res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2437         if res is False:
2438             assert not fatal
2439             return [], {}
2440
2441         smil, urlh = res
2442         smil_url = urlh.geturl()
2443
2444         namespace = self._parse_smil_namespace(smil)
2445
2446         fmts = self._parse_smil_formats(
2447             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2448         subs = self._parse_smil_subtitles(
2449             smil, namespace=namespace)
2450
2451         return fmts, subs
2452
2453     def _extract_smil_formats(self, *args, **kwargs):
2454         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2455         if subs:
2456             self._report_ignoring_subs('SMIL')
2457         return fmts
2458
2459     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2460         res = self._download_smil(smil_url, video_id, fatal=fatal)
2461         if res is False:
2462             return {}
2463
2464         smil, urlh = res
2465         smil_url = urlh.geturl()
2466
2467         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2468
2469     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2470         return self._download_xml_handle(
2471             smil_url, video_id, 'Downloading SMIL file',
2472             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2473
2474     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2475         namespace = self._parse_smil_namespace(smil)
2476
2477         formats = self._parse_smil_formats(
2478             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2479         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2480
2481         video_id = os.path.splitext(url_basename(smil_url))[0]
2482         title = None
2483         description = None
2484         upload_date = None
2485         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2486             name = meta.attrib.get('name')
2487             content = meta.attrib.get('content')
2488             if not name or not content:
2489                 continue
2490             if not title and name == 'title':
2491                 title = content
2492             elif not description and name in ('description', 'abstract'):
2493                 description = content
2494             elif not upload_date and name == 'date':
2495                 upload_date = unified_strdate(content)
2496
2497         thumbnails = [{
2498             'id': image.get('type'),
2499             'url': image.get('src'),
2500             'width': int_or_none(image.get('width')),
2501             'height': int_or_none(image.get('height')),
2502         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2503
2504         return {
2505             'id': video_id,
2506             'title': title or video_id,
2507             'description': description,
2508             'upload_date': upload_date,
2509             'thumbnails': thumbnails,
2510             'formats': formats,
2511             'subtitles': subtitles,
2512         }
2513
2514     def _parse_smil_namespace(self, smil):
2515         return self._search_regex(
2516             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2517
2518     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2519         base = smil_url
2520         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2521             b = meta.get('base') or meta.get('httpBase')
2522             if b:
2523                 base = b
2524                 break
2525
2526         formats = []
2527         rtmp_count = 0
2528         http_count = 0
2529         m3u8_count = 0
2530         imgs_count = 0
2531
2532         srcs = set()
2533         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2534         for medium in media:
2535             src = medium.get('src')
2536             if not src or src in srcs:
2537                 continue
2538             srcs.add(src)
2539
2540             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2541             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2542             width = int_or_none(medium.get('width'))
2543             height = int_or_none(medium.get('height'))
2544             proto = medium.get('proto')
2545             ext = medium.get('ext')
2546             src_ext = determine_ext(src)
2547             streamer = medium.get('streamer') or base
2548
2549             if proto == 'rtmp' or streamer.startswith('rtmp'):
2550                 rtmp_count += 1
2551                 formats.append({
2552                     'url': streamer,
2553                     'play_path': src,
2554                     'ext': 'flv',
2555                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2556                     'tbr': bitrate,
2557                     'filesize': filesize,
2558                     'width': width,
2559                     'height': height,
2560                 })
2561                 if transform_rtmp_url:
2562                     streamer, src = transform_rtmp_url(streamer, src)
2563                     formats[-1].update({
2564                         'url': streamer,
2565                         'play_path': src,
2566                     })
2567                 continue
2568
2569             src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
2570             src_url = src_url.strip()
2571
2572             if proto == 'm3u8' or src_ext == 'm3u8':
2573                 m3u8_formats = self._extract_m3u8_formats(
2574                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2575                 if len(m3u8_formats) == 1:
2576                     m3u8_count += 1
2577                     m3u8_formats[0].update({
2578                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2579                         'tbr': bitrate,
2580                         'width': width,
2581                         'height': height,
2582                     })
2583                 formats.extend(m3u8_formats)
2584             elif src_ext == 'f4m':
2585                 f4m_url = src_url
2586                 if not f4m_params:
2587                     f4m_params = {
2588                         'hdcore': '3.2.0',
2589                         'plugin': 'flowplayer-3.2.0.1',
2590                     }
2591                 f4m_url += '&' if '?' in f4m_url else '?'
2592                 f4m_url += urllib.parse.urlencode(f4m_params)
2593                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2594             elif src_ext == 'mpd':
2595                 formats.extend(self._extract_mpd_formats(
2596                     src_url, video_id, mpd_id='dash', fatal=False))
2597             elif re.search(r'\.ism/[Mm]anifest', src_url):
2598                 formats.extend(self._extract_ism_formats(
2599                     src_url, video_id, ism_id='mss', fatal=False))
2600             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2601                 http_count += 1
2602                 formats.append({
2603                     'url': src_url,
2604                     'ext': ext or src_ext or 'flv',
2605                     'format_id': 'http-%d' % (bitrate or http_count),
2606                     'tbr': bitrate,
2607                     'filesize': filesize,
2608                     'width': width,
2609                     'height': height,
2610                 })
2611
2612         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2613             src = medium.get('src')
2614             if not src or src in srcs:
2615                 continue
2616             srcs.add(src)
2617
2618             imgs_count += 1
2619             formats.append({
2620                 'format_id': 'imagestream-%d' % (imgs_count),
2621                 'url': src,
2622                 'ext': mimetype2ext(medium.get('type')),
2623                 'acodec': 'none',
2624                 'vcodec': 'none',
2625                 'width': int_or_none(medium.get('width')),
2626                 'height': int_or_none(medium.get('height')),
2627                 'format_note': 'SMIL storyboards',
2628             })
2629
2630         return formats
2631
2632     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2633         urls = []
2634         subtitles = {}
2635         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2636             src = textstream.get('src')
2637             if not src or src in urls:
2638                 continue
2639             urls.append(src)
2640             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2641             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2642             subtitles.setdefault(lang, []).append({
2643                 'url': src,
2644                 'ext': ext,
2645             })
2646         return subtitles
2647
2648     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2649         res = self._download_xml_handle(
2650             xspf_url, playlist_id, 'Downloading xpsf playlist',
2651             'Unable to download xspf manifest', fatal=fatal)
2652         if res is False:
2653             return []
2654
2655         xspf, urlh = res
2656         xspf_url = urlh.geturl()
2657
2658         return self._parse_xspf(
2659             xspf, playlist_id, xspf_url=xspf_url,
2660             xspf_base_url=base_url(xspf_url))
2661
2662     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2663         NS_MAP = {
2664             'xspf': 'http://xspf.org/ns/0/',
2665             's1': 'http://static.streamone.nl/player/ns/0',
2666         }
2667
2668         entries = []
2669         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2670             title = xpath_text(
2671                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2672             description = xpath_text(
2673                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2674             thumbnail = xpath_text(
2675                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2676             duration = float_or_none(
2677                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2678
2679             formats = []
2680             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2681                 format_url = urljoin(xspf_base_url, location.text)
2682                 if not format_url:
2683                     continue
2684                 formats.append({
2685                     'url': format_url,
2686                     'manifest_url': xspf_url,
2687                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2688                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2689                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2690                 })
2691             self._sort_formats(formats)
2692
2693             entries.append({
2694                 'id': playlist_id,
2695                 'title': title,
2696                 'description': description,
2697                 'thumbnail': thumbnail,
2698                 'duration': duration,
2699                 'formats': formats,
2700             })
2701         return entries
2702
2703     def _extract_mpd_formats(self, *args, **kwargs):
2704         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2705         if subs:
2706             self._report_ignoring_subs('DASH')
2707         return fmts
2708
2709     def _extract_mpd_formats_and_subtitles(
2710             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2711             fatal=True, data=None, headers={}, query={}):
2712         res = self._download_xml_handle(
2713             mpd_url, video_id,
2714             note='Downloading MPD manifest' if note is None else note,
2715             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2716             fatal=fatal, data=data, headers=headers, query=query)
2717         if res is False:
2718             return [], {}
2719         mpd_doc, urlh = res
2720         if mpd_doc is None:
2721             return [], {}
2722
2723         # We could have been redirected to a new url when we retrieved our mpd file.
2724         mpd_url = urlh.geturl()
2725         mpd_base_url = base_url(mpd_url)
2726
2727         return self._parse_mpd_formats_and_subtitles(
2728             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2729
2730     def _parse_mpd_formats(self, *args, **kwargs):
2731         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2732         if subs:
2733             self._report_ignoring_subs('DASH')
2734         return fmts
2735
2736     def _parse_mpd_formats_and_subtitles(
2737             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2738         """
2739         Parse formats from MPD manifest.
2740         References:
2741          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2742             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2743          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2744         """
2745         if not self.get_param('dynamic_mpd', True):
2746             if mpd_doc.get('type') == 'dynamic':
2747                 return [], {}
2748
2749         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2750
2751         def _add_ns(path):
2752             return self._xpath_ns(path, namespace)
2753
2754         def is_drm_protected(element):
2755             return element.find(_add_ns('ContentProtection')) is not None
2756
2757         def extract_multisegment_info(element, ms_parent_info):
2758             ms_info = ms_parent_info.copy()
2759
2760             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2761             # common attributes and elements.  We will only extract relevant
2762             # for us.
2763             def extract_common(source):
2764                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2765                 if segment_timeline is not None:
2766                     s_e = segment_timeline.findall(_add_ns('S'))
2767                     if s_e:
2768                         ms_info['total_number'] = 0
2769                         ms_info['s'] = []
2770                         for s in s_e:
2771                             r = int(s.get('r', 0))
2772                             ms_info['total_number'] += 1 + r
2773                             ms_info['s'].append({
2774                                 't': int(s.get('t', 0)),
2775                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2776                                 'd': int(s.attrib['d']),
2777                                 'r': r,
2778                             })
2779                 start_number = source.get('startNumber')
2780                 if start_number:
2781                     ms_info['start_number'] = int(start_number)
2782                 timescale = source.get('timescale')
2783                 if timescale:
2784                     ms_info['timescale'] = int(timescale)
2785                 segment_duration = source.get('duration')
2786                 if segment_duration:
2787                     ms_info['segment_duration'] = float(segment_duration)
2788
2789             def extract_Initialization(source):
2790                 initialization = source.find(_add_ns('Initialization'))
2791                 if initialization is not None:
2792                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2793
2794             segment_list = element.find(_add_ns('SegmentList'))
2795             if segment_list is not None:
2796                 extract_common(segment_list)
2797                 extract_Initialization(segment_list)
2798                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2799                 if segment_urls_e:
2800                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2801             else:
2802                 segment_template = element.find(_add_ns('SegmentTemplate'))
2803                 if segment_template is not None:
2804                     extract_common(segment_template)
2805                     media = segment_template.get('media')
2806                     if media:
2807                         ms_info['media'] = media
2808                     initialization = segment_template.get('initialization')
2809                     if initialization:
2810                         ms_info['initialization'] = initialization
2811                     else:
2812                         extract_Initialization(segment_template)
2813             return ms_info
2814
2815         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2816         formats, subtitles = [], {}
2817         stream_numbers = collections.defaultdict(int)
2818         for period in mpd_doc.findall(_add_ns('Period')):
2819             period_duration = parse_duration(period.get('duration')) or mpd_duration
2820             period_ms_info = extract_multisegment_info(period, {
2821                 'start_number': 1,
2822                 'timescale': 1,
2823             })
2824             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2825                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2826                 for representation in adaptation_set.findall(_add_ns('Representation')):
2827                     representation_attrib = adaptation_set.attrib.copy()
2828                     representation_attrib.update(representation.attrib)
2829                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2830                     mime_type = representation_attrib['mimeType']
2831                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2832
2833                     codec_str = representation_attrib.get('codecs', '')
2834                     # Some kind of binary subtitle found in some youtube livestreams
2835                     if mime_type == 'application/x-rawcc':
2836                         codecs = {'scodec': codec_str}
2837                     else:
2838                         codecs = parse_codecs(codec_str)
2839                     if content_type not in ('video', 'audio', 'text'):
2840                         if mime_type == 'image/jpeg':
2841                             content_type = mime_type
2842                         elif codecs.get('vcodec', 'none') != 'none':
2843                             content_type = 'video'
2844                         elif codecs.get('acodec', 'none') != 'none':
2845                             content_type = 'audio'
2846                         elif codecs.get('scodec', 'none') != 'none':
2847                             content_type = 'text'
2848                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2849                             content_type = 'text'
2850                         else:
2851                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2852                             continue
2853
2854                     base_url = ''
2855                     for element in (representation, adaptation_set, period, mpd_doc):
2856                         base_url_e = element.find(_add_ns('BaseURL'))
2857                         if try_call(lambda: base_url_e.text) is not None:
2858                             base_url = base_url_e.text + base_url
2859                             if re.match(r'^https?://', base_url):
2860                                 break
2861                     if mpd_base_url and base_url.startswith('/'):
2862                         base_url = urllib.parse.urljoin(mpd_base_url, base_url)
2863                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2864                         if not mpd_base_url.endswith('/'):
2865                             mpd_base_url += '/'
2866                         base_url = mpd_base_url + base_url
2867                     representation_id = representation_attrib.get('id')
2868                     lang = representation_attrib.get('lang')
2869                     url_el = representation.find(_add_ns('BaseURL'))
2870                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2871                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2872                     if representation_id is not None:
2873                         format_id = representation_id
2874                     else:
2875                         format_id = content_type
2876                     if mpd_id:
2877                         format_id = mpd_id + '-' + format_id
2878                     if content_type in ('video', 'audio'):
2879                         f = {
2880                             'format_id': format_id,
2881                             'manifest_url': mpd_url,
2882                             'ext': mimetype2ext(mime_type),
2883                             'width': int_or_none(representation_attrib.get('width')),
2884                             'height': int_or_none(representation_attrib.get('height')),
2885                             'tbr': float_or_none(bandwidth, 1000),
2886                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2887                             'fps': int_or_none(representation_attrib.get('frameRate')),
2888                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2889                             'format_note': 'DASH %s' % content_type,
2890                             'filesize': filesize,
2891                             'container': mimetype2ext(mime_type) + '_dash',
2892                             **codecs
2893                         }
2894                     elif content_type == 'text':
2895                         f = {
2896                             'ext': mimetype2ext(mime_type),
2897                             'manifest_url': mpd_url,
2898                             'filesize': filesize,
2899                         }
2900                     elif content_type == 'image/jpeg':
2901                         # See test case in VikiIE
2902                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2903                         f = {
2904                             'format_id': format_id,
2905                             'ext': 'mhtml',
2906                             'manifest_url': mpd_url,
2907                             'format_note': 'DASH storyboards (jpeg)',
2908                             'acodec': 'none',
2909                             'vcodec': 'none',
2910                         }
2911                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2912                         f['has_drm'] = True
2913                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2914
2915                     def prepare_template(template_name, identifiers):
2916                         tmpl = representation_ms_info[template_name]
2917                         # First of, % characters outside $...$ templates
2918                         # must be escaped by doubling for proper processing
2919                         # by % operator string formatting used further (see
2920                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2921                         t = ''
2922                         in_template = False
2923                         for c in tmpl:
2924                             t += c
2925                             if c == '$':
2926                                 in_template = not in_template
2927                             elif c == '%' and not in_template:
2928                                 t += c
2929                         # Next, $...$ templates are translated to their
2930                         # %(...) counterparts to be used with % operator
2931                         if representation_id is not None:
2932                             t = t.replace('$RepresentationID$', representation_id)
2933                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2934                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2935                         t.replace('$$', '$')
2936                         return t
2937
2938                     # @initialization is a regular template like @media one
2939                     # so it should be handled just the same way (see
2940                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2941                     if 'initialization' in representation_ms_info:
2942                         initialization_template = prepare_template(
2943                             'initialization',
2944                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2945                             # $Time$ shall not be included for @initialization thus
2946                             # only $Bandwidth$ remains
2947                             ('Bandwidth', ))
2948                         representation_ms_info['initialization_url'] = initialization_template % {
2949                             'Bandwidth': bandwidth,
2950                         }
2951
2952                     def location_key(location):
2953                         return 'url' if re.match(r'^https?://', location) else 'path'
2954
2955                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2956
2957                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2958                         media_location_key = location_key(media_template)
2959
2960                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2961                         # can't be used at the same time
2962                         if '%(Number' in media_template and 's' not in representation_ms_info:
2963                             segment_duration = None
2964                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2965                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2966                                 representation_ms_info['total_number'] = int(math.ceil(
2967                                     float_or_none(period_duration, segment_duration, default=0)))
2968                             representation_ms_info['fragments'] = [{
2969                                 media_location_key: media_template % {
2970                                     'Number': segment_number,
2971                                     'Bandwidth': bandwidth,
2972                                 },
2973                                 'duration': segment_duration,
2974                             } for segment_number in range(
2975                                 representation_ms_info['start_number'],
2976                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2977                         else:
2978                             # $Number*$ or $Time$ in media template with S list available
2979                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2980                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2981                             representation_ms_info['fragments'] = []
2982                             segment_time = 0
2983                             segment_d = None
2984                             segment_number = representation_ms_info['start_number']
2985
2986                             def add_segment_url():
2987                                 segment_url = media_template % {
2988                                     'Time': segment_time,
2989                                     'Bandwidth': bandwidth,
2990                                     'Number': segment_number,
2991                                 }
2992                                 representation_ms_info['fragments'].append({
2993                                     media_location_key: segment_url,
2994                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2995                                 })
2996
2997                             for num, s in enumerate(representation_ms_info['s']):
2998                                 segment_time = s.get('t') or segment_time
2999                                 segment_d = s['d']
3000                                 add_segment_url()
3001                                 segment_number += 1
3002                                 for r in range(s.get('r', 0)):
3003                                     segment_time += segment_d
3004                                     add_segment_url()
3005                                     segment_number += 1
3006                                 segment_time += segment_d
3007                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
3008                         # No media template,
3009                         # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
3010                         # or any YouTube dashsegments video
3011                         fragments = []
3012                         segment_index = 0
3013                         timescale = representation_ms_info['timescale']
3014                         for s in representation_ms_info['s']:
3015                             duration = float_or_none(s['d'], timescale)
3016                             for r in range(s.get('r', 0) + 1):
3017                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
3018                                 fragments.append({
3019                                     location_key(segment_uri): segment_uri,
3020                                     'duration': duration,
3021                                 })
3022                                 segment_index += 1
3023                         representation_ms_info['fragments'] = fragments
3024                     elif 'segment_urls' in representation_ms_info:
3025                         # Segment URLs with no SegmentTimeline
3026                         # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
3027                         # https://github.com/ytdl-org/youtube-dl/pull/14844
3028                         fragments = []
3029                         segment_duration = float_or_none(
3030                             representation_ms_info['segment_duration'],
3031                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
3032                         for segment_url in representation_ms_info['segment_urls']:
3033                             fragment = {
3034                                 location_key(segment_url): segment_url,
3035                             }
3036                             if segment_duration:
3037                                 fragment['duration'] = segment_duration
3038                             fragments.append(fragment)
3039                         representation_ms_info['fragments'] = fragments
3040                     # If there is a fragments key available then we correctly recognized fragmented media.
3041                     # Otherwise we will assume unfragmented media with direct access. Technically, such
3042                     # assumption is not necessarily correct since we may simply have no support for
3043                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
3044                     if 'fragments' in representation_ms_info:
3045                         f.update({
3046                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
3047                             'url': mpd_url or base_url,
3048                             'fragment_base_url': base_url,
3049                             'fragments': [],
3050                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
3051                         })
3052                         if 'initialization_url' in representation_ms_info:
3053                             initialization_url = representation_ms_info['initialization_url']
3054                             if not f.get('url'):
3055                                 f['url'] = initialization_url
3056                             f['fragments'].append({location_key(initialization_url): initialization_url})
3057                         f['fragments'].extend(representation_ms_info['fragments'])
3058                         if not period_duration:
3059                             period_duration = try_get(
3060                                 representation_ms_info,
3061                                 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
3062                     else:
3063                         # Assuming direct URL to unfragmented media.
3064                         f['url'] = base_url
3065                     if content_type in ('video', 'audio', 'image/jpeg'):
3066                         f['manifest_stream_number'] = stream_numbers[f['url']]
3067                         stream_numbers[f['url']] += 1
3068                         formats.append(f)
3069                     elif content_type == 'text':
3070                         subtitles.setdefault(lang or 'und', []).append(f)
3071
3072         return formats, subtitles
3073
3074     def _extract_ism_formats(self, *args, **kwargs):
3075         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
3076         if subs:
3077             self._report_ignoring_subs('ISM')
3078         return fmts
3079
3080     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
3081         res = self._download_xml_handle(
3082             ism_url, video_id,
3083             note='Downloading ISM manifest' if note is None else note,
3084             errnote='Failed to download ISM manifest' if errnote is None else errnote,
3085             fatal=fatal, data=data, headers=headers, query=query)
3086         if res is False:
3087             return [], {}
3088         ism_doc, urlh = res
3089         if ism_doc is None:
3090             return [], {}
3091
3092         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
3093
3094     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
3095         """
3096         Parse formats from ISM manifest.
3097         References:
3098          1. [MS-SSTR]: Smooth Streaming Protocol,
3099             https://msdn.microsoft.com/en-us/library/ff469518.aspx
3100         """
3101         if ism_doc.get('IsLive') == 'TRUE':
3102             return [], {}
3103
3104         duration = int(ism_doc.attrib['Duration'])
3105         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
3106
3107         formats = []
3108         subtitles = {}
3109         for stream in ism_doc.findall('StreamIndex'):
3110             stream_type = stream.get('Type')
3111             if stream_type not in ('video', 'audio', 'text'):
3112                 continue
3113             url_pattern = stream.attrib['Url']
3114             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
3115             stream_name = stream.get('Name')
3116             stream_language = stream.get('Language', 'und')
3117             for track in stream.findall('QualityLevel'):
3118                 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
3119                 # TODO: add support for WVC1 and WMAP
3120                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
3121                     self.report_warning('%s is not a supported codec' % fourcc)
3122                     continue
3123                 tbr = int(track.attrib['Bitrate']) // 1000
3124                 # [1] does not mention Width and Height attributes. However,
3125                 # they're often present while MaxWidth and MaxHeight are
3126                 # missing, so should be used as fallbacks
3127                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3128                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3129                 sampling_rate = int_or_none(track.get('SamplingRate'))
3130
3131                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3132                 track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)
3133
3134                 fragments = []
3135                 fragment_ctx = {
3136                     'time': 0,
3137                 }
3138                 stream_fragments = stream.findall('c')
3139                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3140                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3141                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3142                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3143                     if not fragment_ctx['duration']:
3144                         try:
3145                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
3146                         except IndexError:
3147                             next_fragment_time = duration
3148                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3149                     for _ in range(fragment_repeat):
3150                         fragments.append({
3151                             'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
3152                             'duration': fragment_ctx['duration'] / stream_timescale,
3153                         })
3154                         fragment_ctx['time'] += fragment_ctx['duration']
3155
3156                 if stream_type == 'text':
3157                     subtitles.setdefault(stream_language, []).append({
3158                         'ext': 'ismt',
3159                         'protocol': 'ism',
3160                         'url': ism_url,
3161                         'manifest_url': ism_url,
3162                         'fragments': fragments,
3163                         '_download_params': {
3164                             'stream_type': stream_type,
3165                             'duration': duration,
3166                             'timescale': stream_timescale,
3167                             'fourcc': fourcc,
3168                             'language': stream_language,
3169                             'codec_private_data': track.get('CodecPrivateData'),
3170                         }
3171                     })
3172                 elif stream_type in ('video', 'audio'):
3173                     formats.append({
3174                         'format_id': join_nonempty(ism_id, stream_name, tbr),
3175                         'url': ism_url,
3176                         'manifest_url': ism_url,
3177                         'ext': 'ismv' if stream_type == 'video' else 'isma',
3178                         'width': width,
3179                         'height': height,
3180                         'tbr': tbr,
3181                         'asr': sampling_rate,
3182                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
3183                         'acodec': 'none' if stream_type == 'video' else fourcc,
3184                         'protocol': 'ism',
3185                         'fragments': fragments,
3186                         'has_drm': ism_doc.find('Protection') is not None,
3187                         '_download_params': {
3188                             'stream_type': stream_type,
3189                             'duration': duration,
3190                             'timescale': stream_timescale,
3191                             'width': width or 0,
3192                             'height': height or 0,
3193                             'fourcc': fourcc,
3194                             'language': stream_language,
3195                             'codec_private_data': track.get('CodecPrivateData'),
3196                             'sampling_rate': sampling_rate,
3197                             'channels': int_or_none(track.get('Channels', 2)),
3198                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3199                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3200                         },
3201                     })
3202         return formats, subtitles
3203
3204     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
3205         def absolute_url(item_url):
3206             return urljoin(base_url, item_url)
3207
3208         def parse_content_type(content_type):
3209             if not content_type:
3210                 return {}
3211             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3212             if ctr:
3213                 mimetype, codecs = ctr.groups()
3214                 f = parse_codecs(codecs)
3215                 f['ext'] = mimetype2ext(mimetype)
3216                 return f
3217             return {}
3218
3219         def _media_formats(src, cur_media_type, type_info=None):
3220             type_info = type_info or {}
3221             full_url = absolute_url(src)
3222             ext = type_info.get('ext') or determine_ext(full_url)
3223             if ext == 'm3u8':
3224                 is_plain_url = False
3225                 formats = self._extract_m3u8_formats(
3226                     full_url, video_id, ext='mp4',
3227                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3228                     preference=preference, quality=quality, fatal=False)
3229             elif ext == 'mpd':
3230                 is_plain_url = False
3231                 formats = self._extract_mpd_formats(
3232                     full_url, video_id, mpd_id=mpd_id, fatal=False)
3233             else:
3234                 is_plain_url = True
3235                 formats = [{
3236                     'url': full_url,
3237                     'vcodec': 'none' if cur_media_type == 'audio' else None,
3238                     'ext': ext,
3239                 }]
3240             return is_plain_url, formats
3241
3242         entries = []
3243         # amp-video and amp-audio are very similar to their HTML5 counterparts
3244         # so we will include them right here (see
3245         # https://www.ampproject.org/docs/reference/components/amp-video)
3246         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3247         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3248         media_tags = [(media_tag, media_tag_name, media_type, '')
3249                       for media_tag, media_tag_name, media_type
3250                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3251         media_tags.extend(re.findall(
3252             # We only allow video|audio followed by a whitespace or '>'.
3253             # Allowing more characters may end up in significant slow down (see
3254             # https://github.com/ytdl-org/youtube-dl/issues/11979,
3255             # e.g. http://www.porntrex.com/maps/videositemap.xml).
3256             r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3257         for media_tag, _, media_type, media_content in media_tags:
3258             media_info = {
3259                 'formats': [],
3260                 'subtitles': {},
3261             }
3262             media_attributes = extract_attributes(media_tag)
3263             src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source')))
3264             if src:
3265                 f = parse_content_type(media_attributes.get('type'))
3266                 _, formats = _media_formats(src, media_type, f)
3267                 media_info['formats'].extend(formats)
3268             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3269             if media_content:
3270                 for source_tag in re.findall(r'<source[^>]+>', media_content):
3271                     s_attr = extract_attributes(source_tag)
3272                     # data-video-src and data-src are non standard but seen
3273                     # several times in the wild
3274                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source')))
3275                     if not src:
3276                         continue
3277                     f = parse_content_type(s_attr.get('type'))
3278                     is_plain_url, formats = _media_formats(src, media_type, f)
3279                     if is_plain_url:
3280                         # width, height, res, label and title attributes are
3281                         # all not standard but seen several times in the wild
3282                         labels = [
3283                             s_attr.get(lbl)
3284                             for lbl in ('label', 'title')
3285                             if str_or_none(s_attr.get(lbl))
3286                         ]
3287                         width = int_or_none(s_attr.get('width'))
3288                         height = (int_or_none(s_attr.get('height'))
3289                                   or int_or_none(s_attr.get('res')))
3290                         if not width or not height:
3291                             for lbl in labels:
3292                                 resolution = parse_resolution(lbl)
3293                                 if not resolution:
3294                                     continue
3295                                 width = width or resolution.get('width')
3296                                 height = height or resolution.get('height')
3297                         for lbl in labels:
3298                             tbr = parse_bitrate(lbl)
3299                             if tbr:
3300                                 break
3301                         else:
3302                             tbr = None
3303                         f.update({
3304                             'width': width,
3305                             'height': height,
3306                             'tbr': tbr,
3307                             'format_id': s_attr.get('label') or s_attr.get('title'),
3308                         })
3309                         f.update(formats[0])
3310                         media_info['formats'].append(f)
3311                     else:
3312                         media_info['formats'].extend(formats)
3313                 for track_tag in re.findall(r'<track[^>]+>', media_content):
3314                     track_attributes = extract_attributes(track_tag)
3315                     kind = track_attributes.get('kind')
3316                     if not kind or kind in ('subtitles', 'captions'):
3317                         src = strip_or_none(track_attributes.get('src'))
3318                         if not src:
3319                             continue
3320                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3321                         media_info['subtitles'].setdefault(lang, []).append({
3322                             'url': absolute_url(src),
3323                         })
3324             for f in media_info['formats']:
3325                 f.setdefault('http_headers', {})['Referer'] = base_url
3326             if media_info['formats'] or media_info['subtitles']:
3327                 entries.append(media_info)
3328         return entries
3329
3330     def _extract_akamai_formats(self, *args, **kwargs):
3331         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3332         if subs:
3333             self._report_ignoring_subs('akamai')
3334         return fmts
3335
3336     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3337         signed = 'hdnea=' in manifest_url
3338         if not signed:
3339             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3340             manifest_url = re.sub(
3341                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3342                 '', manifest_url).strip('?')
3343
3344         formats = []
3345         subtitles = {}
3346
3347         hdcore_sign = 'hdcore=3.7.0'
3348         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3349         hds_host = hosts.get('hds')
3350         if hds_host:
3351             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3352         if 'hdcore=' not in f4m_url:
3353             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3354         f4m_formats = self._extract_f4m_formats(
3355             f4m_url, video_id, f4m_id='hds', fatal=False)
3356         for entry in f4m_formats:
3357             entry.update({'extra_param_to_segment_url': hdcore_sign})
3358         formats.extend(f4m_formats)
3359
3360         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3361         hls_host = hosts.get('hls')
3362         if hls_host:
3363             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3364         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3365             m3u8_url, video_id, 'mp4', 'm3u8_native',
3366             m3u8_id='hls', fatal=False)
3367         formats.extend(m3u8_formats)
3368         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3369
3370         http_host = hosts.get('http')
3371         if http_host and m3u8_formats and not signed:
3372             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3373             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3374             qualities_length = len(qualities)
3375             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3376                 i = 0
3377                 for f in m3u8_formats:
3378                     if f['vcodec'] != 'none':
3379                         for protocol in ('http', 'https'):
3380                             http_f = f.copy()
3381                             del http_f['manifest_url']
3382                             http_url = re.sub(
3383                                 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3384                             http_f.update({
3385                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3386                                 'url': http_url,
3387                                 'protocol': protocol,
3388                             })
3389                             formats.append(http_f)
3390                         i += 1
3391
3392         return formats, subtitles
3393
3394     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3395         query = urllib.parse.urlparse(url).query
3396         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3397         mobj = re.search(
3398             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3399         url_base = mobj.group('url')
3400         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3401         formats = []
3402
3403         def manifest_url(manifest):
3404             m_url = f'{http_base_url}/{manifest}'
3405             if query:
3406                 m_url += '?%s' % query
3407             return m_url
3408
3409         if 'm3u8' not in skip_protocols:
3410             formats.extend(self._extract_m3u8_formats(
3411                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3412                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3413         if 'f4m' not in skip_protocols:
3414             formats.extend(self._extract_f4m_formats(
3415                 manifest_url('manifest.f4m'),
3416                 video_id, f4m_id='hds', fatal=False))
3417         if 'dash' not in skip_protocols:
3418             formats.extend(self._extract_mpd_formats(
3419                 manifest_url('manifest.mpd'),
3420                 video_id, mpd_id='dash', fatal=False))
3421         if re.search(r'(?:/smil:|\.smil)', url_base):
3422             if 'smil' not in skip_protocols:
3423                 rtmp_formats = self._extract_smil_formats(
3424                     manifest_url('jwplayer.smil'),
3425                     video_id, fatal=False)
3426                 for rtmp_format in rtmp_formats:
3427                     rtsp_format = rtmp_format.copy()
3428                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3429                     del rtsp_format['play_path']
3430                     del rtsp_format['ext']
3431                     rtsp_format.update({
3432                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3433                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3434                         'protocol': 'rtsp',
3435                     })
3436                     formats.extend([rtmp_format, rtsp_format])
3437         else:
3438             for protocol in ('rtmp', 'rtsp'):
3439                 if protocol not in skip_protocols:
3440                     formats.append({
3441                         'url': f'{protocol}:{url_base}',
3442                         'format_id': protocol,
3443                         'protocol': protocol,
3444                     })
3445         return formats
3446
3447     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3448         mobj = re.search(
3449             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3450             webpage)
3451         if mobj:
3452             try:
3453                 jwplayer_data = self._parse_json(mobj.group('options'),
3454                                                  video_id=video_id,
3455                                                  transform_source=transform_source)
3456             except ExtractorError:
3457                 pass
3458             else:
3459                 if isinstance(jwplayer_data, dict):
3460                     return jwplayer_data
3461
3462     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3463         jwplayer_data = self._find_jwplayer_data(
3464             webpage, video_id, transform_source=js_to_json)
3465         return self._parse_jwplayer_data(
3466             jwplayer_data, video_id, *args, **kwargs)
3467
3468     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3469                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3470         # JWPlayer backward compatibility: flattened playlists
3471         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3472         if 'playlist' not in jwplayer_data:
3473             jwplayer_data = {'playlist': [jwplayer_data]}
3474
3475         entries = []
3476
3477         # JWPlayer backward compatibility: single playlist item
3478         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3479         if not isinstance(jwplayer_data['playlist'], list):
3480             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3481
3482         for video_data in jwplayer_data['playlist']:
3483             # JWPlayer backward compatibility: flattened sources
3484             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3485             if 'sources' not in video_data:
3486                 video_data['sources'] = [video_data]
3487
3488             this_video_id = video_id or video_data['mediaid']
3489
3490             formats = self._parse_jwplayer_formats(
3491                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3492                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3493
3494             subtitles = {}
3495             tracks = video_data.get('tracks')
3496             if tracks and isinstance(tracks, list):
3497                 for track in tracks:
3498                     if not isinstance(track, dict):
3499                         continue
3500                     track_kind = track.get('kind')
3501                     if not track_kind or not isinstance(track_kind, str):
3502                         continue
3503                     if track_kind.lower() not in ('captions', 'subtitles'):
3504                         continue
3505                     track_url = urljoin(base_url, track.get('file'))
3506                     if not track_url:
3507                         continue
3508                     subtitles.setdefault(track.get('label') or 'en', []).append({
3509                         'url': self._proto_relative_url(track_url)
3510                     })
3511
3512             entry = {
3513                 'id': this_video_id,
3514                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3515                 'description': clean_html(video_data.get('description')),
3516                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3517                 'timestamp': int_or_none(video_data.get('pubdate')),
3518                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3519                 'subtitles': subtitles,
3520             }
3521             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3522             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3523                 entry.update({
3524                     '_type': 'url_transparent',
3525                     'url': formats[0]['url'],
3526                 })
3527             else:
3528                 self._sort_formats(formats)
3529                 entry['formats'] = formats
3530             entries.append(entry)
3531         if len(entries) == 1:
3532             return entries[0]
3533         else:
3534             return self.playlist_result(entries)
3535
3536     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3537                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3538         urls = []
3539         formats = []
3540         for source in jwplayer_sources_data:
3541             if not isinstance(source, dict):
3542                 continue
3543             source_url = urljoin(
3544                 base_url, self._proto_relative_url(source.get('file')))
3545             if not source_url or source_url in urls:
3546                 continue
3547             urls.append(source_url)
3548             source_type = source.get('type') or ''
3549             ext = mimetype2ext(source_type) or determine_ext(source_url)
3550             if source_type == 'hls' or ext == 'm3u8':
3551                 formats.extend(self._extract_m3u8_formats(
3552                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3553                     m3u8_id=m3u8_id, fatal=False))
3554             elif source_type == 'dash' or ext == 'mpd':
3555                 formats.extend(self._extract_mpd_formats(
3556                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3557             elif ext == 'smil':
3558                 formats.extend(self._extract_smil_formats(
3559                     source_url, video_id, fatal=False))
3560             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3561             elif source_type.startswith('audio') or ext in (
3562                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3563                 formats.append({
3564                     'url': source_url,
3565                     'vcodec': 'none',
3566                     'ext': ext,
3567                 })
3568             else:
3569                 height = int_or_none(source.get('height'))
3570                 if height is None:
3571                     # Often no height is provided but there is a label in
3572                     # format like "1080p", "720p SD", or 1080.
3573                     height = int_or_none(self._search_regex(
3574                         r'^(\d{3,4})[pP]?(?:\b|$)', str(source.get('label') or ''),
3575                         'height', default=None))
3576                 a_format = {
3577                     'url': source_url,
3578                     'width': int_or_none(source.get('width')),
3579                     'height': height,
3580                     'tbr': int_or_none(source.get('bitrate')),
3581                     'ext': ext,
3582                 }
3583                 if source_url.startswith('rtmp'):
3584                     a_format['ext'] = 'flv'
3585                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3586                     # of jwplayer.flash.swf
3587                     rtmp_url_parts = re.split(
3588                         r'((?:mp4|mp3|flv):)', source_url, 1)
3589                     if len(rtmp_url_parts) == 3:
3590                         rtmp_url, prefix, play_path = rtmp_url_parts
3591                         a_format.update({
3592                             'url': rtmp_url,
3593                             'play_path': prefix + play_path,
3594                         })
3595                     if rtmp_params:
3596                         a_format.update(rtmp_params)
3597                 formats.append(a_format)
3598         return formats
3599
3600     def _live_title(self, name):
3601         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3602         return name
3603
3604     def _int(self, v, name, fatal=False, **kwargs):
3605         res = int_or_none(v, **kwargs)
3606         if res is None:
3607             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3608             if fatal:
3609                 raise ExtractorError(msg)
3610             else:
3611                 self.report_warning(msg)
3612         return res
3613
3614     def _float(self, v, name, fatal=False, **kwargs):
3615         res = float_or_none(v, **kwargs)
3616         if res is None:
3617             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3618             if fatal:
3619                 raise ExtractorError(msg)
3620             else:
3621                 self.report_warning(msg)
3622         return res
3623
3624     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3625                     path='/', secure=False, discard=False, rest={}, **kwargs):
3626         cookie = http.cookiejar.Cookie(
3627             0, name, value, port, port is not None, domain, True,
3628             domain.startswith('.'), path, True, secure, expire_time,
3629             discard, None, None, rest)
3630         self.cookiejar.set_cookie(cookie)
3631
3632     def _get_cookies(self, url):
3633         """ Return a http.cookies.SimpleCookie with the cookies for the url """
3634         return http.cookies.SimpleCookie(self._downloader._calc_cookies(url))
3635
3636     def _apply_first_set_cookie_header(self, url_handle, cookie):
3637         """
3638         Apply first Set-Cookie header instead of the last. Experimental.
3639
3640         Some sites (e.g. [1-3]) may serve two cookies under the same name
3641         in Set-Cookie header and expect the first (old) one to be set rather
3642         than second (new). However, as of RFC6265 the newer one cookie
3643         should be set into cookie store what actually happens.
3644         We will workaround this issue by resetting the cookie to
3645         the first one manually.
3646         1. https://new.vk.com/
3647         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3648         3. https://learning.oreilly.com/
3649         """
3650         for header, cookies in url_handle.headers.items():
3651             if header.lower() != 'set-cookie':
3652                 continue
3653             cookies = cookies.encode('iso-8859-1').decode('utf-8')
3654             cookie_value = re.search(
3655                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3656             if cookie_value:
3657                 value, domain = cookie_value.groups()
3658                 self._set_cookie(domain, cookie, value)
3659                 break
3660
3661     @classmethod
3662     def get_testcases(cls, include_onlymatching=False):
3663         t = getattr(cls, '_TEST', None)
3664         if t:
3665             assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
3666             tests = [t]
3667         else:
3668             tests = getattr(cls, '_TESTS', [])
3669         for t in tests:
3670             if not include_onlymatching and t.get('only_matching', False):
3671                 continue
3672             t['name'] = cls.ie_key()
3673             yield t
3674
3675     @classmethod
3676     def get_webpage_testcases(cls):
3677         tests = getattr(cls, '_WEBPAGE_TESTS', [])
3678         for t in tests:
3679             t['name'] = cls.ie_key()
3680         return tests
3681
3682     @classproperty
3683     def age_limit(cls):
3684         """Get age limit from the testcases"""
3685         return max(traverse_obj(
3686             (*cls.get_testcases(include_onlymatching=False), *cls.get_webpage_testcases()),
3687             (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3688
3689     @classmethod
3690     def is_suitable(cls, age_limit):
3691         """Test whether the extractor is generally suitable for the given age limit"""
3692         return not age_restricted(cls.age_limit, age_limit)
3693
3694     @classmethod
3695     def description(cls, *, markdown=True, search_examples=None):
3696         """Description of the extractor"""
3697         desc = ''
3698         if cls._NETRC_MACHINE:
3699             if markdown:
3700                 desc += f' [<abbr title="netrc machine"><em>{cls._NETRC_MACHINE}</em></abbr>]'
3701             else:
3702                 desc += f' [{cls._NETRC_MACHINE}]'
3703         if cls.IE_DESC is False:
3704             desc += ' [HIDDEN]'
3705         elif cls.IE_DESC:
3706             desc += f' {cls.IE_DESC}'
3707         if cls.SEARCH_KEY:
3708             desc += f'; "{cls.SEARCH_KEY}:" prefix'
3709             if search_examples:
3710                 _COUNTS = ('', '5', '10', 'all')
3711                 desc += f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
3712         if not cls.working():
3713             desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3714
3715         name = f' - **{cls.IE_NAME}**' if markdown else cls.IE_NAME
3716         return f'{name}:{desc}' if desc else name
3717
3718     def extract_subtitles(self, *args, **kwargs):
3719         if (self.get_param('writesubtitles', False)
3720                 or self.get_param('listsubtitles')):
3721             return self._get_subtitles(*args, **kwargs)
3722         return {}
3723
3724     def _get_subtitles(self, *args, **kwargs):
3725         raise NotImplementedError('This method must be implemented by subclasses')
3726
    def extract_comments(self, *args, **kwargs):
        """
        Prepare deferred comment extraction.

        Returns None unless the 'getcomments' param is set. Otherwise
        _get_comments(*args, **kwargs) is called right away and a zero-argument
        callable is returned; calling it drains that iterator and returns a
        dict with the keys 'comments' and 'comment_count'.
        """
        if not self.get_param('getcomments'):
            return None
        generator = self._get_comments(*args, **kwargs)

        def extractor():
            comments = []
            # Stays True unless the iterator is exhausted normally
            interrupted = True
            try:
                while True:
                    comments.append(next(generator))
            except StopIteration:
                interrupted = False
            except KeyboardInterrupt:
                # Keep whatever was collected before the interruption
                self.to_screen('Interrupted by user')
            except Exception as e:
                # Any other error is re-raised unless 'ignoreerrors' is exactly
                # True, in which case it is reported and the partial result kept
                if self.get_param('ignoreerrors') is not True:
                    raise
                self._downloader.report_error(e)
            comment_count = len(comments)
            self.to_screen(f'Extracted {comment_count} comments')
            return {
                'comments': comments,
                # The count is withheld when extraction did not run to completion
                'comment_count': None if interrupted else comment_count
            }
        return extractor
3753
3754     def _get_comments(self, *args, **kwargs):
3755         raise NotImplementedError('This method must be implemented by subclasses')
3756
3757     @staticmethod
3758     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3759         """ Merge subtitle items for one language. Items with duplicated URLs/data
3760         will be dropped. """
3761         list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3762         ret = list(subtitle_list1)
3763         ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3764         return ret
3765
3766     @classmethod
3767     def _merge_subtitles(cls, *dicts, target=None):
3768         """ Merge subtitle dictionaries, language by language. """
3769         if target is None:
3770             target = {}
3771         for d in dicts:
3772             for lang, subs in d.items():
3773                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3774         return target
3775
3776     def extract_automatic_captions(self, *args, **kwargs):
3777         if (self.get_param('writeautomaticsub', False)
3778                 or self.get_param('listsubtitles')):
3779             return self._get_automatic_captions(*args, **kwargs)
3780         return {}
3781
3782     def _get_automatic_captions(self, *args, **kwargs):
3783         raise NotImplementedError('This method must be implemented by subclasses')
3784
3785     @functools.cached_property
3786     def _cookies_passed(self):
3787         """Whether cookies have been passed to YoutubeDL"""
3788         return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3789
3790     def mark_watched(self, *args, **kwargs):
3791         if not self.get_param('mark_watched', False):
3792             return
3793         if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
3794             self._mark_watched(*args, **kwargs)
3795
3796     def _mark_watched(self, *args, **kwargs):
3797         raise NotImplementedError('This method must be implemented by subclasses')
3798
3799     def geo_verification_headers(self):
3800         headers = {}
3801         geo_verification_proxy = self.get_param('geo_verification_proxy')
3802         if geo_verification_proxy:
3803             headers['Ytdl-request-proxy'] = geo_verification_proxy
3804         return headers
3805
3806     @staticmethod
3807     def _generic_id(url):
3808         return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3809
3810     @staticmethod
3811     def _generic_title(url):
3812         return urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
3813
3814     @staticmethod
3815     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3816         all_known = all(map(
3817             lambda x: x is not None,
3818             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3819         return (
3820             'private' if is_private
3821             else 'premium_only' if needs_premium
3822             else 'subscriber_only' if needs_subscription
3823             else 'needs_auth' if needs_auth
3824             else 'unlisted' if is_unlisted
3825             else 'public' if all_known
3826             else None)
3827
3828     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3829         '''
3830         @returns            A list of values for the extractor argument given by "key"
3831                             or "default" if no such key is present
3832         @param default      The default value to return when the key is not present (default: [])
3833         @param casesense    When false, the values are converted to lower case
3834         '''
3835         val = traverse_obj(
3836             self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key))
3837         if val is None:
3838             return [] if default is NO_DEFAULT else default
3839         return list(val) if casesense else [x.lower() for x in val]
3840
3841     def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3842         if not playlist_id or not video_id:
3843             return not video_id
3844
3845         no_playlist = (smuggled_data or {}).get('force_noplaylist')
3846         if no_playlist is not None:
3847             return not no_playlist
3848
3849         video_id = '' if video_id is True else f' {video_id}'
3850         playlist_id = '' if playlist_id is True else f' {playlist_id}'
3851         if self.get_param('noplaylist'):
3852             self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3853             return False
3854         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3855         return True
3856
3857     def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
3858         RetryManager.report_retry(err, _count or int(fatal), _retries, info=self.to_screen, warn=self.report_warning,
3859                                   sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor'))
3860
3861     def RetryManager(self, **kwargs):
3862         return RetryManager(self.get_param('extractor_retries', 3), self._error_or_warning, **kwargs)
3863
3864     @classmethod
3865     def extract_from_webpage(cls, ydl, url, webpage):
3866         ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType)
3867               else ydl.get_info_extractor(cls.ie_key()))
3868         for info in ie._extract_from_webpage(url, webpage) or []:
3869             # url = None since we do not want to set (webpage/original)_url
3870             ydl.add_default_extra_info(info, ie, None)
3871             yield info
3872
3873     @classmethod
3874     def _extract_from_webpage(cls, url, webpage):
3875         for embed_url in orderedSet(
3876                 cls._extract_embed_urls(url, webpage) or [], lazy=True):
3877             yield cls.url_result(embed_url, cls)
3878
3879     @classmethod
3880     def _extract_embed_urls(cls, url, webpage):
3881         """@returns all the embed urls on the webpage"""
3882         if '_EMBED_URL_RE' not in cls.__dict__:
3883             assert isinstance(cls._EMBED_REGEX, (list, tuple))
3884             for idx, regex in enumerate(cls._EMBED_REGEX):
3885                 assert regex.count('(?P<url>') == 1, \
3886                     f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
3887             cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))
3888
3889         for regex in cls._EMBED_URL_RE:
3890             for mobj in regex.finditer(webpage):
3891                 embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
3892                 if cls._VALID_URL is False or cls.suitable(embed_url):
3893                     yield embed_url
3894
3895     class StopExtraction(Exception):
3896         pass
3897
3898     @classmethod
3899     def _extract_url(cls, webpage):  # TODO: Remove
3900         """Only for compatibility with some older extractors"""
3901         return next(iter(cls._extract_embed_urls(None, webpage) or []), None)
3902
3903
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
    """

    # Upper bound on the number of results; infinite unless a subclass lowers it
    _MAX_RESULTS = float('inf')

    @classproperty
    def _VALID_URL(cls):
        """Regex for "<_SEARCH_KEY><prefix>:<query>" with an empty, numeric or "all" prefix"""
        pattern = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return pattern % cls._SEARCH_KEY

    def _real_extract(self, query):
        """Turn the prefix into a result count (empty: 1, "all": _MAX_RESULTS) and fetch that many results"""
        prefix, query = self._match_valid_url(query).group('prefix', 'query')
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        if prefix == '':
            return self._get_n_results(query, 1)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError(f'invalid download number {n} for query "{query}"')
        if n > self._MAX_RESULTS:
            # Warn and clamp the request to what the site can deliver
            self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        results = self._search_results(query)
        # islice needs None, not infinity, to mean "no upper bound"
        limit = None if n == float('inf') else n
        return self.playlist_result(itertools.islice(results, 0, limit), query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @classproperty
    def SEARCH_KEY(cls):
        """Public accessor for _SEARCH_KEY"""
        return cls._SEARCH_KEY
3946
3947
class UnsupportedURLIE(InfoExtractor):
    """Extractor whose _VALID_URL matches any URL and whose extraction always raises UnsupportedError"""

    _VALID_URL = '.*'
    _ENABLED = False
    IE_DESC = False

    def _real_extract(self, url):
        """Raise UnsupportedError for `url`"""
        raise UnsupportedError(url)