yt_dlp/extractor/common.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import base64
   5 import collections
   6 import hashlib
   7 import itertools
   8 import json
   9 import netrc
  10 import os
  11 import random
  12 import re
  13 import sys
  14 import time
  15 import math
  16
  17 from ..compat import (
  18     compat_cookiejar_Cookie,
  19     compat_cookies_SimpleCookie,
  20     compat_etree_Element,
  21     compat_etree_fromstring,
  22     compat_expanduser,
  23     compat_getpass,
  24     compat_http_client,
  25     compat_os_name,
  26     compat_Pattern,
  27     compat_str,
  28     compat_urllib_error,
  29     compat_urllib_parse_unquote,
  30     compat_urllib_parse_urlencode,
  31     compat_urllib_request,
  32     compat_urlparse,
  33     compat_xml_parse_error,
  34 )
  35 from ..downloader import FileDownloader
  36 from ..downloader.f4m import (
  37     get_base_url,
  38     remove_encrypted_media,
  39 )
  40 from ..utils import (
  41     age_restricted,
  42     base_url,
  43     bug_reports_message,
  44     clean_html,
  45     determine_ext,
  46     determine_protocol,
  47     dict_get,
  48     encode_data_uri,
  49     error_to_compat_str,
  50     extract_attributes,
  51     ExtractorError,
  52     filter_dict,
  53     fix_xml_ampersands,
  54     float_or_none,
  55     format_field,
  56     GeoRestrictedError,
  57     GeoUtils,
  58     int_or_none,
  59     join_nonempty,
  60     js_to_json,
  61     JSON_LD_RE,
  62     mimetype2ext,
  63     network_exceptions,
  64     NO_DEFAULT,
  65     orderedSet,
  66     parse_bitrate,
  67     parse_codecs,
  68     parse_duration,
  69     parse_iso8601,
  70     parse_m3u8_attributes,
  71     parse_resolution,
  72     RegexNotFoundError,
  73     sanitize_filename,
  74     sanitized_Request,
  75     str_or_none,
  76     str_to_int,
  77     strip_or_none,
  78     traverse_obj,
  79     try_get,
  80     unescapeHTML,
  81     UnsupportedError,
  82     unified_strdate,
  83     unified_timestamp,
  84     update_Request,
  85     update_url_query,
  86     url_basename,
  87     url_or_none,
  88     urljoin,
  89     variadic,
  90     xpath_element,
  91     xpath_text,
  92     xpath_with_ns,
  93 )
  94
  95
  96 class InfoExtractor(object):
  97     """Information Extractor class.
  98
  99     Information extractors are the classes that, given a URL, extract
 100     information about the video (or videos) the URL refers to. This
 101     information includes the real video URL, the video title, author and
 102     others. The information is stored in a dictionary which is then
 103     passed to the YoutubeDL. The YoutubeDL processes this
 104     information possibly downloading the video to the file system, among
 105     other possible outcomes.
 106
 107     The type field determines the type of the result.
 108     By far the most common value (and the default if _type is missing) is
 109     "video", which indicates a single video.
 110
 111     For a video, the dictionaries must include the following fields:
 112
 113     id:             Video identifier.
 114     title:          Video title, unescaped.
 115
 116     Additionally, it must contain either a formats entry or a url one:
 117
 118     formats:        A list of dictionaries for each format available, ordered
 119                     from worst to best quality.
 120
 121                     Potential fields:
 122                     * url        The mandatory URL representing the media:
 123                                    for plain file media - HTTP URL of this file,
 124                                    for RTMP - RTMP URL,
 125                                    for HLS - URL of the M3U8 media playlist,
 126                                    for HDS - URL of the F4M manifest,
 127                                    for DASH
 128                                      - HTTP URL to plain file media (in case of
 129                                        unfragmented media)
 130                                      - URL of the MPD manifest or base URL
 131                                        representing the media if MPD manifest
 132                                        is parsed from a string (in case of
 133                                        fragmented media)
 134                                    for MSS - URL of the ISM manifest.
 135                     * manifest_url
 136                                  The URL of the manifest file in case of
 137                                  fragmented media:
 138                                    for HLS - URL of the M3U8 master playlist,
 139                                    for HDS - URL of the F4M manifest,
 140                                    for DASH - URL of the MPD manifest,
 141                                    for MSS - URL of the ISM manifest.
 142                     * manifest_stream_number  (For internal use only)
 143                                  The index of the stream in the manifest file
 144                     * ext        Will be calculated from URL if missing
 145                     * format     A human-readable description of the format
 146                                  ("mp4 container with h264/opus").
 147                                  Calculated from the format_id, width, height.
 148                                  and format_note fields if missing.
 149                     * format_id  A short description of the format
 150                                  ("mp4_h264_opus" or "19").
 151                                 Technically optional, but strongly recommended.
 152                     * format_note Additional info about the format
 153                                  ("3D" or "DASH video")
 154                     * width      Width of the video, if known
 155                     * height     Height of the video, if known
 156                     * resolution Textual description of width and height
 157                     * dynamic_range The dynamic range of the video. One of:
 158                                  "SDR" (None), "HDR10", "HDR10+, "HDR12", "HLG, "DV"
 159                     * tbr        Average bitrate of audio and video in KBit/s
 160                     * abr        Average audio bitrate in KBit/s
 161                     * acodec     Name of the audio codec in use
 162                     * asr        Audio sampling rate in Hertz
 163                     * vbr        Average video bitrate in KBit/s
 164                     * fps        Frame rate
 165                     * vcodec     Name of the video codec in use
 166                     * container  Name of the container format
 167                     * filesize   The number of bytes, if known in advance
 168                     * filesize_approx  An estimate for the number of bytes
 169                     * player_url SWF Player URL (used for rtmpdump).
 170                     * protocol   The protocol that will be used for the actual
 171                                  download, lower-case. One of "http", "https" or
 172                                  one of the protocols defined in downloader.PROTOCOL_MAP
 173                     * fragment_base_url
 174                                  Base URL for fragments. Each fragment's path
 175                                  value (if present) will be relative to
 176                                  this URL.
 177                     * fragments  A list of fragments of a fragmented media.
 178                                  Each fragment entry must contain either an url
 179                                  or a path. If an url is present it should be
 180                                  considered by a client. Otherwise both path and
 181                                  fragment_base_url must be present. Here is
 182                                  the list of all potential fields:
 183                                  * "url" - fragment's URL
 184                                  * "path" - fragment's path relative to
 185                                             fragment_base_url
 186                                  * "duration" (optional, int or float)
 187                                  * "filesize" (optional, int)
 188                     * is_from_start  Is a live format that can be downloaded
 189                                 from the start. Boolean
 190                     * preference Order number of this format. If this field is
 191                                  present and not None, the formats get sorted
 192                                  by this field, regardless of all other values.
 193                                  -1 for default (order by other properties),
 194                                  -2 or smaller for less than default.
 195                                  < -1000 to hide the format (if there is
 196                                     another one which is strictly better)
 197                     * language   Language code, e.g. "de" or "en-US".
 198                     * language_preference  Is this in the language mentioned in
 199                                  the URL?
 200                                  10 if it's what the URL is about,
 201                                  -1 for default (don't know),
 202                                  -10 otherwise, other values reserved for now.
 203                     * quality    Order number of the video quality of this
 204                                  format, irrespective of the file format.
 205                                  -1 for default (order by other properties),
 206                                  -2 or smaller for less than default.
 207                     * source_preference  Order number for this video source
 208                                   (quality takes higher priority)
 209                                  -1 for default (order by other properties),
 210                                  -2 or smaller for less than default.
 211                     * http_headers  A dictionary of additional HTTP headers
 212                                  to add to the request.
 213                     * stretched_ratio  If given and not 1, indicates that the
 214                                  video's pixels are not square.
 215                                  width : height ratio as float.
 216                     * no_resume  The server does not support resuming the
 217                                  (HTTP or RTMP) download. Boolean.
 218                     * has_drm    The format has DRM and cannot be downloaded. Boolean
 219                     * downloader_options  A dictionary of downloader options as
 220                                  described in FileDownloader (For internal use only)
 221                     RTMP formats can also have the additional fields: page_url,
 222                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 223                     rtmp_protocol, rtmp_real_time
 224
 225     url:            Final video URL.
 226     ext:            Video filename extension.
 227     format:         The video format, defaults to ext (used for --get-format)
 228     player_url:     SWF Player URL (used for rtmpdump).
 229
 230     The following fields are optional:
 231
 232     direct:         True if a direct video file was given (must only be set by GenericIE)
 233     alt_title:      A secondary title of the video.
 234     display_id      An alternative identifier for the video, not necessarily
 235                     unique, but available before title. Typically, id is
 236                     something like "4234987", title "Dancing naked mole rats",
 237                     and display_id "dancing-naked-mole-rats"
 238     thumbnails:     A list of dictionaries, with the following entries:
 239                         * "id" (optional, string) - Thumbnail format ID
 240                         * "url"
 241                         * "preference" (optional, int) - quality of the image
 242                         * "width" (optional, int)
 243                         * "height" (optional, int)
 244                         * "resolution" (optional, string "{width}x{height}",
 245                                         deprecated)
 246                         * "filesize" (optional, int)
 247                         * "http_headers" (dict) - HTTP headers for the request
 248     thumbnail:      Full URL to a video thumbnail image.
 249     description:    Full video description.
 250     uploader:       Full name of the video uploader.
 251     license:        License name the video is licensed under.
 252     creator:        The creator of the video.
 253     timestamp:      UNIX timestamp of the moment the video was uploaded
 254     upload_date:    Video upload date in UTC (YYYYMMDD).
 255                     If not explicitly set, calculated from timestamp
 256     release_timestamp: UNIX timestamp of the moment the video was released.
 257                     If it is not clear whether to use timestamp or this, use the former
 258     release_date:   The date (YYYYMMDD) when the video was released in UTC.
 259                     If not explicitly set, calculated from release_timestamp
 260     modified_timestamp: UNIX timestamp of the moment the video was last modified.
 261     modified_date:   The date (YYYYMMDD) when the video was last modified in UTC.
 262                     If not explicitly set, calculated from modified_timestamp
 263     uploader_id:    Nickname or id of the video uploader.
 264     uploader_url:   Full URL to a personal webpage of the video uploader.
 265     channel:        Full name of the channel the video is uploaded on.
 266                     Note that channel fields may or may not repeat uploader
 267                     fields. This depends on a particular extractor.
 268     channel_id:     Id of the channel.
 269     channel_url:    Full URL to a channel webpage.
 270     channel_follower_count: Number of followers of the channel.
 271     location:       Physical location where the video was filmed.
 272     subtitles:      The available subtitles as a dictionary in the format
 273                     {tag: subformats}. "tag" is usually a language code, and
 274                     "subformats" is a list sorted from lower to higher
 275                     preference, each element is a dictionary with the "ext"
 276                     entry and one of:
 277                         * "data": The subtitles file contents
 278                         * "url": A URL pointing to the subtitles file
 279                     It can optionally also have:
 280                         * "name": Name or description of the subtitles
 281                         * "http_headers": A dictionary of additional HTTP headers
 282                                   to add to the request.
 283                     "ext" will be calculated from URL if missing
 284     automatic_captions: Like 'subtitles'; contains automatically generated
 285                     captions instead of normal subtitles
 286     duration:       Length of the video in seconds, as an integer or float.
 287     view_count:     How many users have watched the video on the platform.
 288     like_count:     Number of positive ratings of the video
 289     dislike_count:  Number of negative ratings of the video
 290     repost_count:   Number of reposts of the video
 291     average_rating: Average rating give by users, the scale used depends on the webpage
 292     comment_count:  Number of comments on the video
 293     comments:       A list of comments, each with one or more of the following
 294                     properties (all but one of text or html optional):
 295                         * "author" - human-readable name of the comment author
 296                         * "author_id" - user ID of the comment author
 297                         * "author_thumbnail" - The thumbnail of the comment author
 298                         * "id" - Comment ID
 299                         * "html" - Comment as HTML
 300                         * "text" - Plain text of the comment
 301                         * "timestamp" - UNIX timestamp of comment
 302                         * "parent" - ID of the comment this one is replying to.
 303                                      Set to "root" to indicate that this is a
 304                                      comment to the original video.
 305                         * "like_count" - Number of positive ratings of the comment
 306                         * "dislike_count" - Number of negative ratings of the comment
 307                         * "is_favorited" - Whether the comment is marked as
 308                                            favorite by the video uploader
 309                         * "author_is_uploader" - Whether the comment is made by
 310                                                  the video uploader
 311     age_limit:      Age restriction for the video, as an integer (years)
 312     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 313                     should allow to get the same result again. (It will be set
 314                     by YoutubeDL if it's missing)
 315     categories:     A list of categories that the video falls in, for example
 316                     ["Sports", "Berlin"]
 317     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 318     cast:           A list of the video cast
 319     is_live:        True, False, or None (=unknown). Whether this video is a
 320                     live stream that goes on instead of a fixed-length video.
 321     was_live:       True, False, or None (=unknown). Whether this video was
 322                     originally a live stream.
 323     live_status:    'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
 324                     If absent, automatically set from is_live, was_live
 325     start_time:     Time in seconds where the reproduction should start, as
 326                     specified in the URL.
 327     end_time:       Time in seconds where the reproduction should end, as
 328                     specified in the URL.
 329     chapters:       A list of dictionaries, with the following entries:
 330                         * "start_time" - The start time of the chapter in seconds
 331                         * "end_time" - The end time of the chapter in seconds
 332                         * "title" (optional, string)
 333     playable_in_embed: Whether this video is allowed to play in embedded
 334                     players on other sites. Can be True (=always allowed),
 335                     False (=never allowed), None (=unknown), or a string
 336                     specifying the criteria for embedability (Eg: 'whitelist')
 337     availability:   Under what condition the video is available. One of
 338                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 339                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 340                     to set it
 341     __post_extractor: A function to be called just before the metadata is
 342                     written to either disk, logger or console. The function
 343                     must return a dict which will be added to the info_dict.
 344                     This is usefull for additional information that is
 345                     time-consuming to extract. Note that the fields thus
 346                     extracted will not be available to output template and
 347                     match_filter. So, only "comments" and "comment_count" are
 348                     currently allowed to be extracted via this method.
 349
 350     The following fields should only be used when the video belongs to some logical
 351     chapter or section:
 352
 353     chapter:        Name or title of the chapter the video belongs to.
 354     chapter_number: Number of the chapter the video belongs to, as an integer.
 355     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 356
 357     The following fields should only be used when the video is an episode of some
 358     series, programme or podcast:
 359
 360     series:         Title of the series or programme the video episode belongs to.
 361     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 362     season:         Title of the season the video episode belongs to.
 363     season_number:  Number of the season the video episode belongs to, as an integer.
 364     season_id:      Id of the season the video episode belongs to, as a unicode string.
 365     episode:        Title of the video episode. Unlike mandatory video title field,
 366                     this field should denote the exact title of the video episode
 367                     without any kind of decoration.
 368     episode_number: Number of the video episode within a season, as an integer.
 369     episode_id:     Id of the video episode, as a unicode string.
 370
 371     The following fields should only be used when the media is a track or a part of
 372     a music album:
 373
 374     track:          Title of the track.
 375     track_number:   Number of the track within an album or a disc, as an integer.
 376     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 377                     as a unicode string.
 378     artist:         Artist(s) of the track.
 379     genre:          Genre(s) of the track.
 380     album:          Title of the album the track belongs to.
 381     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 382     album_artist:   List of all artists appeared on the album (e.g.
 383                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 384                     and compilations).
 385     disc_number:    Number of the disc or other physical medium the track belongs to,
 386                     as an integer.
 387     release_year:   Year (YYYY) when the album was released.
 388     composer:       Composer of the piece
 389
 390     Unless mentioned otherwise, the fields should be Unicode strings.
 391
 392     Unless mentioned otherwise, None is equivalent to absence of information.
 393
 394
 395     _type "playlist" indicates multiple videos.
 396     There must be a key "entries", which is a list, an iterable, or a PagedList
 397     object, each element of which is a valid dictionary by this specification.
 398
 399     Additionally, playlists can have "id", "title", and any other relevent
 400     attributes with the same semantics as videos (see above).
 401
 402     It can also have the following optional fields:
 403
 404     playlist_count: The total number of videos in a playlist. If not given,
 405                     YoutubeDL tries to calculate it from "entries"
 406
 407
 408     _type "multi_video" indicates that there are multiple videos that
 409     form a single show, for examples multiple acts of an opera or TV episode.
 410     It must have an entries key like a playlist and contain all the keys
 411     required for a video at the same time.
 412
 413
 414     _type "url" indicates that the video must be extracted from another
 415     location, possibly by a different extractor. Its only required key is:
 416     "url" - the next URL to extract.
 417     The key "ie_key" can be set to the class name (minus the trailing "IE",
 418     e.g. "Youtube") if the extractor class is known in advance.
 419     Additionally, the dictionary may have any properties of the resolved entity
 420     known in advance, for example "title" if the title of the referred video is
 421     known ahead of time.
 422
 423
 424     _type "url_transparent" entities have the same specification as "url", but
 425     indicate that the given additional information is more precise than the one
 426     associated with the resolved URL.
 427     This is useful when a site employs a video service that hosts the video and
 428     its technical metadata, but that video service does not embed a useful
 429     title, description etc.
 430
 431
 432     Subclasses of this should define a _VALID_URL regexp and, re-define the
 433     _real_extract() and (optionally) _real_initialize() methods.
 434     Probably, they should also be added to the list of extractors.
 435
 436     Subclasses may also override suitable() if necessary, but ensure the function
 437     signature is preserved and that this function imports everything it needs
 438     (except other extractors), so that lazy_extractors works correctly.
 439
 440     To support username + password (or netrc) login, the extractor must define a
 441     _NETRC_MACHINE and re-define _perform_login(username, password) and
 442     (optionally) _initialize_pre_login() methods. The _perform_login method will
 443     be called between _initialize_pre_login and _real_initialize if credentials
 444     are passed by the user. In cases where it is necessary to have the login
 445     process as part of the extraction rather than initialization, _perform_login
 446     can be left undefined.
 447
 448     _GEO_BYPASS attribute may be set to False in order to disable
 449     geo restriction bypass mechanisms for a particular extractor.
 450     Though it won't disable explicit geo restriction bypass based on
 451     country code provided with geo_bypass_country.
 452
 453     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 454     countries for this extractor. One of these countries will be used by
 455     geo restriction bypass mechanism right away in order to bypass
 456     geo restriction, of course, if the mechanism is not disabled.
 457
 458     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 459     IP blocks in CIDR notation for this extractor. One of these IP blocks
 460     will be used by geo restriction bypass mechanism similarly
 461     to _GEO_COUNTRIES.
 462
 463     The _WORKING attribute should be set to False for broken IEs
 464     in order to warn the users and skip the tests.
 465     """
 466
 467     _ready = False
 468     _downloader = None
 469     _x_forwarded_for_ip = None
 470     _GEO_BYPASS = True
 471     _GEO_COUNTRIES = None
 472     _GEO_IP_BLOCKS = None
 473     _WORKING = True
 474     _NETRC_MACHINE = None
 475     IE_DESC = None
 476
 477     _LOGIN_HINTS = {
 478         'any': 'Use --cookies, --cookies-from-browser, --username and --password, or --netrc to provide account credentials',
 479         'cookies': (
 480             'Use --cookies-from-browser or --cookies for the authentication. '
 481             'See  https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl  for how to manually pass cookies'),
 482         'password': 'Use --username and --password, or --netrc to provide account credentials',
 483     }
 484
 485     def __init__(self, downloader=None):
 486         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 487         If a downloader is not passed during initialization,
 488         it must be set using "set_downloader()" before "extract()" is called"""
 489         self._ready = False
 490         self._x_forwarded_for_ip = None
 491         self._printed_messages = set()
 492         self.set_downloader(downloader)
 493
 494     @classmethod
 495     def _match_valid_url(cls, url):
 496         # This does not use has/getattr intentionally - we want to know whether
 497         # we have cached the regexp for *this* class, whereas getattr would also
 498         # match the superclass
 499         if '_VALID_URL_RE' not in cls.__dict__:
 500             if '_VALID_URL' not in cls.__dict__:
 501                 cls._VALID_URL = cls._make_valid_url()
 502             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 503         return cls._VALID_URL_RE.match(url)
 504
 505     @classmethod
 506     def suitable(cls, url):
 507         """Receives a URL and returns True if suitable for this IE."""
 508         # This function must import everything it needs (except other extractors),
 509         # so that lazy_extractors works correctly
 510         return cls._match_valid_url(url) is not None
 511
 512     @classmethod
 513     def _match_id(cls, url):
 514         return cls._match_valid_url(url).group('id')
 515
 516     @classmethod
 517     def get_temp_id(cls, url):
 518         try:
 519             return cls._match_id(url)
 520         except (IndexError, AttributeError):
 521             return None
 522
 523     @classmethod
 524     def working(cls):
 525         """Getter method for _WORKING."""
 526         return cls._WORKING
 527
 528     @classmethod
 529     def supports_login(cls):
 530         return bool(cls._NETRC_MACHINE)
 531
 532     def initialize(self):
 533         """Initializes an instance (authentication, etc)."""
 534         self._printed_messages = set()
 535         self._initialize_geo_bypass({
 536             'countries': self._GEO_COUNTRIES,
 537             'ip_blocks': self._GEO_IP_BLOCKS,
 538         })
 539         if not self._ready:
 540             self._initialize_pre_login()
 541             if self.supports_login():
 542                 username, password = self._get_login_info()
 543                 if username:
 544                     self._perform_login(username, password)
 545             elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
 546                 self.report_warning(f'Login with password is not supported for this website. {self._LOGIN_HINTS["cookies"]}')
 547             self._real_initialize()
 548             self._ready = True
 549
 550     def _initialize_geo_bypass(self, geo_bypass_context):
 551         """
 552         Initialize geo restriction bypass mechanism.
 553
 554         This method is used to initialize geo bypass mechanism based on faking
 555         X-Forwarded-For HTTP header. A random country from provided country list
 556         is selected and a random IP belonging to this country is generated. This
 557         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 558         HTTP requests.
 559
 560         This method will be used for initial geo bypass mechanism initialization
 561         during the instance initialization with _GEO_COUNTRIES and
 562         _GEO_IP_BLOCKS.
 563
 564         You may also manually call it from extractor's code if geo bypass
 565         information is not available beforehand (e.g. obtained during
 566         extraction) or due to some other reason. In this case you should pass
 567         this information in geo bypass context passed as first argument. It may
 568         contain following fields:
 569
 570         countries:  List of geo unrestricted countries (similar
 571                     to _GEO_COUNTRIES)
 572         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 573                     (similar to _GEO_IP_BLOCKS)
 574
 575         """
 576         if not self._x_forwarded_for_ip:
 577
 578             # Geo bypass mechanism is explicitly disabled by user
 579             if not self.get_param('geo_bypass', True):
 580                 return
 581
 582             if not geo_bypass_context:
 583                 geo_bypass_context = {}
 584
 585             # Backward compatibility: previously _initialize_geo_bypass
 586             # expected a list of countries, some 3rd party code may still use
 587             # it this way
 588             if isinstance(geo_bypass_context, (list, tuple)):
 589                 geo_bypass_context = {
 590                     'countries': geo_bypass_context,
 591                 }
 592
 593             # The whole point of geo bypass mechanism is to fake IP
 594             # as X-Forwarded-For HTTP header based on some IP block or
 595             # country code.
 596
 597             # Path 1: bypassing based on IP block in CIDR notation
 598
 599             # Explicit IP block specified by user, use it right away
 600             # regardless of whether extractor is geo bypassable or not
 601             ip_block = self.get_param('geo_bypass_ip_block', None)
 602
 603             # Otherwise use random IP block from geo bypass context but only
 604             # if extractor is known as geo bypassable
 605             if not ip_block:
 606                 ip_blocks = geo_bypass_context.get('ip_blocks')
 607                 if self._GEO_BYPASS and ip_blocks:
 608                     ip_block = random.choice(ip_blocks)
 609
 610             if ip_block:
 611                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 612                 self._downloader.write_debug(
 613                     '[debug] Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
 614                 return
 615
 616             # Path 2: bypassing based on country code
 617
 618             # Explicit country code specified by user, use it right away
 619             # regardless of whether extractor is geo bypassable or not
 620             country = self.get_param('geo_bypass_country', None)
 621
 622             # Otherwise use random country code from geo bypass context but
 623             # only if extractor is known as geo bypassable
 624             if not country:
 625                 countries = geo_bypass_context.get('countries')
 626                 if self._GEO_BYPASS and countries:
 627                     country = random.choice(countries)
 628
 629             if country:
 630                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 631                 self._downloader.write_debug(
 632                     'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))
 633
 634     def extract(self, url):
 635         """Extracts URL information and returns it in list of dicts."""
 636         try:
 637             for _ in range(2):
 638                 try:
 639                     self.initialize()
 640                     self.write_debug('Extracting URL: %s' % url)
 641                     ie_result = self._real_extract(url)
 642                     if ie_result is None:
 643                         return None
 644                     if self._x_forwarded_for_ip:
 645                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 646                     subtitles = ie_result.get('subtitles')
 647                     if (subtitles and 'live_chat' in subtitles
 648                             and 'no-live-chat' in self.get_param('compat_opts', [])):
 649                         del subtitles['live_chat']
 650                     return ie_result
 651                 except GeoRestrictedError as e:
 652                     if self.__maybe_fake_ip_and_retry(e.countries):
 653                         continue
 654                     raise
 655         except UnsupportedError:
 656             raise
 657         except ExtractorError as e:
 658             kwargs = {
 659                 'video_id': e.video_id or self.get_temp_id(url),
 660                 'ie': self.IE_NAME,
 661                 'tb': e.traceback or sys.exc_info()[2],
 662                 'expected': e.expected,
 663                 'cause': e.cause
 664             }
 665             if hasattr(e, 'countries'):
 666                 kwargs['countries'] = e.countries
 667             raise type(e)(e.orig_msg, **kwargs)
 668         except compat_http_client.IncompleteRead as e:
 669             raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
 670         except (KeyError, StopIteration) as e:
 671             raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 672
 673     def __maybe_fake_ip_and_retry(self, countries):
 674         if (not self.get_param('geo_bypass_country', None)
 675                 and self._GEO_BYPASS
 676                 and self.get_param('geo_bypass', True)
 677                 and not self._x_forwarded_for_ip
 678                 and countries):
 679             country_code = random.choice(countries)
 680             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 681             if self._x_forwarded_for_ip:
 682                 self.report_warning(
 683                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 684                     % (self._x_forwarded_for_ip, country_code.upper()))
 685                 return True
 686         return False
 687
 688     def set_downloader(self, downloader):
 689         """Sets a YoutubeDL instance as the downloader for this IE."""
 690         self._downloader = downloader
 691
 692     def _initialize_pre_login(self):
 693         """ Intialization before login. Redefine in subclasses."""
 694         pass
 695
 696     def _perform_login(self, username, password):
 697         """ Login with username and password. Redefine in subclasses."""
 698         pass
 699
 700     def _real_initialize(self):
 701         """Real initialization process. Redefine in subclasses."""
 702         pass
 703
 704     def _real_extract(self, url):
 705         """Real extraction process. Redefine in subclasses."""
 706         raise NotImplementedError('This method must be implemented by subclasses')
 707
 708     @classmethod
 709     def ie_key(cls):
 710         """A string for getting the InfoExtractor with get_info_extractor"""
 711         return cls.__name__[:-2]
 712
 713     @property
 714     def IE_NAME(self):
 715         return compat_str(type(self).__name__[:-2])
 716
 717     @staticmethod
 718     def __can_accept_status_code(err, expected_status):
 719         assert isinstance(err, compat_urllib_error.HTTPError)
 720         if expected_status is None:
 721             return False
 722         elif callable(expected_status):
 723             return expected_status(err.code) is True
 724         else:
 725             return err.code in variadic(expected_status)
 726
 727     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
 728         """
 729         Return the response handle.
 730
 731         See _download_webpage docstring for arguments specification.
 732         """
 733         if not self._downloader._first_webpage_request:
 734             sleep_interval = self.get_param('sleep_interval_requests') or 0
 735             if sleep_interval > 0:
 736                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 737                 time.sleep(sleep_interval)
 738         else:
 739             self._downloader._first_webpage_request = False
 740
 741         if note is None:
 742             self.report_download_webpage(video_id)
 743         elif note is not False:
 744             if video_id is None:
 745                 self.to_screen('%s' % (note,))
 746             else:
 747                 self.to_screen('%s: %s' % (video_id, note))
 748
 749         # Some sites check X-Forwarded-For HTTP header in order to figure out
 750         # the origin of the client behind proxy. This allows bypassing geo
 751         # restriction by faking this header's value to IP that belongs to some
 752         # geo unrestricted country. We will do so once we encounter any
 753         # geo restriction error.
 754         if self._x_forwarded_for_ip:
 755             if 'X-Forwarded-For' not in headers:
 756                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 757
 758         if isinstance(url_or_request, compat_urllib_request.Request):
 759             url_or_request = update_Request(
 760                 url_or_request, data=data, headers=headers, query=query)
 761         else:
 762             if query:
 763                 url_or_request = update_url_query(url_or_request, query)
 764             if data is not None or headers:
 765                 url_or_request = sanitized_Request(url_or_request, data, headers)
 766         try:
 767             return self._downloader.urlopen(url_or_request)
 768         except network_exceptions as err:
 769             if isinstance(err, compat_urllib_error.HTTPError):
 770                 if self.__can_accept_status_code(err, expected_status):
 771                     # Retain reference to error to prevent file object from
 772                     # being closed before it can be read. Works around the
 773                     # effects of <https://bugs.python.org/issue15002>
 774                     # introduced in Python 3.4.1.
 775                     err.fp._error = err
 776                     return err.fp
 777
 778             if errnote is False:
 779                 return False
 780             if errnote is None:
 781                 errnote = 'Unable to download webpage'
 782
 783             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 784             if fatal:
 785                 raise ExtractorError(errmsg, cause=err)
 786             else:
 787                 self.report_warning(errmsg)
 788                 return False
 789
 790     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
 791         """
 792         Return a tuple (page content as string, URL handle).
 793
 794         See _download_webpage docstring for arguments specification.
 795         """
 796         # Strip hashes from the URL (#1038)
 797         if isinstance(url_or_request, (compat_str, str)):
 798             url_or_request = url_or_request.partition('#')[0]
 799
 800         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 801         if urlh is False:
 802             assert not fatal
 803             return False
 804         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 805         return (content, urlh)
 806
 807     @staticmethod
 808     def _guess_encoding_from_content(content_type, webpage_bytes):
 809         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 810         if m:
 811             encoding = m.group(1)
 812         else:
 813             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 814                           webpage_bytes[:1024])
 815             if m:
 816                 encoding = m.group(1).decode('ascii')
 817             elif webpage_bytes.startswith(b'\xff\xfe'):
 818                 encoding = 'utf-16'
 819             else:
 820                 encoding = 'utf-8'
 821
 822         return encoding
 823
 824     def __check_blocked(self, content):
 825         first_block = content[:512]
 826         if ('<title>Access to this site is blocked</title>' in content
 827                 and 'Websense' in first_block):
 828             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 829             blocked_iframe = self._html_search_regex(
 830                 r'<iframe src="([^"]+)"', content,
 831                 'Websense information URL', default=None)
 832             if blocked_iframe:
 833                 msg += ' Visit %s for more details' % blocked_iframe
 834             raise ExtractorError(msg, expected=True)
 835         if '<title>The URL you requested has been blocked</title>' in first_block:
 836             msg = (
 837                 'Access to this webpage has been blocked by Indian censorship. '
 838                 'Use a VPN or proxy server (with --proxy) to route around it.')
 839             block_msg = self._html_search_regex(
 840                 r'</h1><p>(.*?)</p>',
 841                 content, 'block message', default=None)
 842             if block_msg:
 843                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 844             raise ExtractorError(msg, expected=True)
 845         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 846                 and 'blocklist.rkn.gov.ru' in content):
 847             raise ExtractorError(
 848                 'Access to this webpage has been blocked by decision of the Russian government. '
 849                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 850                 expected=True)
 851
 852     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 853         content_type = urlh.headers.get('Content-Type', '')
 854         webpage_bytes = urlh.read()
 855         if prefix is not None:
 856             webpage_bytes = prefix + webpage_bytes
 857         if not encoding:
 858             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 859         if self.get_param('dump_intermediate_pages', False):
 860             self.to_screen('Dumping request to ' + urlh.geturl())
 861             dump = base64.b64encode(webpage_bytes).decode('ascii')
 862             self._downloader.to_screen(dump)
 863         if self.get_param('write_pages', False):
 864             basen = '%s_%s' % (video_id, urlh.geturl())
 865             trim_length = self.get_param('trim_file_name') or 240
 866             if len(basen) > trim_length:
 867                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 868                 basen = basen[:trim_length - len(h)] + h
 869             raw_filename = basen + '.dump'
 870             filename = sanitize_filename(raw_filename, restricted=True)
 871             self.to_screen('Saving request to ' + filename)
 872             # Working around MAX_PATH limitation on Windows (see
 873             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 874             if compat_os_name == 'nt':
 875                 absfilepath = os.path.abspath(filename)
 876                 if len(absfilepath) > 259:
 877                     filename = '\\\\?\\' + absfilepath
 878             with open(filename, 'wb') as outf:
 879                 outf.write(webpage_bytes)
 880
 881         try:
 882             content = webpage_bytes.decode(encoding, 'replace')
 883         except LookupError:
 884             content = webpage_bytes.decode('utf-8', 'replace')
 885
 886         self.__check_blocked(content)
 887
 888         return content
 889
 890     def _download_webpage(
 891             self, url_or_request, video_id, note=None, errnote=None,
 892             fatal=True, tries=1, timeout=5, encoding=None, data=None,
 893             headers={}, query={}, expected_status=None):
 894         """
 895         Return the data of the page as a string.
 896
 897         Arguments:
 898         url_or_request -- plain text URL as a string or
 899             a compat_urllib_request.Requestobject
 900         video_id -- Video/playlist/item identifier (string)
 901
 902         Keyword arguments:
 903         note -- note printed before downloading (string)
 904         errnote -- note printed in case of an error (string)
 905         fatal -- flag denoting whether error should be considered fatal,
 906             i.e. whether it should cause ExtractionError to be raised,
 907             otherwise a warning will be reported and extraction continued
 908         tries -- number of tries
 909         timeout -- sleep interval between tries
 910         encoding -- encoding for a page content decoding, guessed automatically
 911             when not explicitly specified
 912         data -- POST data (bytes)
 913         headers -- HTTP headers (dict)
 914         query -- URL query (dict)
 915         expected_status -- allows to accept failed HTTP requests (non 2xx
 916             status code) by explicitly specifying a set of accepted status
 917             codes. Can be any of the following entities:
 918                 - an integer type specifying an exact failed status code to
 919                   accept
 920                 - a list or a tuple of integer types specifying a list of
 921                   failed status codes to accept
 922                 - a callable accepting an actual failed status code and
 923                   returning True if it should be accepted
 924             Note that this argument does not affect success status codes (2xx)
 925             which are always accepted.
 926         """
 927
 928         success = False
 929         try_count = 0
 930         while success is False:
 931             try:
 932                 res = self._download_webpage_handle(
 933                     url_or_request, video_id, note, errnote, fatal,
 934                     encoding=encoding, data=data, headers=headers, query=query,
 935                     expected_status=expected_status)
 936                 success = True
 937             except compat_http_client.IncompleteRead as e:
 938                 try_count += 1
 939                 if try_count >= tries:
 940                     raise e
 941                 self._sleep(timeout, video_id)
 942         if res is False:
 943             return res
 944         else:
 945             content, _ = res
 946             return content
 947
 948     def _download_xml_handle(
 949             self, url_or_request, video_id, note='Downloading XML',
 950             errnote='Unable to download XML', transform_source=None,
 951             fatal=True, encoding=None, data=None, headers={}, query={},
 952             expected_status=None):
 953         """
 954         Return a tuple (xml as an compat_etree_Element, URL handle).
 955
 956         See _download_webpage docstring for arguments specification.
 957         """
 958         res = self._download_webpage_handle(
 959             url_or_request, video_id, note, errnote, fatal=fatal,
 960             encoding=encoding, data=data, headers=headers, query=query,
 961             expected_status=expected_status)
 962         if res is False:
 963             return res
 964         xml_string, urlh = res
 965         return self._parse_xml(
 966             xml_string, video_id, transform_source=transform_source,
 967             fatal=fatal), urlh
 968
 969     def _download_xml(
 970             self, url_or_request, video_id,
 971             note='Downloading XML', errnote='Unable to download XML',
 972             transform_source=None, fatal=True, encoding=None,
 973             data=None, headers={}, query={}, expected_status=None):
 974         """
 975         Return the xml as an compat_etree_Element.
 976
 977         See _download_webpage docstring for arguments specification.
 978         """
 979         res = self._download_xml_handle(
 980             url_or_request, video_id, note=note, errnote=errnote,
 981             transform_source=transform_source, fatal=fatal, encoding=encoding,
 982             data=data, headers=headers, query=query,
 983             expected_status=expected_status)
 984         return res if res is False else res[0]
 985
 986     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 987         if transform_source:
 988             xml_string = transform_source(xml_string)
 989         try:
 990             return compat_etree_fromstring(xml_string.encode('utf-8'))
 991         except compat_xml_parse_error as ve:
 992             errmsg = '%s: Failed to parse XML ' % video_id
 993             if fatal:
 994                 raise ExtractorError(errmsg, cause=ve)
 995             else:
 996                 self.report_warning(errmsg + str(ve))
 997
 998     def _download_json_handle(
 999             self, url_or_request, video_id, note='Downloading JSON metadata',
1000             errnote='Unable to download JSON metadata', transform_source=None,
1001             fatal=True, encoding=None, data=None, headers={}, query={},
1002             expected_status=None):
1003         """
1004         Return a tuple (JSON object, URL handle).
1005
1006         See _download_webpage docstring for arguments specification.
1007         """
1008         res = self._download_webpage_handle(
1009             url_or_request, video_id, note, errnote, fatal=fatal,
1010             encoding=encoding, data=data, headers=headers, query=query,
1011             expected_status=expected_status)
1012         if res is False:
1013             return res
1014         json_string, urlh = res
1015         return self._parse_json(
1016             json_string, video_id, transform_source=transform_source,
1017             fatal=fatal), urlh
1018
1019     def _download_json(
1020             self, url_or_request, video_id, note='Downloading JSON metadata',
1021             errnote='Unable to download JSON metadata', transform_source=None,
1022             fatal=True, encoding=None, data=None, headers={}, query={},
1023             expected_status=None):
1024         """
1025         Return the JSON object as a dict.
1026
1027         See _download_webpage docstring for arguments specification.
1028         """
1029         res = self._download_json_handle(
1030             url_or_request, video_id, note=note, errnote=errnote,
1031             transform_source=transform_source, fatal=fatal, encoding=encoding,
1032             data=data, headers=headers, query=query,
1033             expected_status=expected_status)
1034         return res if res is False else res[0]
1035
1036     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
1037         if transform_source:
1038             json_string = transform_source(json_string)
1039         try:
1040             return json.loads(json_string, strict=False)
1041         except ValueError as ve:
1042             errmsg = '%s: Failed to parse JSON ' % video_id
1043             if fatal:
1044                 raise ExtractorError(errmsg, cause=ve)
1045             else:
1046                 self.report_warning(errmsg + str(ve))
1047
1048     def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
1049         return self._parse_json(
1050             data[data.find('{'):data.rfind('}') + 1],
1051             video_id, transform_source, fatal)
1052
1053     def _download_socket_json_handle(
1054             self, url_or_request, video_id, note='Polling socket',
1055             errnote='Unable to poll socket', transform_source=None,
1056             fatal=True, encoding=None, data=None, headers={}, query={},
1057             expected_status=None):
1058         """
1059         Return a tuple (JSON object, URL handle).
1060
1061         See _download_webpage docstring for arguments specification.
1062         """
1063         res = self._download_webpage_handle(
1064             url_or_request, video_id, note, errnote, fatal=fatal,
1065             encoding=encoding, data=data, headers=headers, query=query,
1066             expected_status=expected_status)
1067         if res is False:
1068             return res
1069         webpage, urlh = res
1070         return self._parse_socket_response_as_json(
1071             webpage, video_id, transform_source=transform_source,
1072             fatal=fatal), urlh
1073
1074     def _download_socket_json(
1075             self, url_or_request, video_id, note='Polling socket',
1076             errnote='Unable to poll socket', transform_source=None,
1077             fatal=True, encoding=None, data=None, headers={}, query={},
1078             expected_status=None):
1079         """
1080         Return the JSON object as a dict.
1081
1082         See _download_webpage docstring for arguments specification.
1083         """
1084         res = self._download_socket_json_handle(
1085             url_or_request, video_id, note=note, errnote=errnote,
1086             transform_source=transform_source, fatal=fatal, encoding=encoding,
1087             data=data, headers=headers, query=query,
1088             expected_status=expected_status)
1089         return res if res is False else res[0]
1090
1091     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1092         idstr = format_field(video_id, template='%s: ')
1093         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1094         if only_once:
1095             if f'WARNING: {msg}' in self._printed_messages:
1096                 return
1097             self._printed_messages.add(f'WARNING: {msg}')
1098         self._downloader.report_warning(msg, *args, **kwargs)
1099
1100     def to_screen(self, msg, *args, **kwargs):
1101         """Print msg to screen, prefixing it with '[ie_name]'"""
1102         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1103
1104     def write_debug(self, msg, *args, **kwargs):
1105         self._downloader.write_debug('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1106
1107     def get_param(self, name, default=None, *args, **kwargs):
1108         if self._downloader:
1109             return self._downloader.params.get(name, default, *args, **kwargs)
1110         return default
1111
1112     def report_drm(self, video_id, partial=False):
1113         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1114
1115     def report_extraction(self, id_or_name):
1116         """Report information extraction."""
1117         self.to_screen('%s: Extracting information' % id_or_name)
1118
1119     def report_download_webpage(self, video_id):
1120         """Report webpage download."""
1121         self.to_screen('%s: Downloading webpage' % video_id)
1122
1123     def report_age_confirmation(self):
1124         """Report attempt to confirm age."""
1125         self.to_screen('Confirming age')
1126
1127     def report_login(self):
1128         """Report attempt to log in."""
1129         self.to_screen('Logging in')
1130
1131     def raise_login_required(
1132             self, msg='This video is only available for registered users',
1133             metadata_available=False, method=NO_DEFAULT):
1134         if metadata_available and (
1135                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1136             self.report_warning(msg)
1137             return
1138         if method is NO_DEFAULT:
1139             method = 'any' if self.supports_login() else 'cookies'
1140         if method is not None:
1141             assert method in self._LOGIN_HINTS, 'Invalid login method'
1142             msg = '%s. %s' % (msg, self._LOGIN_HINTS[method])
1143         raise ExtractorError(msg, expected=True)
1144
1145     def raise_geo_restricted(
1146             self, msg='This video is not available from your location due to geo restriction',
1147             countries=None, metadata_available=False):
1148         if metadata_available and (
1149                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1150             self.report_warning(msg)
1151         else:
1152             raise GeoRestrictedError(msg, countries=countries)
1153
1154     def raise_no_formats(self, msg, expected=False, video_id=None):
1155         if expected and (
1156                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1157             self.report_warning(msg, video_id)
1158         elif isinstance(msg, ExtractorError):
1159             raise msg
1160         else:
1161             raise ExtractorError(msg, expected=expected, video_id=video_id)
1162
1163     # Methods for following #608
1164     @staticmethod
1165     def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1166         """Returns a URL that points to a page that should be processed"""
1167         if ie is not None:
1168             kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1169         if video_id is not None:
1170             kwargs['id'] = video_id
1171         if video_title is not None:
1172             kwargs['title'] = video_title
1173         return {
1174             **kwargs,
1175             '_type': 'url_transparent' if url_transparent else 'url',
1176             'url': url,
1177         }
1178
1179     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None, video_kwargs=None, **kwargs):
1180         urls = (self.url_result(self._proto_relative_url(m), ie, **(video_kwargs or {}))
1181                 for m in orderedSet(map(getter, matches) if getter else matches))
1182         return self.playlist_result(urls, playlist_id, playlist_title, **kwargs)
1183
1184     @staticmethod
1185     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1186         """Returns a playlist"""
1187         if playlist_id:
1188             kwargs['id'] = playlist_id
1189         if playlist_title:
1190             kwargs['title'] = playlist_title
1191         if playlist_description is not None:
1192             kwargs['description'] = playlist_description
1193         return {
1194             **kwargs,
1195             '_type': 'multi_video' if multi_video else 'playlist',
1196             'entries': entries,
1197         }
1198
1199     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1200         """
1201         Perform a regex search on the given string, using a single or a list of
1202         patterns returning the first matching group.
1203         In case of failure return a default value or raise a WARNING or a
1204         RegexNotFoundError, depending on fatal, specifying the field name.
1205         """
1206         if string is None:
1207             mobj = None
1208         elif isinstance(pattern, (str, compat_Pattern)):
1209             mobj = re.search(pattern, string, flags)
1210         else:
1211             for p in pattern:
1212                 mobj = re.search(p, string, flags)
1213                 if mobj:
1214                     break
1215
1216         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1217
1218         if mobj:
1219             if group is None:
1220                 # return the first matching group
1221                 return next(g for g in mobj.groups() if g is not None)
1222             elif isinstance(group, (list, tuple)):
1223                 return tuple(mobj.group(g) for g in group)
1224             else:
1225                 return mobj.group(group)
1226         elif default is not NO_DEFAULT:
1227             return default
1228         elif fatal:
1229             raise RegexNotFoundError('Unable to extract %s' % _name)
1230         else:
1231             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1232             return None
1233
1234     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1235         """
1236         Like _search_regex, but strips HTML tags and unescapes entities.
1237         """
1238         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1239         if res:
1240             return clean_html(res).strip()
1241         else:
1242             return res
1243
1244     def _get_netrc_login_info(self, netrc_machine=None):
1245         username = None
1246         password = None
1247         netrc_machine = netrc_machine or self._NETRC_MACHINE
1248
1249         if self.get_param('usenetrc', False):
1250             try:
1251                 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1252                 if os.path.isdir(netrc_file):
1253                     netrc_file = os.path.join(netrc_file, '.netrc')
1254                 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1255                 if info is not None:
1256                     username = info[0]
1257                     password = info[2]
1258                 else:
1259                     raise netrc.NetrcParseError(
1260                         'No authenticators for %s' % netrc_machine)
1261             except (IOError, netrc.NetrcParseError) as err:
1262                 self.report_warning(
1263                     'parsing .netrc: %s' % error_to_compat_str(err))
1264
1265         return username, password
1266
1267     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1268         """
1269         Get the login info as (username, password)
1270         First look for the manually specified credentials using username_option
1271         and password_option as keys in params dictionary. If no such credentials
1272         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1273         value.
1274         If there's no info available, return (None, None)
1275         """
1276
1277         # Attempt to use provided username and password or .netrc data
1278         username = self.get_param(username_option)
1279         if username is not None:
1280             password = self.get_param(password_option)
1281         else:
1282             username, password = self._get_netrc_login_info(netrc_machine)
1283
1284         return username, password
1285
1286     def _get_tfa_info(self, note='two-factor verification code'):
1287         """
1288         Get the two-factor authentication info
1289         TODO - asking the user will be required for sms/phone verify
1290         currently just uses the command line option
1291         If there's no info available, return None
1292         """
1293
1294         tfa = self.get_param('twofactor')
1295         if tfa is not None:
1296             return tfa
1297
1298         return compat_getpass('Type %s and press [Return]: ' % note)
1299
1300     # Helper functions for extracting OpenGraph info
1301     @staticmethod
1302     def _og_regexes(prop):
1303         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1304         property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1305                        % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
1306         template = r'<meta[^>]+?%s[^>]+?%s'
1307         return [
1308             template % (property_re, content_re),
1309             template % (content_re, property_re),
1310         ]
1311
1312     @staticmethod
1313     def _meta_regex(prop):
1314         return r'''(?isx)<meta
1315                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1316                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1317
1318     def _og_search_property(self, prop, html, name=None, **kargs):
1319         prop = variadic(prop)
1320         if name is None:
1321             name = 'OpenGraph %s' % prop[0]
1322         og_regexes = []
1323         for p in prop:
1324             og_regexes.extend(self._og_regexes(p))
1325         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1326         if escaped is None:
1327             return None
1328         return unescapeHTML(escaped)
1329
1330     def _og_search_thumbnail(self, html, **kargs):
1331         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1332
1333     def _og_search_description(self, html, **kargs):
1334         return self._og_search_property('description', html, fatal=False, **kargs)
1335
1336     def _og_search_title(self, html, *, fatal=False, **kargs):
1337         return self._og_search_property('title', html, fatal=fatal, **kargs)
1338
1339     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1340         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1341         if secure:
1342             regexes = self._og_regexes('video:secure_url') + regexes
1343         return self._html_search_regex(regexes, html, name, **kargs)
1344
1345     def _og_search_url(self, html, **kargs):
1346         return self._og_search_property('url', html, **kargs)
1347
1348     def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1349         return self._html_search_regex(r'(?s)<title>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1350
1351     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1352         name = variadic(name)
1353         if display_name is None:
1354             display_name = name[0]
1355         return self._html_search_regex(
1356             [self._meta_regex(n) for n in name],
1357             html, display_name, fatal=fatal, group='content', **kwargs)
1358
1359     def _dc_search_uploader(self, html):
1360         return self._html_search_meta('dc.creator', html, 'uploader')
1361
1362     def _rta_search(self, html):
1363         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1364         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1365                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1366                      html):
1367             return 18
1368         return 0
1369
1370     def _media_rating_search(self, html):
1371         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1372         rating = self._html_search_meta('rating', html)
1373
1374         if not rating:
1375             return None
1376
1377         RATING_TABLE = {
1378             'safe for kids': 0,
1379             'general': 8,
1380             '14 years': 14,
1381             'mature': 17,
1382             'restricted': 19,
1383         }
1384         return RATING_TABLE.get(rating.lower())
1385
1386     def _family_friendly_search(self, html):
1387         # See http://schema.org/VideoObject
1388         family_friendly = self._html_search_meta(
1389             'isFamilyFriendly', html, default=None)
1390
1391         if not family_friendly:
1392             return None
1393
1394         RATING_TABLE = {
1395             '1': 0,
1396             'true': 0,
1397             '0': 18,
1398             'false': 18,
1399         }
1400         return RATING_TABLE.get(family_friendly.lower())
1401
1402     def _twitter_search_player(self, html):
1403         return self._html_search_meta('twitter:player', html,
1404                                       'twitter card player')
1405
1406     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1407         json_ld_list = list(re.finditer(JSON_LD_RE, html))
1408         default = kwargs.get('default', NO_DEFAULT)
1409         # JSON-LD may be malformed and thus `fatal` should be respected.
1410         # At the same time `default` may be passed that assumes `fatal=False`
1411         # for _search_regex. Let's simulate the same behavior here as well.
1412         fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
1413         json_ld = []
1414         for mobj in json_ld_list:
1415             json_ld_item = self._parse_json(
1416                 mobj.group('json_ld'), video_id, fatal=fatal)
1417             if not json_ld_item:
1418                 continue
1419             if isinstance(json_ld_item, dict):
1420                 json_ld.append(json_ld_item)
1421             elif isinstance(json_ld_item, (list, tuple)):
1422                 json_ld.extend(json_ld_item)
1423         if json_ld:
1424             json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1425         if json_ld:
1426             return json_ld
1427         if default is not NO_DEFAULT:
1428             return default
1429         elif fatal:
1430             raise RegexNotFoundError('Unable to extract JSON-LD')
1431         else:
1432             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1433             return {}
1434
1435     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1436         if isinstance(json_ld, compat_str):
1437             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1438         if not json_ld:
1439             return {}
1440         info = {}
1441         if not isinstance(json_ld, (list, tuple, dict)):
1442             return info
1443         if isinstance(json_ld, dict):
1444             json_ld = [json_ld]
1445
1446         INTERACTION_TYPE_MAP = {
1447             'CommentAction': 'comment',
1448             'AgreeAction': 'like',
1449             'DisagreeAction': 'dislike',
1450             'LikeAction': 'like',
1451             'DislikeAction': 'dislike',
1452             'ListenAction': 'view',
1453             'WatchAction': 'view',
1454             'ViewAction': 'view',
1455         }
1456
1457         def extract_interaction_type(e):
1458             interaction_type = e.get('interactionType')
1459             if isinstance(interaction_type, dict):
1460                 interaction_type = interaction_type.get('@type')
1461             return str_or_none(interaction_type)
1462
1463         def extract_interaction_statistic(e):
1464             interaction_statistic = e.get('interactionStatistic')
1465             if isinstance(interaction_statistic, dict):
1466                 interaction_statistic = [interaction_statistic]
1467             if not isinstance(interaction_statistic, list):
1468                 return
1469             for is_e in interaction_statistic:
1470                 if not isinstance(is_e, dict):
1471                     continue
1472                 if is_e.get('@type') != 'InteractionCounter':
1473                     continue
1474                 interaction_type = extract_interaction_type(is_e)
1475                 if not interaction_type:
1476                     continue
1477                 # For interaction count some sites provide string instead of
1478                 # an integer (as per spec) with non digit characters (e.g. ",")
1479                 # so extracting count with more relaxed str_to_int
1480                 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1481                 if interaction_count is None:
1482                     continue
1483                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1484                 if not count_kind:
1485                     continue
1486                 count_key = '%s_count' % count_kind
1487                 if info.get(count_key) is not None:
1488                     continue
1489                 info[count_key] = interaction_count
1490
1491         def extract_chapter_information(e):
1492             chapters = [{
1493                 'title': part.get('name'),
1494                 'start_time': part.get('startOffset'),
1495                 'end_time': part.get('endOffset'),
1496             } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
1497             for idx, (last_c, current_c, next_c) in enumerate(zip(
1498                     [{'end_time': 0}] + chapters, chapters, chapters[1:])):
1499                 current_c['end_time'] = current_c['end_time'] or next_c['start_time']
1500                 current_c['start_time'] = current_c['start_time'] or last_c['end_time']
1501                 if None in current_c.values():
1502                     self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
1503                     return
1504             if chapters:
1505                 chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
1506                 info['chapters'] = chapters
1507
1508         def extract_video_object(e):
1509             assert e['@type'] == 'VideoObject'
1510             author = e.get('author')
1511             info.update({
1512                 'url': url_or_none(e.get('contentUrl')),
1513                 'title': unescapeHTML(e.get('name')),
1514                 'description': unescapeHTML(e.get('description')),
1515                 'thumbnails': [{'url': url_or_none(url)}
1516                                for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))],
1517                 'duration': parse_duration(e.get('duration')),
1518                 'timestamp': unified_timestamp(e.get('uploadDate')),
1519                 # author can be an instance of 'Organization' or 'Person' types.
1520                 # both types can have 'name' property(inherited from 'Thing' type). [1]
1521                 # however some websites are using 'Text' type instead.
1522                 # 1. https://schema.org/VideoObject
1523                 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
1524                 'filesize': float_or_none(e.get('contentSize')),
1525                 'tbr': int_or_none(e.get('bitrate')),
1526                 'width': int_or_none(e.get('width')),
1527                 'height': int_or_none(e.get('height')),
1528                 'view_count': int_or_none(e.get('interactionCount')),
1529             })
1530             extract_interaction_statistic(e)
1531             extract_chapter_information(e)
1532
1533         def traverse_json_ld(json_ld, at_top_level=True):
1534             for e in json_ld:
1535                 if at_top_level and '@context' not in e:
1536                     continue
1537                 if at_top_level and set(e.keys()) == {'@context', '@graph'}:
1538                     traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
1539                     break
1540                 item_type = e.get('@type')
1541                 if expected_type is not None and expected_type != item_type:
1542                     continue
1543                 rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
1544                 if rating is not None:
1545                     info['average_rating'] = rating
1546                 if item_type in ('TVEpisode', 'Episode'):
1547                     episode_name = unescapeHTML(e.get('name'))
1548                     info.update({
1549                         'episode': episode_name,
1550                         'episode_number': int_or_none(e.get('episodeNumber')),
1551                         'description': unescapeHTML(e.get('description')),
1552                     })
1553                     if not info.get('title') and episode_name:
1554                         info['title'] = episode_name
1555                     part_of_season = e.get('partOfSeason')
1556                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1557                         info.update({
1558                             'season': unescapeHTML(part_of_season.get('name')),
1559                             'season_number': int_or_none(part_of_season.get('seasonNumber')),
1560                         })
1561                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1562                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1563                         info['series'] = unescapeHTML(part_of_series.get('name'))
1564                 elif item_type == 'Movie':
1565                     info.update({
1566                         'title': unescapeHTML(e.get('name')),
1567                         'description': unescapeHTML(e.get('description')),
1568                         'duration': parse_duration(e.get('duration')),
1569                         'timestamp': unified_timestamp(e.get('dateCreated')),
1570                     })
1571                 elif item_type in ('Article', 'NewsArticle'):
1572                     info.update({
1573                         'timestamp': parse_iso8601(e.get('datePublished')),
1574                         'title': unescapeHTML(e.get('headline')),
1575                         'description': unescapeHTML(e.get('articleBody') or e.get('description')),
1576                     })
1577                     if traverse_obj(e, ('video', 0, '@type')) == 'VideoObject':
1578                         extract_video_object(e['video'][0])
1579                 elif item_type == 'VideoObject':
1580                     extract_video_object(e)
1581                     if expected_type is None:
1582                         continue
1583                     else:
1584                         break
1585                 video = e.get('video')
1586                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1587                     extract_video_object(video)
1588                 if expected_type is None:
1589                     continue
1590                 else:
1591                     break
1592         traverse_json_ld(json_ld)
1593
1594         return filter_dict(info)
1595
1596     def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1597         return self._parse_json(
1598             self._search_regex(
1599                 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1600                 webpage, 'next.js data', fatal=fatal, **kw),
1601             video_id, transform_source=transform_source, fatal=fatal)
1602
1603     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__'):
1604         ''' Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function. '''
1605         # not all website do this, but it can be changed
1606         # https://stackoverflow.com/questions/67463109/how-to-change-or-hide-nuxt-and-nuxt-keyword-in-page-source
1607         rectx = re.escape(context_name)
1608         js, arg_keys, arg_vals = self._search_regex(
1609             (r'<script>window\.%s=\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.+?)\)\);?</script>' % rectx,
1610              r'%s\(.*?\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)' % rectx),
1611             webpage, context_name, group=['js', 'arg_keys', 'arg_vals'])
1612
1613         args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
1614
1615         for key, val in args.items():
1616             if val in ('undefined', 'void 0'):
1617                 args[key] = 'null'
1618
1619         return self._parse_json(js_to_json(js, args), video_id)['data'][0]
1620
1621     @staticmethod
1622     def _hidden_inputs(html):
1623         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1624         hidden_inputs = {}
1625         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1626             attrs = extract_attributes(input)
1627             if not input:
1628                 continue
1629             if attrs.get('type') not in ('hidden', 'submit'):
1630                 continue
1631             name = attrs.get('name') or attrs.get('id')
1632             value = attrs.get('value')
1633             if name and value is not None:
1634                 hidden_inputs[name] = value
1635         return hidden_inputs
1636
1637     def _form_hidden_inputs(self, form_id, html):
1638         form = self._search_regex(
1639             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1640             html, '%s form' % form_id, group='form')
1641         return self._hidden_inputs(form)
1642
1643     class FormatSort:
1644         regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
1645
1646         default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
1647                    'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
1648                    'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
1649         ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
1650                         'height', 'width', 'proto', 'vext', 'abr', 'aext',
1651                         'fps', 'fs_approx', 'source', 'id')
1652
1653         settings = {
1654             'vcodec': {'type': 'ordered', 'regex': True,
1655                        'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1656             'acodec': {'type': 'ordered', 'regex': True,
1657                        'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
1658             'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
1659                     'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
1660             'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1661                       'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
1662             'vext': {'type': 'ordered', 'field': 'video_ext',
1663                      'order': ('mp4', 'webm', 'flv', '', 'none'),
1664                      'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1665             'aext': {'type': 'ordered', 'field': 'audio_ext',
1666                      'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1667                      'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1668             'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1669             'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
1670                            'field': ('vcodec', 'acodec'),
1671                            'function': lambda it: int(any(v != 'none' for v in it))},
1672             'ie_pref': {'priority': True, 'type': 'extractor'},
1673             'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1674             'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1675             'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
1676             'quality': {'convert': 'float', 'default': -1},
1677             'filesize': {'convert': 'bytes'},
1678             'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1679             'id': {'convert': 'string', 'field': 'format_id'},
1680             'height': {'convert': 'float_none'},
1681             'width': {'convert': 'float_none'},
1682             'fps': {'convert': 'float_none'},
1683             'tbr': {'convert': 'float_none'},
1684             'vbr': {'convert': 'float_none'},
1685             'abr': {'convert': 'float_none'},
1686             'asr': {'convert': 'float_none'},
1687             'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
1688
1689             'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1690             'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1691             'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1692             'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1693             'res': {'type': 'multiple', 'field': ('height', 'width'),
1694                     'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
1695
1696             # For compatibility with youtube-dl
1697             'format_id': {'type': 'alias', 'field': 'id'},
1698             'preference': {'type': 'alias', 'field': 'ie_pref'},
1699             'language_preference': {'type': 'alias', 'field': 'lang'},
1700             'source_preference': {'type': 'alias', 'field': 'source'},
1701             'protocol': {'type': 'alias', 'field': 'proto'},
1702             'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1703
1704             # Deprecated
1705             'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
1706             'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
1707             'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
1708             'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
1709             'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
1710             'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
1711             'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
1712             'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
1713             'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
1714             'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
1715             'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
1716             'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
1717             'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
1718             'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
1719             'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
1720             'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
1721             'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
1722             'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
1723             'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
1724             'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
1725         }
1726
1727         def __init__(self, ie, field_preference):
1728             self._order = []
1729             self.ydl = ie._downloader
1730             self.evaluate_params(self.ydl.params, field_preference)
1731             if ie.get_param('verbose'):
1732                 self.print_verbose_info(self.ydl.write_debug)
1733
1734         def _get_field_setting(self, field, key):
1735             if field not in self.settings:
1736                 if key in ('forced', 'priority'):
1737                     return False
1738                 self.ydl.deprecation_warning(
1739                     f'Using arbitrary fields ({field}) for format sorting is deprecated '
1740                     'and may be removed in a future version')
1741                 self.settings[field] = {}
1742             propObj = self.settings[field]
1743             if key not in propObj:
1744                 type = propObj.get('type')
1745                 if key == 'field':
1746                     default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1747                 elif key == 'convert':
1748                     default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1749                 else:
1750                     default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1751                 propObj[key] = default
1752             return propObj[key]
1753
1754         def _resolve_field_value(self, field, value, convertNone=False):
1755             if value is None:
1756                 if not convertNone:
1757                     return None
1758             else:
1759                 value = value.lower()
1760             conversion = self._get_field_setting(field, 'convert')
1761             if conversion == 'ignore':
1762                 return None
1763             if conversion == 'string':
1764                 return value
1765             elif conversion == 'float_none':
1766                 return float_or_none(value)
1767             elif conversion == 'bytes':
1768                 return FileDownloader.parse_bytes(value)
1769             elif conversion == 'order':
1770                 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1771                 use_regex = self._get_field_setting(field, 'regex')
1772                 list_length = len(order_list)
1773                 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1774                 if use_regex and value is not None:
1775                     for i, regex in enumerate(order_list):
1776                         if regex and re.match(regex, value):
1777                             return list_length - i
1778                     return list_length - empty_pos  # not in list
1779                 else:  # not regex or  value = None
1780                     return list_length - (order_list.index(value) if value in order_list else empty_pos)
1781             else:
1782                 if value.isnumeric():
1783                     return float(value)
1784                 else:
1785                     self.settings[field]['convert'] = 'string'
1786                     return value
1787
1788         def evaluate_params(self, params, sort_extractor):
1789             self._use_free_order = params.get('prefer_free_formats', False)
1790             self._sort_user = params.get('format_sort', [])
1791             self._sort_extractor = sort_extractor
1792
1793             def add_item(field, reverse, closest, limit_text):
1794                 field = field.lower()
1795                 if field in self._order:
1796                     return
1797                 self._order.append(field)
1798                 limit = self._resolve_field_value(field, limit_text)
1799                 data = {
1800                     'reverse': reverse,
1801                     'closest': False if limit is None else closest,
1802                     'limit_text': limit_text,
1803                     'limit': limit}
1804                 if field in self.settings:
1805                     self.settings[field].update(data)
1806                 else:
1807                     self.settings[field] = data
1808
1809             sort_list = (
1810                 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1811                 + (tuple() if params.get('format_sort_force', False)
1812                    else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1813                 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1814
1815             for item in sort_list:
1816                 match = re.match(self.regex, item)
1817                 if match is None:
1818                     raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1819                 field = match.group('field')
1820                 if field is None:
1821                     continue
1822                 if self._get_field_setting(field, 'type') == 'alias':
1823                     alias, field = field, self._get_field_setting(field, 'field')
1824                     if self._get_field_setting(alias, 'deprecated'):
1825                         self.ydl.deprecation_warning(
1826                             f'Format sorting alias {alias} is deprecated '
1827                             f'and may be removed in a future version. Please use {field} instead')
1828                 reverse = match.group('reverse') is not None
1829                 closest = match.group('separator') == '~'
1830                 limit_text = match.group('limit')
1831
1832                 has_limit = limit_text is not None
1833                 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1834                 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1835
1836                 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1837                 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
1838                 limit_count = len(limits)
1839                 for (i, f) in enumerate(fields):
1840                     add_item(f, reverse, closest,
1841                              limits[i] if i < limit_count
1842                              else limits[0] if has_limit and not has_multiple_limits
1843                              else None)
1844
1845         def print_verbose_info(self, write_debug):
1846             if self._sort_user:
1847                 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1848             if self._sort_extractor:
1849                 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1850             write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1851                 '+' if self._get_field_setting(field, 'reverse') else '', field,
1852                 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1853                               self._get_field_setting(field, 'limit_text'),
1854                               self._get_field_setting(field, 'limit'))
1855                 if self._get_field_setting(field, 'limit_text') is not None else '')
1856                 for field in self._order if self._get_field_setting(field, 'visible')]))
1857
1858         def _calculate_field_preference_from_value(self, format, field, type, value):
1859             reverse = self._get_field_setting(field, 'reverse')
1860             closest = self._get_field_setting(field, 'closest')
1861             limit = self._get_field_setting(field, 'limit')
1862
1863             if type == 'extractor':
1864                 maximum = self._get_field_setting(field, 'max')
1865                 if value is None or (maximum is not None and value >= maximum):
1866                     value = -1
1867             elif type == 'boolean':
1868                 in_list = self._get_field_setting(field, 'in_list')
1869                 not_in_list = self._get_field_setting(field, 'not_in_list')
1870                 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1871             elif type == 'ordered':
1872                 value = self._resolve_field_value(field, value, True)
1873
1874             # try to convert to number
1875             val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1876             is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1877             if is_num:
1878                 value = val_num
1879
1880             return ((-10, 0) if value is None
1881                     else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
1882                     else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1883                     else (0, value, 0) if not reverse and (limit is None or value <= limit)
1884                     else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1885                     else (-1, value, 0))
1886
1887         def _calculate_field_preference(self, format, field):
1888             type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
1889             get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1890             if type == 'multiple':
1891                 type = 'field'  # Only 'field' is allowed in multiple for now
1892                 actual_fields = self._get_field_setting(field, 'field')
1893
1894                 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1895             else:
1896                 value = get_value(field)
1897             return self._calculate_field_preference_from_value(format, field, type, value)
1898
1899         def calculate_preference(self, format):
1900             # Determine missing protocol
1901             if not format.get('protocol'):
1902                 format['protocol'] = determine_protocol(format)
1903
1904             # Determine missing ext
1905             if not format.get('ext') and 'url' in format:
1906                 format['ext'] = determine_ext(format['url'])
1907             if format.get('vcodec') == 'none':
1908                 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1909                 format['video_ext'] = 'none'
1910             else:
1911                 format['video_ext'] = format['ext']
1912                 format['audio_ext'] = 'none'
1913             # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
1914             #    format['preference'] = -1000
1915
1916             # Determine missing bitrates
1917             if format.get('tbr') is None:
1918                 if format.get('vbr') is not None and format.get('abr') is not None:
1919                     format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1920             else:
1921                 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1922                     format['vbr'] = format.get('tbr') - format.get('abr', 0)
1923                 if format.get('acodec') != 'none' and format.get('abr') is None:
1924                     format['abr'] = format.get('tbr') - format.get('vbr', 0)
1925
1926             return tuple(self._calculate_field_preference(format, field) for field in self._order)
1927
1928     def _sort_formats(self, formats, field_preference=[]):
1929         if not formats:
1930             return
1931         format_sort = self.FormatSort(self, field_preference)
1932         formats.sort(key=lambda f: format_sort.calculate_preference(f))
1933
1934     def _check_formats(self, formats, video_id):
1935         if formats:
1936             formats[:] = filter(
1937                 lambda f: self._is_valid_url(
1938                     f['url'], video_id,
1939                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1940                 formats)
1941
1942     @staticmethod
1943     def _remove_duplicate_formats(formats):
1944         format_urls = set()
1945         unique_formats = []
1946         for f in formats:
1947             if f['url'] not in format_urls:
1948                 format_urls.add(f['url'])
1949                 unique_formats.append(f)
1950         formats[:] = unique_formats
1951
1952     def _is_valid_url(self, url, video_id, item='video', headers={}):
1953         url = self._proto_relative_url(url, scheme='http:')
1954         # For now assume non HTTP(S) URLs always valid
1955         if not (url.startswith('http://') or url.startswith('https://')):
1956             return True
1957         try:
1958             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1959             return True
1960         except ExtractorError as e:
1961             self.to_screen(
1962                 '%s: %s URL is invalid, skipping: %s'
1963                 % (video_id, item, error_to_compat_str(e.cause)))
1964             return False
1965
1966     def http_scheme(self):
1967         """ Either "http:" or "https:", depending on the user's preferences """
1968         return (
1969             'http:'
1970             if self.get_param('prefer_insecure', False)
1971             else 'https:')
1972
1973     def _proto_relative_url(self, url, scheme=None):
1974         if url is None:
1975             return url
1976         if url.startswith('//'):
1977             if scheme is None:
1978                 scheme = self.http_scheme()
1979             return scheme + url
1980         else:
1981             return url
1982
1983     def _sleep(self, timeout, video_id, msg_template=None):
1984         if msg_template is None:
1985             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1986         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1987         self.to_screen(msg)
1988         time.sleep(timeout)
1989
1990     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1991                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1992                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1993         manifest = self._download_xml(
1994             manifest_url, video_id, 'Downloading f4m manifest',
1995             'Unable to download f4m manifest',
1996             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1997             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1998             transform_source=transform_source,
1999             fatal=fatal, data=data, headers=headers, query=query)
2000
2001         if manifest is False:
2002             return []
2003
2004         return self._parse_f4m_formats(
2005             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2006             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
2007
2008     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
2009                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
2010                            fatal=True, m3u8_id=None):
2011         if not isinstance(manifest, compat_etree_Element) and not fatal:
2012             return []
2013
2014         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
2015         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
2016         if akamai_pv is not None and ';' in akamai_pv.text:
2017             playerVerificationChallenge = akamai_pv.text.split(';')[0]
2018             if playerVerificationChallenge.strip() != '':
2019                 return []
2020
2021         formats = []
2022         manifest_version = '1.0'
2023         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
2024         if not media_nodes:
2025             manifest_version = '2.0'
2026             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
2027         # Remove unsupported DRM protected media from final formats
2028         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
2029         media_nodes = remove_encrypted_media(media_nodes)
2030         if not media_nodes:
2031             return formats
2032
2033         manifest_base_url = get_base_url(manifest)
2034
2035         bootstrap_info = xpath_element(
2036             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
2037             'bootstrap info', default=None)
2038
2039         vcodec = None
2040         mime_type = xpath_text(
2041             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
2042             'base URL', default=None)
2043         if mime_type and mime_type.startswith('audio/'):
2044             vcodec = 'none'
2045
2046         for i, media_el in enumerate(media_nodes):
2047             tbr = int_or_none(media_el.attrib.get('bitrate'))
2048             width = int_or_none(media_el.attrib.get('width'))
2049             height = int_or_none(media_el.attrib.get('height'))
2050             format_id = join_nonempty(f4m_id, tbr or i)
2051             # If <bootstrapInfo> is present, the specified f4m is a
2052             # stream-level manifest, and only set-level manifests may refer to
2053             # external resources.  See section 11.4 and section 4 of F4M spec
2054             if bootstrap_info is None:
2055                 media_url = None
2056                 # @href is introduced in 2.0, see section 11.6 of F4M spec
2057                 if manifest_version == '2.0':
2058                     media_url = media_el.attrib.get('href')
2059                 if media_url is None:
2060                     media_url = media_el.attrib.get('url')
2061                 if not media_url:
2062                     continue
2063                 manifest_url = (
2064                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
2065                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
2066                 # If media_url is itself a f4m manifest do the recursive extraction
2067                 # since bitrates in parent manifest (this one) and media_url manifest
2068                 # may differ leading to inability to resolve the format by requested
2069                 # bitrate in f4m downloader
2070                 ext = determine_ext(manifest_url)
2071                 if ext == 'f4m':
2072                     f4m_formats = self._extract_f4m_formats(
2073                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2074                         transform_source=transform_source, fatal=fatal)
2075                     # Sometimes stream-level manifest contains single media entry that
2076                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
2077                     # At the same time parent's media entry in set-level manifest may
2078                     # contain it. We will copy it from parent in such cases.
2079                     if len(f4m_formats) == 1:
2080                         f = f4m_formats[0]
2081                         f.update({
2082                             'tbr': f.get('tbr') or tbr,
2083                             'width': f.get('width') or width,
2084                             'height': f.get('height') or height,
2085                             'format_id': f.get('format_id') if not tbr else format_id,
2086                             'vcodec': vcodec,
2087                         })
2088                     formats.extend(f4m_formats)
2089                     continue
2090                 elif ext == 'm3u8':
2091                     formats.extend(self._extract_m3u8_formats(
2092                         manifest_url, video_id, 'mp4', preference=preference,
2093                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
2094                     continue
2095             formats.append({
2096                 'format_id': format_id,
2097                 'url': manifest_url,
2098                 'manifest_url': manifest_url,
2099                 'ext': 'flv' if bootstrap_info is not None else None,
2100                 'protocol': 'f4m',
2101                 'tbr': tbr,
2102                 'width': width,
2103                 'height': height,
2104                 'vcodec': vcodec,
2105                 'preference': preference,
2106                 'quality': quality,
2107             })
2108         return formats
2109
2110     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
2111         return {
2112             'format_id': join_nonempty(m3u8_id, 'meta'),
2113             'url': m3u8_url,
2114             'ext': ext,
2115             'protocol': 'm3u8',
2116             'preference': preference - 100 if preference else -100,
2117             'quality': quality,
2118             'resolution': 'multiple',
2119             'format_note': 'Quality selection URL',
2120         }
2121
2122     def _report_ignoring_subs(self, name):
2123         self.report_warning(bug_reports_message(
2124             f'Ignoring subtitle tracks found in the {name} manifest; '
2125             'if any subtitle tracks are missing,'
2126         ), only_once=True)
2127
2128     def _extract_m3u8_formats(self, *args, **kwargs):
2129         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2130         if subs:
2131             self._report_ignoring_subs('HLS')
2132         return fmts
2133
2134     def _extract_m3u8_formats_and_subtitles(
2135             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2136             preference=None, quality=None, m3u8_id=None, note=None,
2137             errnote=None, fatal=True, live=False, data=None, headers={},
2138             query={}):
2139
2140         res = self._download_webpage_handle(
2141             m3u8_url, video_id,
2142             note='Downloading m3u8 information' if note is None else note,
2143             errnote='Failed to download m3u8 information' if errnote is None else errnote,
2144             fatal=fatal, data=data, headers=headers, query=query)
2145
2146         if res is False:
2147             return [], {}
2148
2149         m3u8_doc, urlh = res
2150         m3u8_url = urlh.geturl()
2151
2152         return self._parse_m3u8_formats_and_subtitles(
2153             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2154             preference=preference, quality=quality, m3u8_id=m3u8_id,
2155             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2156             headers=headers, query=query, video_id=video_id)
2157
2158     def _parse_m3u8_formats_and_subtitles(
2159             self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
2160             preference=None, quality=None, m3u8_id=None, live=False, note=None,
2161             errnote=None, fatal=True, data=None, headers={}, query={},
2162             video_id=None):
2163         formats, subtitles = [], {}
2164
2165         has_drm = re.search('|'.join([
2166             r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
2167             r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
2168         ]), m3u8_doc)
2169
2170         def format_url(url):
2171             return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
2172
2173         if self.get_param('hls_split_discontinuity', False):
2174             def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2175                 if not m3u8_doc:
2176                     if not manifest_url:
2177                         return []
2178                     m3u8_doc = self._download_webpage(
2179                         manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2180                         note=False, errnote='Failed to download m3u8 playlist information')
2181                     if m3u8_doc is False:
2182                         return []
2183                 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2184
2185         else:
2186             def _extract_m3u8_playlist_indices(*args, **kwargs):
2187                 return [None]
2188
2189         # References:
2190         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2191         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2192         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2193
2194         # We should try extracting formats only from master playlists [1, 4.3.4],
2195         # i.e. playlists that describe available qualities. On the other hand
2196         # media playlists [1, 4.3.3] should be returned as is since they contain
2197         # just the media without qualities renditions.
2198         # Fortunately, master playlist can be easily distinguished from media
2199         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
2200         # master playlist tags MUST NOT appear in a media playlist and vice versa.
2201         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2202         # media playlist and MUST NOT appear in master playlist thus we can
2203         # clearly detect media playlist with this criterion.
2204
2205         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
2206             formats = [{
2207                 'format_id': join_nonempty(m3u8_id, idx),
2208                 'format_index': idx,
2209                 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
2210                 'ext': ext,
2211                 'protocol': entry_protocol,
2212                 'preference': preference,
2213                 'quality': quality,
2214                 'has_drm': has_drm,
2215             } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2216
2217             return formats, subtitles
2218
2219         groups = {}
2220         last_stream_inf = {}
2221
2222         def extract_media(x_media_line):
2223             media = parse_m3u8_attributes(x_media_line)
2224             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2225             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2226             if not (media_type and group_id and name):
2227                 return
2228             groups.setdefault(group_id, []).append(media)
2229             # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2230             if media_type == 'SUBTITLES':
2231                 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2232                 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2233                 # However, lack of URI has been spotted in the wild.
2234                 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2235                 if not media.get('URI'):
2236                     return
2237                 url = format_url(media['URI'])
2238                 sub_info = {
2239                     'url': url,
2240                     'ext': determine_ext(url),
2241                 }
2242                 if sub_info['ext'] == 'm3u8':
2243                     # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2244                     # files may contain is WebVTT:
2245                     # <https://tools.ietf.org/html/rfc8216#section-3.1>
2246                     sub_info['ext'] = 'vtt'
2247                     sub_info['protocol'] = 'm3u8_native'
2248                 lang = media.get('LANGUAGE') or 'und'
2249                 subtitles.setdefault(lang, []).append(sub_info)
2250             if media_type not in ('VIDEO', 'AUDIO'):
2251                 return
2252             media_url = media.get('URI')
2253             if media_url:
2254                 manifest_url = format_url(media_url)
2255                 formats.extend({
2256                     'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2257                     'format_note': name,
2258                     'format_index': idx,
2259                     'url': manifest_url,
2260                     'manifest_url': m3u8_url,
2261                     'language': media.get('LANGUAGE'),
2262                     'ext': ext,
2263                     'protocol': entry_protocol,
2264                     'preference': preference,
2265                     'quality': quality,
2266                     'vcodec': 'none' if media_type == 'AUDIO' else None,
2267                 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2268
2269         def build_stream_name():
2270             # Despite specification does not mention NAME attribute for
2271             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2272             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2273             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2274             stream_name = last_stream_inf.get('NAME')
2275             if stream_name:
2276                 return stream_name
2277             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2278             # from corresponding rendition group
2279             stream_group_id = last_stream_inf.get('VIDEO')
2280             if not stream_group_id:
2281                 return
2282             stream_group = groups.get(stream_group_id)
2283             if not stream_group:
2284                 return stream_group_id
2285             rendition = stream_group[0]
2286             return rendition.get('NAME') or stream_group_id
2287
2288         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2289         # chance to detect video only formats when EXT-X-STREAM-INF tags
2290         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2291         for line in m3u8_doc.splitlines():
2292             if line.startswith('#EXT-X-MEDIA:'):
2293                 extract_media(line)
2294
2295         for line in m3u8_doc.splitlines():
2296             if line.startswith('#EXT-X-STREAM-INF:'):
2297                 last_stream_inf = parse_m3u8_attributes(line)
2298             elif line.startswith('#') or not line.strip():
2299                 continue
2300             else:
2301                 tbr = float_or_none(
2302                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2303                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2304                 manifest_url = format_url(line.strip())
2305
2306                 for idx in _extract_m3u8_playlist_indices(manifest_url):
2307                     format_id = [m3u8_id, None, idx]
2308                     # Bandwidth of live streams may differ over time thus making
2309                     # format_id unpredictable. So it's better to keep provided
2310                     # format_id intact.
2311                     if not live:
2312                         stream_name = build_stream_name()
2313                         format_id[1] = stream_name or '%d' % (tbr or len(formats))
2314                     f = {
2315                         'format_id': join_nonempty(*format_id),
2316                         'format_index': idx,
2317                         'url': manifest_url,
2318                         'manifest_url': m3u8_url,
2319                         'tbr': tbr,
2320                         'ext': ext,
2321                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2322                         'protocol': entry_protocol,
2323                         'preference': preference,
2324                         'quality': quality,
2325                     }
2326                     resolution = last_stream_inf.get('RESOLUTION')
2327                     if resolution:
2328                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2329                         if mobj:
2330                             f['width'] = int(mobj.group('width'))
2331                             f['height'] = int(mobj.group('height'))
2332                     # Unified Streaming Platform
2333                     mobj = re.search(
2334                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2335                     if mobj:
2336                         abr, vbr = mobj.groups()
2337                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2338                         f.update({
2339                             'vbr': vbr,
2340                             'abr': abr,
2341                         })
2342                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2343                     f.update(codecs)
2344                     audio_group_id = last_stream_inf.get('AUDIO')
2345                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2346                     # references a rendition group MUST have a CODECS attribute.
2347                     # However, this is not always respected, for example, [2]
2348                     # contains EXT-X-STREAM-INF tag which references AUDIO
2349                     # rendition group but does not have CODECS and despite
2350                     # referencing an audio group it represents a complete
2351                     # (with audio and video) format. So, for such cases we will
2352                     # ignore references to rendition groups and treat them
2353                     # as complete formats.
2354                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2355                         audio_group = groups.get(audio_group_id)
2356                         if audio_group and audio_group[0].get('URI'):
2357                             # TODO: update acodec for audio only formats with
2358                             # the same GROUP-ID
2359                             f['acodec'] = 'none'
2360                     if not f.get('ext'):
2361                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2362                     formats.append(f)
2363
2364                     # for DailyMotion
2365                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2366                     if progressive_uri:
2367                         http_f = f.copy()
2368                         del http_f['manifest_url']
2369                         http_f.update({
2370                             'format_id': f['format_id'].replace('hls-', 'http-'),
2371                             'protocol': 'http',
2372                             'url': progressive_uri,
2373                         })
2374                         formats.append(http_f)
2375
2376                 last_stream_inf = {}
2377         return formats, subtitles
2378
2379     def _extract_m3u8_vod_duration(
2380             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2381
2382         m3u8_vod = self._download_webpage(
2383             m3u8_vod_url, video_id,
2384             note='Downloading m3u8 VOD manifest' if note is None else note,
2385             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2386             fatal=False, data=data, headers=headers, query=query)
2387
2388         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2389
2390     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2391         if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2392             return None
2393
2394         return int(sum(
2395             float(line[len('#EXTINF:'):].split(',')[0])
2396             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2397
2398     @staticmethod
2399     def _xpath_ns(path, namespace=None):
2400         if not namespace:
2401             return path
2402         out = []
2403         for c in path.split('/'):
2404             if not c or c == '.':
2405                 out.append(c)
2406             else:
2407                 out.append('{%s}%s' % (namespace, c))
2408         return '/'.join(out)
2409
2410     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2411         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2412
2413         if smil is False:
2414             assert not fatal
2415             return [], {}
2416
2417         namespace = self._parse_smil_namespace(smil)
2418
2419         fmts = self._parse_smil_formats(
2420             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2421         subs = self._parse_smil_subtitles(
2422             smil, namespace=namespace)
2423
2424         return fmts, subs
2425
2426     def _extract_smil_formats(self, *args, **kwargs):
2427         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2428         if subs:
2429             self._report_ignoring_subs('SMIL')
2430         return fmts
2431
2432     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2433         smil = self._download_smil(smil_url, video_id, fatal=fatal)
2434         if smil is False:
2435             return {}
2436         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2437
2438     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2439         return self._download_xml(
2440             smil_url, video_id, 'Downloading SMIL file',
2441             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2442
2443     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2444         namespace = self._parse_smil_namespace(smil)
2445
2446         formats = self._parse_smil_formats(
2447             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2448         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2449
2450         video_id = os.path.splitext(url_basename(smil_url))[0]
2451         title = None
2452         description = None
2453         upload_date = None
2454         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2455             name = meta.attrib.get('name')
2456             content = meta.attrib.get('content')
2457             if not name or not content:
2458                 continue
2459             if not title and name == 'title':
2460                 title = content
2461             elif not description and name in ('description', 'abstract'):
2462                 description = content
2463             elif not upload_date and name == 'date':
2464                 upload_date = unified_strdate(content)
2465
2466         thumbnails = [{
2467             'id': image.get('type'),
2468             'url': image.get('src'),
2469             'width': int_or_none(image.get('width')),
2470             'height': int_or_none(image.get('height')),
2471         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2472
2473         return {
2474             'id': video_id,
2475             'title': title or video_id,
2476             'description': description,
2477             'upload_date': upload_date,
2478             'thumbnails': thumbnails,
2479             'formats': formats,
2480             'subtitles': subtitles,
2481         }
2482
2483     def _parse_smil_namespace(self, smil):
2484         return self._search_regex(
2485             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2486
2487     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2488         base = smil_url
2489         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2490             b = meta.get('base') or meta.get('httpBase')
2491             if b:
2492                 base = b
2493                 break
2494
2495         formats = []
2496         rtmp_count = 0
2497         http_count = 0
2498         m3u8_count = 0
2499         imgs_count = 0
2500
2501         srcs = set()
2502         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2503         for medium in media:
2504             src = medium.get('src')
2505             if not src or src in srcs:
2506                 continue
2507             srcs.add(src)
2508
2509             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2510             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2511             width = int_or_none(medium.get('width'))
2512             height = int_or_none(medium.get('height'))
2513             proto = medium.get('proto')
2514             ext = medium.get('ext')
2515             src_ext = determine_ext(src)
2516             streamer = medium.get('streamer') or base
2517
2518             if proto == 'rtmp' or streamer.startswith('rtmp'):
2519                 rtmp_count += 1
2520                 formats.append({
2521                     'url': streamer,
2522                     'play_path': src,
2523                     'ext': 'flv',
2524                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2525                     'tbr': bitrate,
2526                     'filesize': filesize,
2527                     'width': width,
2528                     'height': height,
2529                 })
2530                 if transform_rtmp_url:
2531                     streamer, src = transform_rtmp_url(streamer, src)
2532                     formats[-1].update({
2533                         'url': streamer,
2534                         'play_path': src,
2535                     })
2536                 continue
2537
2538             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2539             src_url = src_url.strip()
2540
2541             if proto == 'm3u8' or src_ext == 'm3u8':
2542                 m3u8_formats = self._extract_m3u8_formats(
2543                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2544                 if len(m3u8_formats) == 1:
2545                     m3u8_count += 1
2546                     m3u8_formats[0].update({
2547                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2548                         'tbr': bitrate,
2549                         'width': width,
2550                         'height': height,
2551                     })
2552                 formats.extend(m3u8_formats)
2553             elif src_ext == 'f4m':
2554                 f4m_url = src_url
2555                 if not f4m_params:
2556                     f4m_params = {
2557                         'hdcore': '3.2.0',
2558                         'plugin': 'flowplayer-3.2.0.1',
2559                     }
2560                 f4m_url += '&' if '?' in f4m_url else '?'
2561                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2562                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2563             elif src_ext == 'mpd':
2564                 formats.extend(self._extract_mpd_formats(
2565                     src_url, video_id, mpd_id='dash', fatal=False))
2566             elif re.search(r'\.ism/[Mm]anifest', src_url):
2567                 formats.extend(self._extract_ism_formats(
2568                     src_url, video_id, ism_id='mss', fatal=False))
2569             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2570                 http_count += 1
2571                 formats.append({
2572                     'url': src_url,
2573                     'ext': ext or src_ext or 'flv',
2574                     'format_id': 'http-%d' % (bitrate or http_count),
2575                     'tbr': bitrate,
2576                     'filesize': filesize,
2577                     'width': width,
2578                     'height': height,
2579                 })
2580
2581         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2582             src = medium.get('src')
2583             if not src or src in srcs:
2584                 continue
2585             srcs.add(src)
2586
2587             imgs_count += 1
2588             formats.append({
2589                 'format_id': 'imagestream-%d' % (imgs_count),
2590                 'url': src,
2591                 'ext': mimetype2ext(medium.get('type')),
2592                 'acodec': 'none',
2593                 'vcodec': 'none',
2594                 'width': int_or_none(medium.get('width')),
2595                 'height': int_or_none(medium.get('height')),
2596                 'format_note': 'SMIL storyboards',
2597             })
2598
2599         return formats
2600
2601     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2602         urls = []
2603         subtitles = {}
2604         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2605             src = textstream.get('src')
2606             if not src or src in urls:
2607                 continue
2608             urls.append(src)
2609             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2610             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2611             subtitles.setdefault(lang, []).append({
2612                 'url': src,
2613                 'ext': ext,
2614             })
2615         return subtitles
2616
2617     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2618         xspf = self._download_xml(
2619             xspf_url, playlist_id, 'Downloading xpsf playlist',
2620             'Unable to download xspf manifest', fatal=fatal)
2621         if xspf is False:
2622             return []
2623         return self._parse_xspf(
2624             xspf, playlist_id, xspf_url=xspf_url,
2625             xspf_base_url=base_url(xspf_url))
2626
2627     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2628         NS_MAP = {
2629             'xspf': 'http://xspf.org/ns/0/',
2630             's1': 'http://static.streamone.nl/player/ns/0',
2631         }
2632
2633         entries = []
2634         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2635             title = xpath_text(
2636                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2637             description = xpath_text(
2638                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2639             thumbnail = xpath_text(
2640                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2641             duration = float_or_none(
2642                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2643
2644             formats = []
2645             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2646                 format_url = urljoin(xspf_base_url, location.text)
2647                 if not format_url:
2648                     continue
2649                 formats.append({
2650                     'url': format_url,
2651                     'manifest_url': xspf_url,
2652                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2653                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2654                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2655                 })
2656             self._sort_formats(formats)
2657
2658             entries.append({
2659                 'id': playlist_id,
2660                 'title': title,
2661                 'description': description,
2662                 'thumbnail': thumbnail,
2663                 'duration': duration,
2664                 'formats': formats,
2665             })
2666         return entries
2667
2668     def _extract_mpd_formats(self, *args, **kwargs):
2669         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2670         if subs:
2671             self._report_ignoring_subs('DASH')
2672         return fmts
2673
2674     def _extract_mpd_formats_and_subtitles(
2675             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2676             fatal=True, data=None, headers={}, query={}):
2677         res = self._download_xml_handle(
2678             mpd_url, video_id,
2679             note='Downloading MPD manifest' if note is None else note,
2680             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2681             fatal=fatal, data=data, headers=headers, query=query)
2682         if res is False:
2683             return [], {}
2684         mpd_doc, urlh = res
2685         if mpd_doc is None:
2686             return [], {}
2687         mpd_base_url = base_url(urlh.geturl())
2688
2689         return self._parse_mpd_formats_and_subtitles(
2690             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2691
2692     def _parse_mpd_formats(self, *args, **kwargs):
2693         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2694         if subs:
2695             self._report_ignoring_subs('DASH')
2696         return fmts
2697
2698     def _parse_mpd_formats_and_subtitles(
2699             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2700         """
2701         Parse formats from MPD manifest.
2702         References:
2703          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2704             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2705          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2706         """
2707         if not self.get_param('dynamic_mpd', True):
2708             if mpd_doc.get('type') == 'dynamic':
2709                 return [], {}
2710
2711         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2712
2713         def _add_ns(path):
2714             return self._xpath_ns(path, namespace)
2715
2716         def is_drm_protected(element):
2717             return element.find(_add_ns('ContentProtection')) is not None
2718
2719         def extract_multisegment_info(element, ms_parent_info):
2720             ms_info = ms_parent_info.copy()
2721
2722             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2723             # common attributes and elements.  We will only extract relevant
2724             # for us.
2725             def extract_common(source):
2726                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2727                 if segment_timeline is not None:
2728                     s_e = segment_timeline.findall(_add_ns('S'))
2729                     if s_e:
2730                         ms_info['total_number'] = 0
2731                         ms_info['s'] = []
2732                         for s in s_e:
2733                             r = int(s.get('r', 0))
2734                             ms_info['total_number'] += 1 + r
2735                             ms_info['s'].append({
2736                                 't': int(s.get('t', 0)),
2737                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2738                                 'd': int(s.attrib['d']),
2739                                 'r': r,
2740                             })
2741                 start_number = source.get('startNumber')
2742                 if start_number:
2743                     ms_info['start_number'] = int(start_number)
2744                 timescale = source.get('timescale')
2745                 if timescale:
2746                     ms_info['timescale'] = int(timescale)
2747                 segment_duration = source.get('duration')
2748                 if segment_duration:
2749                     ms_info['segment_duration'] = float(segment_duration)
2750
2751             def extract_Initialization(source):
2752                 initialization = source.find(_add_ns('Initialization'))
2753                 if initialization is not None:
2754                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2755
2756             segment_list = element.find(_add_ns('SegmentList'))
2757             if segment_list is not None:
2758                 extract_common(segment_list)
2759                 extract_Initialization(segment_list)
2760                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2761                 if segment_urls_e:
2762                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2763             else:
2764                 segment_template = element.find(_add_ns('SegmentTemplate'))
2765                 if segment_template is not None:
2766                     extract_common(segment_template)
2767                     media = segment_template.get('media')
2768                     if media:
2769                         ms_info['media'] = media
2770                     initialization = segment_template.get('initialization')
2771                     if initialization:
2772                         ms_info['initialization'] = initialization
2773                     else:
2774                         extract_Initialization(segment_template)
2775             return ms_info
2776
2777         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2778         formats, subtitles = [], {}
2779         stream_numbers = collections.defaultdict(int)
2780         for period in mpd_doc.findall(_add_ns('Period')):
2781             period_duration = parse_duration(period.get('duration')) or mpd_duration
2782             period_ms_info = extract_multisegment_info(period, {
2783                 'start_number': 1,
2784                 'timescale': 1,
2785             })
2786             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2787                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2788                 for representation in adaptation_set.findall(_add_ns('Representation')):
2789                     representation_attrib = adaptation_set.attrib.copy()
2790                     representation_attrib.update(representation.attrib)
2791                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2792                     mime_type = representation_attrib['mimeType']
2793                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2794
2795                     codecs = parse_codecs(representation_attrib.get('codecs', ''))
2796                     if content_type not in ('video', 'audio', 'text'):
2797                         if mime_type == 'image/jpeg':
2798                             content_type = mime_type
2799                         elif codecs['vcodec'] != 'none':
2800                             content_type = 'video'
2801                         elif codecs['acodec'] != 'none':
2802                             content_type = 'audio'
2803                         elif codecs.get('tcodec', 'none') != 'none':
2804                             content_type = 'text'
2805                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2806                             content_type = 'text'
2807                         else:
2808                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2809                             continue
2810
2811                     base_url = ''
2812                     for element in (representation, adaptation_set, period, mpd_doc):
2813                         base_url_e = element.find(_add_ns('BaseURL'))
2814                         if base_url_e is not None:
2815                             base_url = base_url_e.text + base_url
2816                             if re.match(r'^https?://', base_url):
2817                                 break
2818                     if mpd_base_url and base_url.startswith('/'):
2819                         base_url = compat_urlparse.urljoin(mpd_base_url, base_url)
2820                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2821                         if not mpd_base_url.endswith('/'):
2822                             mpd_base_url += '/'
2823                         base_url = mpd_base_url + base_url
2824                     representation_id = representation_attrib.get('id')
2825                     lang = representation_attrib.get('lang')
2826                     url_el = representation.find(_add_ns('BaseURL'))
2827                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2828                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2829                     if representation_id is not None:
2830                         format_id = representation_id
2831                     else:
2832                         format_id = content_type
2833                     if mpd_id:
2834                         format_id = mpd_id + '-' + format_id
2835                     if content_type in ('video', 'audio'):
2836                         f = {
2837                             'format_id': format_id,
2838                             'manifest_url': mpd_url,
2839                             'ext': mimetype2ext(mime_type),
2840                             'width': int_or_none(representation_attrib.get('width')),
2841                             'height': int_or_none(representation_attrib.get('height')),
2842                             'tbr': float_or_none(bandwidth, 1000),
2843                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2844                             'fps': int_or_none(representation_attrib.get('frameRate')),
2845                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2846                             'format_note': 'DASH %s' % content_type,
2847                             'filesize': filesize,
2848                             'container': mimetype2ext(mime_type) + '_dash',
2849                             **codecs
2850                         }
2851                     elif content_type == 'text':
2852                         f = {
2853                             'ext': mimetype2ext(mime_type),
2854                             'manifest_url': mpd_url,
2855                             'filesize': filesize,
2856                         }
2857                     elif content_type == 'image/jpeg':
2858                         # See test case in VikiIE
2859                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2860                         f = {
2861                             'format_id': format_id,
2862                             'ext': 'mhtml',
2863                             'manifest_url': mpd_url,
2864                             'format_note': 'DASH storyboards (jpeg)',
2865                             'acodec': 'none',
2866                             'vcodec': 'none',
2867                         }
2868                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2869                         f['has_drm'] = True
2870                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2871
2872                     def prepare_template(template_name, identifiers):
2873                         tmpl = representation_ms_info[template_name]
2874                         # First of, % characters outside $...$ templates
2875                         # must be escaped by doubling for proper processing
2876                         # by % operator string formatting used further (see
2877                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2878                         t = ''
2879                         in_template = False
2880                         for c in tmpl:
2881                             t += c
2882                             if c == '$':
2883                                 in_template = not in_template
2884                             elif c == '%' and not in_template:
2885                                 t += c
2886                         # Next, $...$ templates are translated to their
2887                         # %(...) counterparts to be used with % operator
2888                         if representation_id is not None:
2889                             t = t.replace('$RepresentationID$', representation_id)
2890                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2891                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2892                         t.replace('$$', '$')
2893                         return t
2894
2895                     # @initialization is a regular template like @media one
2896                     # so it should be handled just the same way (see
2897                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2898                     if 'initialization' in representation_ms_info:
2899                         initialization_template = prepare_template(
2900                             'initialization',
2901                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2902                             # $Time$ shall not be included for @initialization thus
2903                             # only $Bandwidth$ remains
2904                             ('Bandwidth', ))
2905                         representation_ms_info['initialization_url'] = initialization_template % {
2906                             'Bandwidth': bandwidth,
2907                         }
2908
2909                     def location_key(location):
2910                         return 'url' if re.match(r'^https?://', location) else 'path'
2911
2912                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2913
2914                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2915                         media_location_key = location_key(media_template)
2916
2917                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2918                         # can't be used at the same time
2919                         if '%(Number' in media_template and 's' not in representation_ms_info:
2920                             segment_duration = None
2921                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2922                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2923                                 representation_ms_info['total_number'] = int(math.ceil(
2924                                     float_or_none(period_duration, segment_duration, default=0)))
2925                             representation_ms_info['fragments'] = [{
2926                                 media_location_key: media_template % {
2927                                     'Number': segment_number,
2928                                     'Bandwidth': bandwidth,
2929                                 },
2930                                 'duration': segment_duration,
2931                             } for segment_number in range(
2932                                 representation_ms_info['start_number'],
2933                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2934                         else:
2935                             # $Number*$ or $Time$ in media template with S list available
2936                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2937                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2938                             representation_ms_info['fragments'] = []
2939                             segment_time = 0
2940                             segment_d = None
2941                             segment_number = representation_ms_info['start_number']
2942
2943                             def add_segment_url():
2944                                 segment_url = media_template % {
2945                                     'Time': segment_time,
2946                                     'Bandwidth': bandwidth,
2947                                     'Number': segment_number,
2948                                 }
2949                                 representation_ms_info['fragments'].append({
2950                                     media_location_key: segment_url,
2951                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2952                                 })
2953
2954                             for num, s in enumerate(representation_ms_info['s']):
2955                                 segment_time = s.get('t') or segment_time
2956                                 segment_d = s['d']
2957                                 add_segment_url()
2958                                 segment_number += 1
2959                                 for r in range(s.get('r', 0)):
2960                                     segment_time += segment_d
2961                                     add_segment_url()
2962                                     segment_number += 1
2963                                 segment_time += segment_d
2964                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2965                         # No media template
2966                         # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2967                         # or any YouTube dashsegments video
2968                         fragments = []
2969                         segment_index = 0
2970                         timescale = representation_ms_info['timescale']
2971                         for s in representation_ms_info['s']:
2972                             duration = float_or_none(s['d'], timescale)
2973                             for r in range(s.get('r', 0) + 1):
2974                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
2975                                 fragments.append({
2976                                     location_key(segment_uri): segment_uri,
2977                                     'duration': duration,
2978                                 })
2979                                 segment_index += 1
2980                         representation_ms_info['fragments'] = fragments
2981                     elif 'segment_urls' in representation_ms_info:
2982                         # Segment URLs with no SegmentTimeline
2983                         # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2984                         # https://github.com/ytdl-org/youtube-dl/pull/14844
2985                         fragments = []
2986                         segment_duration = float_or_none(
2987                             representation_ms_info['segment_duration'],
2988                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2989                         for segment_url in representation_ms_info['segment_urls']:
2990                             fragment = {
2991                                 location_key(segment_url): segment_url,
2992                             }
2993                             if segment_duration:
2994                                 fragment['duration'] = segment_duration
2995                             fragments.append(fragment)
2996                         representation_ms_info['fragments'] = fragments
2997                     # If there is a fragments key available then we correctly recognized fragmented media.
2998                     # Otherwise we will assume unfragmented media with direct access. Technically, such
2999                     # assumption is not necessarily correct since we may simply have no support for
3000                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
3001                     if 'fragments' in representation_ms_info:
3002                         f.update({
3003                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
3004                             'url': mpd_url or base_url,
3005                             'fragment_base_url': base_url,
3006                             'fragments': [],
3007                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
3008                         })
3009                         if 'initialization_url' in representation_ms_info:
3010                             initialization_url = representation_ms_info['initialization_url']
3011                             if not f.get('url'):
3012                                 f['url'] = initialization_url
3013                             f['fragments'].append({location_key(initialization_url): initialization_url})
3014                         f['fragments'].extend(representation_ms_info['fragments'])
3015                         if not period_duration:
3016                             period_duration = try_get(
3017                                 representation_ms_info,
3018                                 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
3019                     else:
3020                         # Assuming direct URL to unfragmented media.
3021                         f['url'] = base_url
3022                     if content_type in ('video', 'audio', 'image/jpeg'):
3023                         f['manifest_stream_number'] = stream_numbers[f['url']]
3024                         stream_numbers[f['url']] += 1
3025                         formats.append(f)
3026                     elif content_type == 'text':
3027                         subtitles.setdefault(lang or 'und', []).append(f)
3028
3029         return formats, subtitles
3030
3031     def _extract_ism_formats(self, *args, **kwargs):
3032         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
3033         if subs:
3034             self._report_ignoring_subs('ISM')
3035         return fmts
3036
3037     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
3038         res = self._download_xml_handle(
3039             ism_url, video_id,
3040             note='Downloading ISM manifest' if note is None else note,
3041             errnote='Failed to download ISM manifest' if errnote is None else errnote,
3042             fatal=fatal, data=data, headers=headers, query=query)
3043         if res is False:
3044             return [], {}
3045         ism_doc, urlh = res
3046         if ism_doc is None:
3047             return [], {}
3048
3049         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
3050
3051     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
3052         """
3053         Parse formats from ISM manifest.
3054         References:
3055          1. [MS-SSTR]: Smooth Streaming Protocol,
3056             https://msdn.microsoft.com/en-us/library/ff469518.aspx
3057         """
3058         if ism_doc.get('IsLive') == 'TRUE':
3059             return [], {}
3060
3061         duration = int(ism_doc.attrib['Duration'])
3062         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
3063
3064         formats = []
3065         subtitles = {}
3066         for stream in ism_doc.findall('StreamIndex'):
3067             stream_type = stream.get('Type')
3068             if stream_type not in ('video', 'audio', 'text'):
3069                 continue
3070             url_pattern = stream.attrib['Url']
3071             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
3072             stream_name = stream.get('Name')
3073             stream_language = stream.get('Language', 'und')
3074             for track in stream.findall('QualityLevel'):
3075                 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
3076                 # TODO: add support for WVC1 and WMAP
3077                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
3078                     self.report_warning('%s is not a supported codec' % fourcc)
3079                     continue
3080                 tbr = int(track.attrib['Bitrate']) // 1000
3081                 # [1] does not mention Width and Height attributes. However,
3082                 # they're often present while MaxWidth and MaxHeight are
3083                 # missing, so should be used as fallbacks
3084                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3085                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3086                 sampling_rate = int_or_none(track.get('SamplingRate'))
3087
3088                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3089                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
3090
3091                 fragments = []
3092                 fragment_ctx = {
3093                     'time': 0,
3094                 }
3095                 stream_fragments = stream.findall('c')
3096                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3097                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3098                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3099                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3100                     if not fragment_ctx['duration']:
3101                         try:
3102                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
3103                         except IndexError:
3104                             next_fragment_time = duration
3105                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3106                     for _ in range(fragment_repeat):
3107                         fragments.append({
3108                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
3109                             'duration': fragment_ctx['duration'] / stream_timescale,
3110                         })
3111                         fragment_ctx['time'] += fragment_ctx['duration']
3112
3113                 if stream_type == 'text':
3114                     subtitles.setdefault(stream_language, []).append({
3115                         'ext': 'ismt',
3116                         'protocol': 'ism',
3117                         'url': ism_url,
3118                         'manifest_url': ism_url,
3119                         'fragments': fragments,
3120                         '_download_params': {
3121                             'stream_type': stream_type,
3122                             'duration': duration,
3123                             'timescale': stream_timescale,
3124                             'fourcc': fourcc,
3125                             'language': stream_language,
3126                             'codec_private_data': track.get('CodecPrivateData'),
3127                         }
3128                     })
3129                 elif stream_type in ('video', 'audio'):
3130                     formats.append({
3131                         'format_id': join_nonempty(ism_id, stream_name, tbr),
3132                         'url': ism_url,
3133                         'manifest_url': ism_url,
3134                         'ext': 'ismv' if stream_type == 'video' else 'isma',
3135                         'width': width,
3136                         'height': height,
3137                         'tbr': tbr,
3138                         'asr': sampling_rate,
3139                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
3140                         'acodec': 'none' if stream_type == 'video' else fourcc,
3141                         'protocol': 'ism',
3142                         'fragments': fragments,
3143                         'has_drm': ism_doc.find('Protection') is not None,
3144                         '_download_params': {
3145                             'stream_type': stream_type,
3146                             'duration': duration,
3147                             'timescale': stream_timescale,
3148                             'width': width or 0,
3149                             'height': height or 0,
3150                             'fourcc': fourcc,
3151                             'language': stream_language,
3152                             'codec_private_data': track.get('CodecPrivateData'),
3153                             'sampling_rate': sampling_rate,
3154                             'channels': int_or_none(track.get('Channels', 2)),
3155                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3156                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3157                         },
3158                     })
3159         return formats, subtitles
3160
3161     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
3162         def absolute_url(item_url):
3163             return urljoin(base_url, item_url)
3164
3165         def parse_content_type(content_type):
3166             if not content_type:
3167                 return {}
3168             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3169             if ctr:
3170                 mimetype, codecs = ctr.groups()
3171                 f = parse_codecs(codecs)
3172                 f['ext'] = mimetype2ext(mimetype)
3173                 return f
3174             return {}
3175
3176         def _media_formats(src, cur_media_type, type_info={}):
3177             full_url = absolute_url(src)
3178             ext = type_info.get('ext') or determine_ext(full_url)
3179             if ext == 'm3u8':
3180                 is_plain_url = False
3181                 formats = self._extract_m3u8_formats(
3182                     full_url, video_id, ext='mp4',
3183                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3184                     preference=preference, quality=quality, fatal=False)
3185             elif ext == 'mpd':
3186                 is_plain_url = False
3187                 formats = self._extract_mpd_formats(
3188                     full_url, video_id, mpd_id=mpd_id, fatal=False)
3189             else:
3190                 is_plain_url = True
3191                 formats = [{
3192                     'url': full_url,
3193                     'vcodec': 'none' if cur_media_type == 'audio' else None,
3194                 }]
3195             return is_plain_url, formats
3196
3197         entries = []
3198         # amp-video and amp-audio are very similar to their HTML5 counterparts
3199         # so we wll include them right here (see
3200         # https://www.ampproject.org/docs/reference/components/amp-video)
3201         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3202         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3203         media_tags = [(media_tag, media_tag_name, media_type, '')
3204                       for media_tag, media_tag_name, media_type
3205                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3206         media_tags.extend(re.findall(
3207             # We only allow video|audio followed by a whitespace or '>'.
3208             # Allowing more characters may end up in significant slow down (see
3209             # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
3210             # http://www.porntrex.com/maps/videositemap.xml).
3211             r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3212         for media_tag, _, media_type, media_content in media_tags:
3213             media_info = {
3214                 'formats': [],
3215                 'subtitles': {},
3216             }
3217             media_attributes = extract_attributes(media_tag)
3218             src = strip_or_none(media_attributes.get('src'))
3219             if src:
3220                 _, formats = _media_formats(src, media_type)
3221                 media_info['formats'].extend(formats)
3222             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3223             if media_content:
3224                 for source_tag in re.findall(r'<source[^>]+>', media_content):
3225                     s_attr = extract_attributes(source_tag)
3226                     # data-video-src and data-src are non standard but seen
3227                     # several times in the wild
3228                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
3229                     if not src:
3230                         continue
3231                     f = parse_content_type(s_attr.get('type'))
3232                     is_plain_url, formats = _media_formats(src, media_type, f)
3233                     if is_plain_url:
3234                         # width, height, res, label and title attributes are
3235                         # all not standard but seen several times in the wild
3236                         labels = [
3237                             s_attr.get(lbl)
3238                             for lbl in ('label', 'title')
3239                             if str_or_none(s_attr.get(lbl))
3240                         ]
3241                         width = int_or_none(s_attr.get('width'))
3242                         height = (int_or_none(s_attr.get('height'))
3243                                   or int_or_none(s_attr.get('res')))
3244                         if not width or not height:
3245                             for lbl in labels:
3246                                 resolution = parse_resolution(lbl)
3247                                 if not resolution:
3248                                     continue
3249                                 width = width or resolution.get('width')
3250                                 height = height or resolution.get('height')
3251                         for lbl in labels:
3252                             tbr = parse_bitrate(lbl)
3253                             if tbr:
3254                                 break
3255                         else:
3256                             tbr = None
3257                         f.update({
3258                             'width': width,
3259                             'height': height,
3260                             'tbr': tbr,
3261                             'format_id': s_attr.get('label') or s_attr.get('title'),
3262                         })
3263                         f.update(formats[0])
3264                         media_info['formats'].append(f)
3265                     else:
3266                         media_info['formats'].extend(formats)
3267                 for track_tag in re.findall(r'<track[^>]+>', media_content):
3268                     track_attributes = extract_attributes(track_tag)
3269                     kind = track_attributes.get('kind')
3270                     if not kind or kind in ('subtitles', 'captions'):
3271                         src = strip_or_none(track_attributes.get('src'))
3272                         if not src:
3273                             continue
3274                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3275                         media_info['subtitles'].setdefault(lang, []).append({
3276                             'url': absolute_url(src),
3277                         })
3278             for f in media_info['formats']:
3279                 f.setdefault('http_headers', {})['Referer'] = base_url
3280             if media_info['formats'] or media_info['subtitles']:
3281                 entries.append(media_info)
3282         return entries
3283
3284     def _extract_akamai_formats(self, *args, **kwargs):
3285         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3286         if subs:
3287             self._report_ignoring_subs('akamai')
3288         return fmts
3289
3290     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3291         signed = 'hdnea=' in manifest_url
3292         if not signed:
3293             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3294             manifest_url = re.sub(
3295                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3296                 '', manifest_url).strip('?')
3297
3298         formats = []
3299         subtitles = {}
3300
3301         hdcore_sign = 'hdcore=3.7.0'
3302         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3303         hds_host = hosts.get('hds')
3304         if hds_host:
3305             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3306         if 'hdcore=' not in f4m_url:
3307             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3308         f4m_formats = self._extract_f4m_formats(
3309             f4m_url, video_id, f4m_id='hds', fatal=False)
3310         for entry in f4m_formats:
3311             entry.update({'extra_param_to_segment_url': hdcore_sign})
3312         formats.extend(f4m_formats)
3313
3314         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3315         hls_host = hosts.get('hls')
3316         if hls_host:
3317             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3318         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3319             m3u8_url, video_id, 'mp4', 'm3u8_native',
3320             m3u8_id='hls', fatal=False)
3321         formats.extend(m3u8_formats)
3322         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3323
3324         http_host = hosts.get('http')
3325         if http_host and m3u8_formats and not signed:
3326             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3327             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3328             qualities_length = len(qualities)
3329             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3330                 i = 0
3331                 for f in m3u8_formats:
3332                     if f['vcodec'] != 'none':
3333                         for protocol in ('http', 'https'):
3334                             http_f = f.copy()
3335                             del http_f['manifest_url']
3336                             http_url = re.sub(
3337                                 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
3338                             http_f.update({
3339                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3340                                 'url': http_url,
3341                                 'protocol': protocol,
3342                             })
3343                             formats.append(http_f)
3344                         i += 1
3345
3346         return formats, subtitles
3347
3348     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3349         query = compat_urlparse.urlparse(url).query
3350         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3351         mobj = re.search(
3352             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3353         url_base = mobj.group('url')
3354         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3355         formats = []
3356
3357         def manifest_url(manifest):
3358             m_url = '%s/%s' % (http_base_url, manifest)
3359             if query:
3360                 m_url += '?%s' % query
3361             return m_url
3362
3363         if 'm3u8' not in skip_protocols:
3364             formats.extend(self._extract_m3u8_formats(
3365                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3366                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3367         if 'f4m' not in skip_protocols:
3368             formats.extend(self._extract_f4m_formats(
3369                 manifest_url('manifest.f4m'),
3370                 video_id, f4m_id='hds', fatal=False))
3371         if 'dash' not in skip_protocols:
3372             formats.extend(self._extract_mpd_formats(
3373                 manifest_url('manifest.mpd'),
3374                 video_id, mpd_id='dash', fatal=False))
3375         if re.search(r'(?:/smil:|\.smil)', url_base):
3376             if 'smil' not in skip_protocols:
3377                 rtmp_formats = self._extract_smil_formats(
3378                     manifest_url('jwplayer.smil'),
3379                     video_id, fatal=False)
3380                 for rtmp_format in rtmp_formats:
3381                     rtsp_format = rtmp_format.copy()
3382                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3383                     del rtsp_format['play_path']
3384                     del rtsp_format['ext']
3385                     rtsp_format.update({
3386                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3387                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3388                         'protocol': 'rtsp',
3389                     })
3390                     formats.extend([rtmp_format, rtsp_format])
3391         else:
3392             for protocol in ('rtmp', 'rtsp'):
3393                 if protocol not in skip_protocols:
3394                     formats.append({
3395                         'url': '%s:%s' % (protocol, url_base),
3396                         'format_id': protocol,
3397                         'protocol': protocol,
3398                     })
3399         return formats
3400
3401     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3402         mobj = re.search(
3403             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3404             webpage)
3405         if mobj:
3406             try:
3407                 jwplayer_data = self._parse_json(mobj.group('options'),
3408                                                  video_id=video_id,
3409                                                  transform_source=transform_source)
3410             except ExtractorError:
3411                 pass
3412             else:
3413                 if isinstance(jwplayer_data, dict):
3414                     return jwplayer_data
3415
3416     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3417         jwplayer_data = self._find_jwplayer_data(
3418             webpage, video_id, transform_source=js_to_json)
3419         return self._parse_jwplayer_data(
3420             jwplayer_data, video_id, *args, **kwargs)
3421
3422     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3423                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3424         # JWPlayer backward compatibility: flattened playlists
3425         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3426         if 'playlist' not in jwplayer_data:
3427             jwplayer_data = {'playlist': [jwplayer_data]}
3428
3429         entries = []
3430
3431         # JWPlayer backward compatibility: single playlist item
3432         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3433         if not isinstance(jwplayer_data['playlist'], list):
3434             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3435
3436         for video_data in jwplayer_data['playlist']:
3437             # JWPlayer backward compatibility: flattened sources
3438             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3439             if 'sources' not in video_data:
3440                 video_data['sources'] = [video_data]
3441
3442             this_video_id = video_id or video_data['mediaid']
3443
3444             formats = self._parse_jwplayer_formats(
3445                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3446                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3447
3448             subtitles = {}
3449             tracks = video_data.get('tracks')
3450             if tracks and isinstance(tracks, list):
3451                 for track in tracks:
3452                     if not isinstance(track, dict):
3453                         continue
3454                     track_kind = track.get('kind')
3455                     if not track_kind or not isinstance(track_kind, compat_str):
3456                         continue
3457                     if track_kind.lower() not in ('captions', 'subtitles'):
3458                         continue
3459                     track_url = urljoin(base_url, track.get('file'))
3460                     if not track_url:
3461                         continue
3462                     subtitles.setdefault(track.get('label') or 'en', []).append({
3463                         'url': self._proto_relative_url(track_url)
3464                     })
3465
3466             entry = {
3467                 'id': this_video_id,
3468                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3469                 'description': clean_html(video_data.get('description')),
3470                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3471                 'timestamp': int_or_none(video_data.get('pubdate')),
3472                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3473                 'subtitles': subtitles,
3474             }
3475             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3476             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3477                 entry.update({
3478                     '_type': 'url_transparent',
3479                     'url': formats[0]['url'],
3480                 })
3481             else:
3482                 self._sort_formats(formats)
3483                 entry['formats'] = formats
3484             entries.append(entry)
3485         if len(entries) == 1:
3486             return entries[0]
3487         else:
3488             return self.playlist_result(entries)
3489
3490     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3491                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3492         urls = []
3493         formats = []
3494         for source in jwplayer_sources_data:
3495             if not isinstance(source, dict):
3496                 continue
3497             source_url = urljoin(
3498                 base_url, self._proto_relative_url(source.get('file')))
3499             if not source_url or source_url in urls:
3500                 continue
3501             urls.append(source_url)
3502             source_type = source.get('type') or ''
3503             ext = mimetype2ext(source_type) or determine_ext(source_url)
3504             if source_type == 'hls' or ext == 'm3u8':
3505                 formats.extend(self._extract_m3u8_formats(
3506                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3507                     m3u8_id=m3u8_id, fatal=False))
3508             elif source_type == 'dash' or ext == 'mpd':
3509                 formats.extend(self._extract_mpd_formats(
3510                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3511             elif ext == 'smil':
3512                 formats.extend(self._extract_smil_formats(
3513                     source_url, video_id, fatal=False))
3514             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3515             elif source_type.startswith('audio') or ext in (
3516                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3517                 formats.append({
3518                     'url': source_url,
3519                     'vcodec': 'none',
3520                     'ext': ext,
3521                 })
3522             else:
3523                 height = int_or_none(source.get('height'))
3524                 if height is None:
3525                     # Often no height is provided but there is a label in
3526                     # format like "1080p", "720p SD", or 1080.
3527                     height = int_or_none(self._search_regex(
3528                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3529                         'height', default=None))
3530                 a_format = {
3531                     'url': source_url,
3532                     'width': int_or_none(source.get('width')),
3533                     'height': height,
3534                     'tbr': int_or_none(source.get('bitrate')),
3535                     'ext': ext,
3536                 }
3537                 if source_url.startswith('rtmp'):
3538                     a_format['ext'] = 'flv'
3539                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3540                     # of jwplayer.flash.swf
3541                     rtmp_url_parts = re.split(
3542                         r'((?:mp4|mp3|flv):)', source_url, 1)
3543                     if len(rtmp_url_parts) == 3:
3544                         rtmp_url, prefix, play_path = rtmp_url_parts
3545                         a_format.update({
3546                             'url': rtmp_url,
3547                             'play_path': prefix + play_path,
3548                         })
3549                     if rtmp_params:
3550                         a_format.update(rtmp_params)
3551                 formats.append(a_format)
3552         return formats
3553
3554     def _live_title(self, name):
3555         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3556         return name
3557
3558     def _int(self, v, name, fatal=False, **kwargs):
3559         res = int_or_none(v, **kwargs)
3560         if res is None:
3561             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3562             if fatal:
3563                 raise ExtractorError(msg)
3564             else:
3565                 self.report_warning(msg)
3566         return res
3567
3568     def _float(self, v, name, fatal=False, **kwargs):
3569         res = float_or_none(v, **kwargs)
3570         if res is None:
3571             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3572             if fatal:
3573                 raise ExtractorError(msg)
3574             else:
3575                 self.report_warning(msg)
3576         return res
3577
3578     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3579                     path='/', secure=False, discard=False, rest={}, **kwargs):
3580         cookie = compat_cookiejar_Cookie(
3581             0, name, value, port, port is not None, domain, True,
3582             domain.startswith('.'), path, True, secure, expire_time,
3583             discard, None, None, rest)
3584         self._downloader.cookiejar.set_cookie(cookie)
3585
3586     def _get_cookies(self, url):
3587         """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3588         req = sanitized_Request(url)
3589         self._downloader.cookiejar.add_cookie_header(req)
3590         return compat_cookies_SimpleCookie(req.get_header('Cookie'))
3591
3592     def _apply_first_set_cookie_header(self, url_handle, cookie):
3593         """
3594         Apply first Set-Cookie header instead of the last. Experimental.
3595
3596         Some sites (e.g. [1-3]) may serve two cookies under the same name
3597         in Set-Cookie header and expect the first (old) one to be set rather
3598         than second (new). However, as of RFC6265 the newer one cookie
3599         should be set into cookie store what actually happens.
3600         We will workaround this issue by resetting the cookie to
3601         the first one manually.
3602         1. https://new.vk.com/
3603         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3604         3. https://learning.oreilly.com/
3605         """
3606         for header, cookies in url_handle.headers.items():
3607             if header.lower() != 'set-cookie':
3608                 continue
3609             if sys.version_info[0] >= 3:
3610                 cookies = cookies.encode('iso-8859-1')
3611             cookies = cookies.decode('utf-8')
3612             cookie_value = re.search(
3613                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3614             if cookie_value:
3615                 value, domain = cookie_value.groups()
3616                 self._set_cookie(domain, cookie, value)
3617                 break
3618
3619     def get_testcases(self, include_onlymatching=False):
3620         t = getattr(self, '_TEST', None)
3621         if t:
3622             assert not hasattr(self, '_TESTS'), \
3623                 '%s has _TEST and _TESTS' % type(self).__name__
3624             tests = [t]
3625         else:
3626             tests = getattr(self, '_TESTS', [])
3627         for t in tests:
3628             if not include_onlymatching and t.get('only_matching', False):
3629                 continue
3630             t['name'] = type(self).__name__[:-len('IE')]
3631             yield t
3632
3633     def is_suitable(self, age_limit):
3634         """ Test whether the extractor is generally suitable for the given
3635         age limit (i.e. pornographic sites are not, all others usually are) """
3636
3637         any_restricted = False
3638         for tc in self.get_testcases(include_onlymatching=False):
3639             if tc.get('playlist', []):
3640                 tc = tc['playlist'][0]
3641             is_restricted = age_restricted(
3642                 tc.get('info_dict', {}).get('age_limit'), age_limit)
3643             if not is_restricted:
3644                 return True
3645             any_restricted = any_restricted or is_restricted
3646         return not any_restricted
3647
3648     def extract_subtitles(self, *args, **kwargs):
3649         if (self.get_param('writesubtitles', False)
3650                 or self.get_param('listsubtitles')):
3651             return self._get_subtitles(*args, **kwargs)
3652         return {}
3653
3654     def _get_subtitles(self, *args, **kwargs):
3655         raise NotImplementedError('This method must be implemented by subclasses')
3656
3657     def extract_comments(self, *args, **kwargs):
3658         if not self.get_param('getcomments'):
3659             return None
3660         generator = self._get_comments(*args, **kwargs)
3661
3662         def extractor():
3663             comments = []
3664             interrupted = True
3665             try:
3666                 while True:
3667                     comments.append(next(generator))
3668             except StopIteration:
3669                 interrupted = False
3670             except KeyboardInterrupt:
3671                 self.to_screen('Interrupted by user')
3672             except Exception as e:
3673                 if self.get_param('ignoreerrors') is not True:
3674                     raise
3675                 self._downloader.report_error(e)
3676             comment_count = len(comments)
3677             self.to_screen(f'Extracted {comment_count} comments')
3678             return {
3679                 'comments': comments,
3680                 'comment_count': None if interrupted else comment_count
3681             }
3682         return extractor
3683
3684     def _get_comments(self, *args, **kwargs):
3685         raise NotImplementedError('This method must be implemented by subclasses')
3686
3687     @staticmethod
3688     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3689         """ Merge subtitle items for one language. Items with duplicated URLs/data
3690         will be dropped. """
3691         list1_data = set((item.get('url'), item.get('data')) for item in subtitle_list1)
3692         ret = list(subtitle_list1)
3693         ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3694         return ret
3695
3696     @classmethod
3697     def _merge_subtitles(cls, *dicts, target=None):
3698         """ Merge subtitle dictionaries, language by language. """
3699         if target is None:
3700             target = {}
3701         for d in dicts:
3702             for lang, subs in d.items():
3703                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3704         return target
3705
3706     def extract_automatic_captions(self, *args, **kwargs):
3707         if (self.get_param('writeautomaticsub', False)
3708                 or self.get_param('listsubtitles')):
3709             return self._get_automatic_captions(*args, **kwargs)
3710         return {}
3711
3712     def _get_automatic_captions(self, *args, **kwargs):
3713         raise NotImplementedError('This method must be implemented by subclasses')
3714
3715     def mark_watched(self, *args, **kwargs):
3716         if not self.get_param('mark_watched', False):
3717             return
3718         if (self.supports_login() and self._get_login_info()[0] is not None
3719                 or self.get_param('cookiefile') or self.get_param('cookiesfrombrowser')):
3720             self._mark_watched(*args, **kwargs)
3721
3722     def _mark_watched(self, *args, **kwargs):
3723         raise NotImplementedError('This method must be implemented by subclasses')
3724
3725     def geo_verification_headers(self):
3726         headers = {}
3727         geo_verification_proxy = self.get_param('geo_verification_proxy')
3728         if geo_verification_proxy:
3729             headers['Ytdl-request-proxy'] = geo_verification_proxy
3730         return headers
3731
3732     def _generic_id(self, url):
3733         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3734
3735     def _generic_title(self, url):
3736         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3737
3738     @staticmethod
3739     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3740         all_known = all(map(
3741             lambda x: x is not None,
3742             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3743         return (
3744             'private' if is_private
3745             else 'premium_only' if needs_premium
3746             else 'subscriber_only' if needs_subscription
3747             else 'needs_auth' if needs_auth
3748             else 'unlisted' if is_unlisted
3749             else 'public' if all_known
3750             else None)
3751
3752     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3753         '''
3754         @returns            A list of values for the extractor argument given by "key"
3755                             or "default" if no such key is present
3756         @param default      The default value to return when the key is not present (default: [])
3757         @param casesense    When false, the values are converted to lower case
3758         '''
3759         val = traverse_obj(
3760             self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key))
3761         if val is None:
3762             return [] if default is NO_DEFAULT else default
3763         return list(val) if casesense else [x.lower() for x in val]
3764
3765     def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3766         if not playlist_id or not video_id:
3767             return not video_id
3768
3769         no_playlist = (smuggled_data or {}).get('force_noplaylist')
3770         if no_playlist is not None:
3771             return not no_playlist
3772
3773         video_id = '' if video_id is True else f' {video_id}'
3774         playlist_id = '' if playlist_id is True else f' {playlist_id}'
3775         if self.get_param('noplaylist'):
3776             self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3777             return False
3778         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3779         return True
3780
3781
3782 class SearchInfoExtractor(InfoExtractor):
3783     """
3784     Base class for paged search queries extractors.
3785     They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
3786     Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
3787     """
3788
3789     _MAX_RESULTS = float('inf')
3790
3791     @classmethod
3792     def _make_valid_url(cls):
3793         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3794
3795     def _real_extract(self, query):
3796         prefix, query = self._match_valid_url(query).group('prefix', 'query')
3797         if prefix == '':
3798             return self._get_n_results(query, 1)
3799         elif prefix == 'all':
3800             return self._get_n_results(query, self._MAX_RESULTS)
3801         else:
3802             n = int(prefix)
3803             if n <= 0:
3804                 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
3805             elif n > self._MAX_RESULTS:
3806                 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3807                 n = self._MAX_RESULTS
3808             return self._get_n_results(query, n)
3809
3810     def _get_n_results(self, query, n):
3811         """Get a specified number of results for a query.
3812         Either this function or _search_results must be overridden by subclasses """
3813         return self.playlist_result(
3814             itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
3815             query, query)
3816
3817     def _search_results(self, query):
3818         """Returns an iterator of search results"""
3819         raise NotImplementedError('This method must be implemented by subclasses')
3820
3821     @property
3822     def SEARCH_KEY(self):
3823         return self._SEARCH_KEY