yt_dlp/extractor/common.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import base64
   5 import collections
   6 import hashlib
   7 import itertools
   8 import json
   9 import netrc
  10 import os
  11 import random
  12 import re
  13 import sys
  14 import time
  15 import math
  16
  17 from ..compat import (
  18     compat_cookiejar_Cookie,
  19     compat_cookies_SimpleCookie,
  20     compat_etree_Element,
  21     compat_etree_fromstring,
  22     compat_expanduser,
  23     compat_getpass,
  24     compat_http_client,
  25     compat_os_name,
  26     compat_str,
  27     compat_urllib_error,
  28     compat_urllib_parse_unquote,
  29     compat_urllib_parse_urlencode,
  30     compat_urllib_request,
  31     compat_urlparse,
  32     compat_xml_parse_error,
  33 )
  34 from ..downloader import FileDownloader
  35 from ..downloader.f4m import (
  36     get_base_url,
  37     remove_encrypted_media,
  38 )
  39 from ..utils import (
  40     age_restricted,
  41     base_url,
  42     bug_reports_message,
  43     clean_html,
  44     compiled_regex_type,
  45     determine_ext,
  46     determine_protocol,
  47     dict_get,
  48     encode_data_uri,
  49     error_to_compat_str,
  50     extract_attributes,
  51     ExtractorError,
  52     fix_xml_ampersands,
  53     float_or_none,
  54     format_field,
  55     GeoRestrictedError,
  56     GeoUtils,
  57     int_or_none,
  58     join_nonempty,
  59     js_to_json,
  60     JSON_LD_RE,
  61     mimetype2ext,
  62     network_exceptions,
  63     NO_DEFAULT,
  64     orderedSet,
  65     parse_bitrate,
  66     parse_codecs,
  67     parse_duration,
  68     parse_iso8601,
  69     parse_m3u8_attributes,
  70     parse_resolution,
  71     RegexNotFoundError,
  72     sanitize_filename,
  73     sanitized_Request,
  74     str_or_none,
  75     str_to_int,
  76     strip_or_none,
  77     traverse_obj,
  78     try_get,
  79     unescapeHTML,
  80     UnsupportedError,
  81     unified_strdate,
  82     unified_timestamp,
  83     update_Request,
  84     update_url_query,
  85     url_basename,
  86     url_or_none,
  87     urljoin,
  88     variadic,
  89     xpath_element,
  90     xpath_text,
  91     xpath_with_ns,
  92 )
  93
  94
  95 class InfoExtractor(object):
  96     """Information Extractor class.
  97
  98     Information extractors are the classes that, given a URL, extract
  99     information about the video (or videos) the URL refers to. This
 100     information includes the real video URL, the video title, author and
 101     others. The information is stored in a dictionary which is then
 102     passed to the YoutubeDL. The YoutubeDL processes this
 103     information possibly downloading the video to the file system, among
 104     other possible outcomes.
 105
 106     The type field determines the type of the result.
 107     By far the most common value (and the default if _type is missing) is
 108     "video", which indicates a single video.
 109
 110     For a video, the dictionaries must include the following fields:
 111
 112     id:             Video identifier.
 113     title:          Video title, unescaped.
 114
 115     Additionally, it must contain either a formats entry or a url one:
 116
 117     formats:        A list of dictionaries for each format available, ordered
 118                     from worst to best quality.
 119
 120                     Potential fields:
 121                     * url        The mandatory URL representing the media:
 122                                    for plain file media - HTTP URL of this file,
 123                                    for RTMP - RTMP URL,
 124                                    for HLS - URL of the M3U8 media playlist,
 125                                    for HDS - URL of the F4M manifest,
 126                                    for DASH
 127                                      - HTTP URL to plain file media (in case of
 128                                        unfragmented media)
 129                                      - URL of the MPD manifest or base URL
 130                                        representing the media if MPD manifest
 131                                        is parsed from a string (in case of
 132                                        fragmented media)
 133                                    for MSS - URL of the ISM manifest.
 134                     * manifest_url
 135                                  The URL of the manifest file in case of
 136                                  fragmented media:
 137                                    for HLS - URL of the M3U8 master playlist,
 138                                    for HDS - URL of the F4M manifest,
 139                                    for DASH - URL of the MPD manifest,
 140                                    for MSS - URL of the ISM manifest.
 141                     * ext        Will be calculated from URL if missing
 142                     * format     A human-readable description of the format
 143                                  ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
 145                                  and format_note fields if missing.
 146                     * format_id  A short description of the format
 147                                  ("mp4_h264_opus" or "19").
 148                                 Technically optional, but strongly recommended.
 149                     * format_note Additional info about the format
 150                                  ("3D" or "DASH video")
 151                     * width      Width of the video, if known
 152                     * height     Height of the video, if known
 153                     * resolution Textual description of width and height
 154                     * dynamic_range The dynamic range of the video. One of:
                                 "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
 156                     * tbr        Average bitrate of audio and video in KBit/s
 157                     * abr        Average audio bitrate in KBit/s
 158                     * acodec     Name of the audio codec in use
 159                     * asr        Audio sampling rate in Hertz
 160                     * vbr        Average video bitrate in KBit/s
 161                     * fps        Frame rate
 162                     * vcodec     Name of the video codec in use
 163                     * container  Name of the container format
 164                     * filesize   The number of bytes, if known in advance
 165                     * filesize_approx  An estimate for the number of bytes
 166                     * player_url SWF Player URL (used for rtmpdump).
 167                     * protocol   The protocol that will be used for the actual
 168                                  download, lower-case. One of "http", "https" or
 169                                  one of the protocols defined in downloader.PROTOCOL_MAP
 170                     * fragment_base_url
 171                                  Base URL for fragments. Each fragment's path
 172                                  value (if present) will be relative to
 173                                  this URL.
 174                     * fragments  A list of fragments of a fragmented media.
 175                                  Each fragment entry must contain either an url
 176                                  or a path. If an url is present it should be
 177                                  considered by a client. Otherwise both path and
 178                                  fragment_base_url must be present. Here is
 179                                  the list of all potential fields:
 180                                  * "url" - fragment's URL
 181                                  * "path" - fragment's path relative to
 182                                             fragment_base_url
 183                                  * "duration" (optional, int or float)
 184                                  * "filesize" (optional, int)
 185                     * is_from_start  Is a live format that can be downloaded
 186                                 from the start. Boolean
 187                     * preference Order number of this format. If this field is
 188                                  present and not None, the formats get sorted
 189                                  by this field, regardless of all other values.
 190                                  -1 for default (order by other properties),
 191                                  -2 or smaller for less than default.
 192                                  < -1000 to hide the format (if there is
 193                                     another one which is strictly better)
 194                     * language   Language code, e.g. "de" or "en-US".
 195                     * language_preference  Is this in the language mentioned in
 196                                  the URL?
 197                                  10 if it's what the URL is about,
 198                                  -1 for default (don't know),
 199                                  -10 otherwise, other values reserved for now.
 200                     * quality    Order number of the video quality of this
 201                                  format, irrespective of the file format.
 202                                  -1 for default (order by other properties),
 203                                  -2 or smaller for less than default.
 204                     * source_preference  Order number for this video source
 205                                   (quality takes higher priority)
 206                                  -1 for default (order by other properties),
 207                                  -2 or smaller for less than default.
 208                     * http_headers  A dictionary of additional HTTP headers
 209                                  to add to the request.
 210                     * stretched_ratio  If given and not 1, indicates that the
 211                                  video's pixels are not square.
 212                                  width : height ratio as float.
 213                     * no_resume  The server does not support resuming the
 214                                  (HTTP or RTMP) download. Boolean.
 215                     * has_drm    The format has DRM and cannot be downloaded. Boolean
 216                     * downloader_options  A dictionary of downloader options as
 217                                  described in FileDownloader
 218                     RTMP formats can also have the additional fields: page_url,
 219                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 220                     rtmp_protocol, rtmp_real_time
 221
 222     url:            Final video URL.
 223     ext:            Video filename extension.
 224     format:         The video format, defaults to ext (used for --get-format)
 225     player_url:     SWF Player URL (used for rtmpdump).
 226
 227     The following fields are optional:
 228
 229     direct:         True if a direct video file was given (must only be set by GenericIE)
 230     alt_title:      A secondary title of the video.
    display_id:     An alternative identifier for the video, not necessarily
 232                     unique, but available before title. Typically, id is
 233                     something like "4234987", title "Dancing naked mole rats",
 234                     and display_id "dancing-naked-mole-rats"
 235     thumbnails:     A list of dictionaries, with the following entries:
 236                         * "id" (optional, string) - Thumbnail format ID
 237                         * "url"
 238                         * "preference" (optional, int) - quality of the image
 239                         * "width" (optional, int)
 240                         * "height" (optional, int)
 241                         * "resolution" (optional, string "{width}x{height}",
 242                                         deprecated)
 243                         * "filesize" (optional, int)
 244                         * "http_headers" (dict) - HTTP headers for the request
 245     thumbnail:      Full URL to a video thumbnail image.
 246     description:    Full video description.
 247     uploader:       Full name of the video uploader.
 248     license:        License name the video is licensed under.
 249     creator:        The creator of the video.
 250     timestamp:      UNIX timestamp of the moment the video was uploaded
 251     upload_date:    Video upload date (YYYYMMDD).
 252                     If not explicitly set, calculated from timestamp
 253     release_timestamp: UNIX timestamp of the moment the video was released.
 254                     If it is not clear whether to use timestamp or this, use the former
 255     release_date:   The date (YYYYMMDD) when the video was released.
 256                     If not explicitly set, calculated from release_timestamp
 257     modified_timestamp: UNIX timestamp of the moment the video was last modified.
 258     modified_date:   The date (YYYYMMDD) when the video was last modified.
 259                     If not explicitly set, calculated from modified_timestamp
 260     uploader_id:    Nickname or id of the video uploader.
 261     uploader_url:   Full URL to a personal webpage of the video uploader.
 262     channel:        Full name of the channel the video is uploaded on.
 263                     Note that channel fields may or may not repeat uploader
 264                     fields. This depends on a particular extractor.
 265     channel_id:     Id of the channel.
 266     channel_url:    Full URL to a channel webpage.
 267     channel_follower_count: Number of followers of the channel.
 268     location:       Physical location where the video was filmed.
 269     subtitles:      The available subtitles as a dictionary in the format
 270                     {tag: subformats}. "tag" is usually a language code, and
 271                     "subformats" is a list sorted from lower to higher
 272                     preference, each element is a dictionary with the "ext"
 273                     entry and one of:
 274                         * "data": The subtitles file contents
 275                         * "url": A URL pointing to the subtitles file
 276                     It can optionally also have:
 277                         * "name": Name or description of the subtitles
 278                         * "http_headers": A dictionary of additional HTTP headers
 279                                   to add to the request.
 280                     "ext" will be calculated from URL if missing
 281     automatic_captions: Like 'subtitles'; contains automatically generated
 282                     captions instead of normal subtitles
 283     duration:       Length of the video in seconds, as an integer or float.
 284     view_count:     How many users have watched the video on the platform.
 285     like_count:     Number of positive ratings of the video
 286     dislike_count:  Number of negative ratings of the video
 287     repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
 289     comment_count:  Number of comments on the video
 290     comments:       A list of comments, each with one or more of the following
 291                     properties (all but one of text or html optional):
 292                         * "author" - human-readable name of the comment author
 293                         * "author_id" - user ID of the comment author
 294                         * "author_thumbnail" - The thumbnail of the comment author
 295                         * "id" - Comment ID
 296                         * "html" - Comment as HTML
 297                         * "text" - Plain text of the comment
 298                         * "timestamp" - UNIX timestamp of comment
 299                         * "parent" - ID of the comment this one is replying to.
 300                                      Set to "root" to indicate that this is a
 301                                      comment to the original video.
 302                         * "like_count" - Number of positive ratings of the comment
 303                         * "dislike_count" - Number of negative ratings of the comment
 304                         * "is_favorited" - Whether the comment is marked as
 305                                            favorite by the video uploader
 306                         * "author_is_uploader" - Whether the comment is made by
 307                                                  the video uploader
 308     age_limit:      Age restriction for the video, as an integer (years)
 309     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 310                     should allow to get the same result again. (It will be set
 311                     by YoutubeDL if it's missing)
 312     categories:     A list of categories that the video falls in, for example
 313                     ["Sports", "Berlin"]
 314     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 315     cast:           A list of the video cast
 316     is_live:        True, False, or None (=unknown). Whether this video is a
 317                     live stream that goes on instead of a fixed-length video.
 318     was_live:       True, False, or None (=unknown). Whether this video was
 319                     originally a live stream.
 320     live_status:    'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
 321                     If absent, automatically set from is_live, was_live
 322     start_time:     Time in seconds where the reproduction should start, as
 323                     specified in the URL.
 324     end_time:       Time in seconds where the reproduction should end, as
 325                     specified in the URL.
 326     chapters:       A list of dictionaries, with the following entries:
 327                         * "start_time" - The start time of the chapter in seconds
 328                         * "end_time" - The end time of the chapter in seconds
 329                         * "title" (optional, string)
 330     playable_in_embed: Whether this video is allowed to play in embedded
 331                     players on other sites. Can be True (=always allowed),
 332                     False (=never allowed), None (=unknown), or a string
 333                     specifying the criteria for embedability (Eg: 'whitelist')
 334     availability:   Under what condition the video is available. One of
 335                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 336                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 337                     to set it
 338     __post_extractor: A function to be called just before the metadata is
 339                     written to either disk, logger or console. The function
 340                     must return a dict which will be added to the info_dict.
                    This is useful for additional information that is
 342                     time-consuming to extract. Note that the fields thus
 343                     extracted will not be available to output template and
 344                     match_filter. So, only "comments" and "comment_count" are
 345                     currently allowed to be extracted via this method.
 346
 347     The following fields should only be used when the video belongs to some logical
 348     chapter or section:
 349
 350     chapter:        Name or title of the chapter the video belongs to.
 351     chapter_number: Number of the chapter the video belongs to, as an integer.
 352     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 353
 354     The following fields should only be used when the video is an episode of some
 355     series, programme or podcast:
 356
 357     series:         Title of the series or programme the video episode belongs to.
 358     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 359     season:         Title of the season the video episode belongs to.
 360     season_number:  Number of the season the video episode belongs to, as an integer.
 361     season_id:      Id of the season the video episode belongs to, as a unicode string.
 362     episode:        Title of the video episode. Unlike mandatory video title field,
 363                     this field should denote the exact title of the video episode
 364                     without any kind of decoration.
 365     episode_number: Number of the video episode within a season, as an integer.
 366     episode_id:     Id of the video episode, as a unicode string.
 367
 368     The following fields should only be used when the media is a track or a part of
 369     a music album:
 370
 371     track:          Title of the track.
 372     track_number:   Number of the track within an album or a disc, as an integer.
 373     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 374                     as a unicode string.
 375     artist:         Artist(s) of the track.
 376     genre:          Genre(s) of the track.
 377     album:          Title of the album the track belongs to.
 378     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
    album_artist:   List of all artists that appeared on the album (e.g.
 380                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 381                     and compilations).
 382     disc_number:    Number of the disc or other physical medium the track belongs to,
 383                     as an integer.
 384     release_year:   Year (YYYY) when the album was released.
 385     composer:       Composer of the piece
 386
 387     Unless mentioned otherwise, the fields should be Unicode strings.
 388
 389     Unless mentioned otherwise, None is equivalent to absence of information.
 390
 391
 392     _type "playlist" indicates multiple videos.
 393     There must be a key "entries", which is a list, an iterable, or a PagedList
 394     object, each element of which is a valid dictionary by this specification.
 395
    Additionally, playlists can have "id", "title", and any other relevant
 397     attributes with the same semantics as videos (see above).
 398
 399     It can also have the following optional fields:
 400
 401     playlist_count: The total number of videos in a playlist. If not given,
 402                     YoutubeDL tries to calculate it from "entries"
 403
 404
 405     _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
 407     It must have an entries key like a playlist and contain all the keys
 408     required for a video at the same time.
 409
 410
 411     _type "url" indicates that the video must be extracted from another
 412     location, possibly by a different extractor. Its only required key is:
 413     "url" - the next URL to extract.
 414     The key "ie_key" can be set to the class name (minus the trailing "IE",
 415     e.g. "Youtube") if the extractor class is known in advance.
 416     Additionally, the dictionary may have any properties of the resolved entity
 417     known in advance, for example "title" if the title of the referred video is
 418     known ahead of time.
 419
 420
 421     _type "url_transparent" entities have the same specification as "url", but
 422     indicate that the given additional information is more precise than the one
 423     associated with the resolved URL.
 424     This is useful when a site employs a video service that hosts the video and
 425     its technical metadata, but that video service does not embed a useful
 426     title, description etc.
 427
 428
    Subclasses of this should define a _VALID_URL regexp and re-define the
 430     _real_extract() and (optionally) _real_initialize() methods.
 431     Probably, they should also be added to the list of extractors.
 432
 433     Subclasses may also override suitable() if necessary, but ensure the function
 434     signature is preserved and that this function imports everything it needs
 435     (except other extractors), so that lazy_extractors works correctly.
 436
 437     To support username + password (or netrc) login, the extractor must define a
 438     _NETRC_MACHINE and re-define _perform_login(username, password) and
 439     (optionally) _initialize_pre_login() methods. The _perform_login method will
 440     be called between _initialize_pre_login and _real_initialize if credentials
 441     are passed by the user. In cases where it is necessary to have the login
 442     process as part of the extraction rather than initialization, _perform_login
 443     can be left undefined.
 444
 445     _GEO_BYPASS attribute may be set to False in order to disable
 446     geo restriction bypass mechanisms for a particular extractor.
 447     Though it won't disable explicit geo restriction bypass based on
 448     country code provided with geo_bypass_country.
 449
 450     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 451     countries for this extractor. One of these countries will be used by
 452     geo restriction bypass mechanism right away in order to bypass
 453     geo restriction, of course, if the mechanism is not disabled.
 454
 455     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 456     IP blocks in CIDR notation for this extractor. One of these IP blocks
 457     will be used by geo restriction bypass mechanism similarly
 458     to _GEO_COUNTRIES.
 459
 460     The _WORKING attribute should be set to False for broken IEs
 461     in order to warn the users and skip the tests.
 462     """
 463
    # Set to True by initialize() once the one-time setup has been run
    _ready = False
    # Downloader object used by the extractor (its write_debug() is called by
    # _initialize_geo_bypass()); presumably assigned by set_downloader()
    _downloader = None
    # Fake IP picked by _initialize_geo_bypass(); None until one is chosen
    _x_forwarded_for_ip = None
    # Geo restriction bypass configuration, described in the class docstring
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    # Set to False for broken extractors (see the class docstring)
    _WORKING = True
    # A truthy value makes supports_login() return True
    _NETRC_MACHINE = None
    # NB: initialize() skips its "login is not supported" warning when either
    # IE_DESC or _NETRC_MACHINE is False
    IE_DESC = None

    # Hints telling the user how credentials can be provided. The "cookies"
    # entry is appended to the warning issued by initialize()
    _LOGIN_HINTS = {
        'any': 'Use --cookies, --cookies-from-browser, --username and --password, or --netrc to provide account credentials',
        'cookies': (
            'Use --cookies-from-browser or --cookies for the authentication. '
            'See  https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl  for how to manually pass cookies'),
        'password': 'Use --username and --password, or --netrc to provide account credentials',
    }
 481
 482     def __init__(self, downloader=None):
 483         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 484         If a downloader is not passed during initialization,
 485         it must be set using "set_downloader()" before "extract()" is called"""
 486         self._ready = False
 487         self._x_forwarded_for_ip = None
 488         self._printed_messages = set()
 489         self.set_downloader(downloader)
 490
 491     @classmethod
 492     def _match_valid_url(cls, url):
 493         # This does not use has/getattr intentionally - we want to know whether
 494         # we have cached the regexp for *this* class, whereas getattr would also
 495         # match the superclass
 496         if '_VALID_URL_RE' not in cls.__dict__:
 497             if '_VALID_URL' not in cls.__dict__:
 498                 cls._VALID_URL = cls._make_valid_url()
 499             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 500         return cls._VALID_URL_RE.match(url)
 501
 502     @classmethod
 503     def suitable(cls, url):
 504         """Receives a URL and returns True if suitable for this IE."""
 505         # This function must import everything it needs (except other extractors),
 506         # so that lazy_extractors works correctly
 507         return cls._match_valid_url(url) is not None
 508
 509     @classmethod
 510     def _match_id(cls, url):
 511         return cls._match_valid_url(url).group('id')
 512
 513     @classmethod
 514     def get_temp_id(cls, url):
 515         try:
 516             return cls._match_id(url)
 517         except (IndexError, AttributeError):
 518             return None
 519
 520     @classmethod
 521     def working(cls):
 522         """Getter method for _WORKING."""
 523         return cls._WORKING
 524
 525     @classmethod
 526     def supports_login(cls):
 527         return bool(cls._NETRC_MACHINE)
 528
 529     def initialize(self):
 530         """Initializes an instance (authentication, etc)."""
 531         self._printed_messages = set()
 532         self._initialize_geo_bypass({
 533             'countries': self._GEO_COUNTRIES,
 534             'ip_blocks': self._GEO_IP_BLOCKS,
 535         })
 536         if not self._ready:
 537             self._initialize_pre_login()
 538             if self.supports_login():
 539                 username, password = self._get_login_info()
 540                 if username:
 541                     self._perform_login(username, password)
 542             elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
 543                 self.report_warning(f'Login with password is not supported for this website. {self._LOGIN_HINTS["cookies"]}')
 544             self._real_initialize()
 545             self._ready = True
 546
 547     def _initialize_geo_bypass(self, geo_bypass_context):
 548         """
 549         Initialize geo restriction bypass mechanism.
 550
 551         This method is used to initialize geo bypass mechanism based on faking
 552         X-Forwarded-For HTTP header. A random country from provided country list
 553         is selected and a random IP belonging to this country is generated. This
 554         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 555         HTTP requests.
 556
 557         This method will be used for initial geo bypass mechanism initialization
 558         during the instance initialization with _GEO_COUNTRIES and
 559         _GEO_IP_BLOCKS.
 560
 561         You may also manually call it from extractor's code if geo bypass
 562         information is not available beforehand (e.g. obtained during
 563         extraction) or due to some other reason. In this case you should pass
 564         this information in geo bypass context passed as first argument. It may
 565         contain following fields:
 566
 567         countries:  List of geo unrestricted countries (similar
 568                     to _GEO_COUNTRIES)
 569         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 570                     (similar to _GEO_IP_BLOCKS)
 571
 572         """
 573         if not self._x_forwarded_for_ip:
 574
 575             # Geo bypass mechanism is explicitly disabled by user
 576             if not self.get_param('geo_bypass', True):
 577                 return
 578
 579             if not geo_bypass_context:
 580                 geo_bypass_context = {}
 581
 582             # Backward compatibility: previously _initialize_geo_bypass
 583             # expected a list of countries, some 3rd party code may still use
 584             # it this way
 585             if isinstance(geo_bypass_context, (list, tuple)):
 586                 geo_bypass_context = {
 587                     'countries': geo_bypass_context,
 588                 }
 589
 590             # The whole point of geo bypass mechanism is to fake IP
 591             # as X-Forwarded-For HTTP header based on some IP block or
 592             # country code.
 593
 594             # Path 1: bypassing based on IP block in CIDR notation
 595
 596             # Explicit IP block specified by user, use it right away
 597             # regardless of whether extractor is geo bypassable or not
 598             ip_block = self.get_param('geo_bypass_ip_block', None)
 599
 600             # Otherwise use random IP block from geo bypass context but only
 601             # if extractor is known as geo bypassable
 602             if not ip_block:
 603                 ip_blocks = geo_bypass_context.get('ip_blocks')
 604                 if self._GEO_BYPASS and ip_blocks:
 605                     ip_block = random.choice(ip_blocks)
 606
 607             if ip_block:
 608                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 609                 self._downloader.write_debug(
 610                     '[debug] Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
 611                 return
 612
 613             # Path 2: bypassing based on country code
 614
 615             # Explicit country code specified by user, use it right away
 616             # regardless of whether extractor is geo bypassable or not
 617             country = self.get_param('geo_bypass_country', None)
 618
 619             # Otherwise use random country code from geo bypass context but
 620             # only if extractor is known as geo bypassable
 621             if not country:
 622                 countries = geo_bypass_context.get('countries')
 623                 if self._GEO_BYPASS and countries:
 624                     country = random.choice(countries)
 625
 626             if country:
 627                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 628                 self._downloader.write_debug(
 629                     'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))
 630
 631     def extract(self, url):
 632         """Extracts URL information and returns it in list of dicts."""
 633         try:
 634             for _ in range(2):
 635                 try:
 636                     self.initialize()
 637                     self.write_debug('Extracting URL: %s' % url)
 638                     ie_result = self._real_extract(url)
 639                     if ie_result is None:
 640                         return None
 641                     if self._x_forwarded_for_ip:
 642                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 643                     subtitles = ie_result.get('subtitles')
 644                     if (subtitles and 'live_chat' in subtitles
 645                             and 'no-live-chat' in self.get_param('compat_opts', [])):
 646                         del subtitles['live_chat']
 647                     return ie_result
 648                 except GeoRestrictedError as e:
 649                     if self.__maybe_fake_ip_and_retry(e.countries):
 650                         continue
 651                     raise
 652         except UnsupportedError:
 653             raise
 654         except ExtractorError as e:
 655             kwargs = {
 656                 'video_id': e.video_id or self.get_temp_id(url),
 657                 'ie': self.IE_NAME,
 658                 'tb': e.traceback or sys.exc_info()[2],
 659                 'expected': e.expected,
 660                 'cause': e.cause
 661             }
 662             if hasattr(e, 'countries'):
 663                 kwargs['countries'] = e.countries
 664             raise type(e)(e.orig_msg, **kwargs)
 665         except compat_http_client.IncompleteRead as e:
 666             raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
 667         except (KeyError, StopIteration) as e:
 668             raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 669
 670     def __maybe_fake_ip_and_retry(self, countries):
 671         if (not self.get_param('geo_bypass_country', None)
 672                 and self._GEO_BYPASS
 673                 and self.get_param('geo_bypass', True)
 674                 and not self._x_forwarded_for_ip
 675                 and countries):
 676             country_code = random.choice(countries)
 677             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 678             if self._x_forwarded_for_ip:
 679                 self.report_warning(
 680                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 681                     % (self._x_forwarded_for_ip, country_code.upper()))
 682                 return True
 683         return False
 684
 685     def set_downloader(self, downloader):
 686         """Sets a YoutubeDL instance as the downloader for this IE."""
 687         self._downloader = downloader
 688
 689     def _initialize_pre_login(self):
 690         """ Intialization before login. Redefine in subclasses."""
 691         pass
 692
 693     def _perform_login(self, username, password):
 694         """ Login with username and password. Redefine in subclasses."""
 695         pass
 696
 697     def _real_initialize(self):
 698         """Real initialization process. Redefine in subclasses."""
 699         pass
 700
 701     def _real_extract(self, url):
 702         """Real extraction process. Redefine in subclasses."""
 703         raise NotImplementedError('This method must be implemented by subclasses')
 704
 705     @classmethod
 706     def ie_key(cls):
 707         """A string for getting the InfoExtractor with get_info_extractor"""
 708         return cls.__name__[:-2]
 709
 710     @property
 711     def IE_NAME(self):
 712         return compat_str(type(self).__name__[:-2])
 713
 714     @staticmethod
 715     def __can_accept_status_code(err, expected_status):
 716         assert isinstance(err, compat_urllib_error.HTTPError)
 717         if expected_status is None:
 718             return False
 719         elif callable(expected_status):
 720             return expected_status(err.code) is True
 721         else:
 722             return err.code in variadic(expected_status)
 723
 724     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
 725         """
 726         Return the response handle.
 727
 728         See _download_webpage docstring for arguments specification.
 729         """
 730         if not self._downloader._first_webpage_request:
 731             sleep_interval = self.get_param('sleep_interval_requests') or 0
 732             if sleep_interval > 0:
 733                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 734                 time.sleep(sleep_interval)
 735         else:
 736             self._downloader._first_webpage_request = False
 737
 738         if note is None:
 739             self.report_download_webpage(video_id)
 740         elif note is not False:
 741             if video_id is None:
 742                 self.to_screen('%s' % (note,))
 743             else:
 744                 self.to_screen('%s: %s' % (video_id, note))
 745
 746         # Some sites check X-Forwarded-For HTTP header in order to figure out
 747         # the origin of the client behind proxy. This allows bypassing geo
 748         # restriction by faking this header's value to IP that belongs to some
 749         # geo unrestricted country. We will do so once we encounter any
 750         # geo restriction error.
 751         if self._x_forwarded_for_ip:
 752             if 'X-Forwarded-For' not in headers:
 753                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 754
 755         if isinstance(url_or_request, compat_urllib_request.Request):
 756             url_or_request = update_Request(
 757                 url_or_request, data=data, headers=headers, query=query)
 758         else:
 759             if query:
 760                 url_or_request = update_url_query(url_or_request, query)
 761             if data is not None or headers:
 762                 url_or_request = sanitized_Request(url_or_request, data, headers)
 763         try:
 764             return self._downloader.urlopen(url_or_request)
 765         except network_exceptions as err:
 766             if isinstance(err, compat_urllib_error.HTTPError):
 767                 if self.__can_accept_status_code(err, expected_status):
 768                     # Retain reference to error to prevent file object from
 769                     # being closed before it can be read. Works around the
 770                     # effects of <https://bugs.python.org/issue15002>
 771                     # introduced in Python 3.4.1.
 772                     err.fp._error = err
 773                     return err.fp
 774
 775             if errnote is False:
 776                 return False
 777             if errnote is None:
 778                 errnote = 'Unable to download webpage'
 779
 780             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 781             if fatal:
 782                 raise ExtractorError(errmsg, cause=err)
 783             else:
 784                 self.report_warning(errmsg)
 785                 return False
 786
 787     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
 788         """
 789         Return a tuple (page content as string, URL handle).
 790
 791         See _download_webpage docstring for arguments specification.
 792         """
 793         # Strip hashes from the URL (#1038)
 794         if isinstance(url_or_request, (compat_str, str)):
 795             url_or_request = url_or_request.partition('#')[0]
 796
 797         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 798         if urlh is False:
 799             assert not fatal
 800             return False
 801         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 802         return (content, urlh)
 803
 804     @staticmethod
 805     def _guess_encoding_from_content(content_type, webpage_bytes):
 806         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 807         if m:
 808             encoding = m.group(1)
 809         else:
 810             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 811                           webpage_bytes[:1024])
 812             if m:
 813                 encoding = m.group(1).decode('ascii')
 814             elif webpage_bytes.startswith(b'\xff\xfe'):
 815                 encoding = 'utf-16'
 816             else:
 817                 encoding = 'utf-8'
 818
 819         return encoding
 820
 821     def __check_blocked(self, content):
 822         first_block = content[:512]
 823         if ('<title>Access to this site is blocked</title>' in content
 824                 and 'Websense' in first_block):
 825             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 826             blocked_iframe = self._html_search_regex(
 827                 r'<iframe src="([^"]+)"', content,
 828                 'Websense information URL', default=None)
 829             if blocked_iframe:
 830                 msg += ' Visit %s for more details' % blocked_iframe
 831             raise ExtractorError(msg, expected=True)
 832         if '<title>The URL you requested has been blocked</title>' in first_block:
 833             msg = (
 834                 'Access to this webpage has been blocked by Indian censorship. '
 835                 'Use a VPN or proxy server (with --proxy) to route around it.')
 836             block_msg = self._html_search_regex(
 837                 r'</h1><p>(.*?)</p>',
 838                 content, 'block message', default=None)
 839             if block_msg:
 840                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 841             raise ExtractorError(msg, expected=True)
 842         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 843                 and 'blocklist.rkn.gov.ru' in content):
 844             raise ExtractorError(
 845                 'Access to this webpage has been blocked by decision of the Russian government. '
 846                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 847                 expected=True)
 848
 849     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 850         content_type = urlh.headers.get('Content-Type', '')
 851         webpage_bytes = urlh.read()
 852         if prefix is not None:
 853             webpage_bytes = prefix + webpage_bytes
 854         if not encoding:
 855             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 856         if self.get_param('dump_intermediate_pages', False):
 857             self.to_screen('Dumping request to ' + urlh.geturl())
 858             dump = base64.b64encode(webpage_bytes).decode('ascii')
 859             self._downloader.to_screen(dump)
 860         if self.get_param('write_pages', False):
 861             basen = '%s_%s' % (video_id, urlh.geturl())
 862             trim_length = self.get_param('trim_file_name') or 240
 863             if len(basen) > trim_length:
 864                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 865                 basen = basen[:trim_length - len(h)] + h
 866             raw_filename = basen + '.dump'
 867             filename = sanitize_filename(raw_filename, restricted=True)
 868             self.to_screen('Saving request to ' + filename)
 869             # Working around MAX_PATH limitation on Windows (see
 870             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 871             if compat_os_name == 'nt':
 872                 absfilepath = os.path.abspath(filename)
 873                 if len(absfilepath) > 259:
 874                     filename = '\\\\?\\' + absfilepath
 875             with open(filename, 'wb') as outf:
 876                 outf.write(webpage_bytes)
 877
 878         try:
 879             content = webpage_bytes.decode(encoding, 'replace')
 880         except LookupError:
 881             content = webpage_bytes.decode('utf-8', 'replace')
 882
 883         self.__check_blocked(content)
 884
 885         return content
 886
 887     def _download_webpage(
 888             self, url_or_request, video_id, note=None, errnote=None,
 889             fatal=True, tries=1, timeout=5, encoding=None, data=None,
 890             headers={}, query={}, expected_status=None):
 891         """
 892         Return the data of the page as a string.
 893
 894         Arguments:
 895         url_or_request -- plain text URL as a string or
 896             a compat_urllib_request.Requestobject
 897         video_id -- Video/playlist/item identifier (string)
 898
 899         Keyword arguments:
 900         note -- note printed before downloading (string)
 901         errnote -- note printed in case of an error (string)
 902         fatal -- flag denoting whether error should be considered fatal,
 903             i.e. whether it should cause ExtractionError to be raised,
 904             otherwise a warning will be reported and extraction continued
 905         tries -- number of tries
 906         timeout -- sleep interval between tries
 907         encoding -- encoding for a page content decoding, guessed automatically
 908             when not explicitly specified
 909         data -- POST data (bytes)
 910         headers -- HTTP headers (dict)
 911         query -- URL query (dict)
 912         expected_status -- allows to accept failed HTTP requests (non 2xx
 913             status code) by explicitly specifying a set of accepted status
 914             codes. Can be any of the following entities:
 915                 - an integer type specifying an exact failed status code to
 916                   accept
 917                 - a list or a tuple of integer types specifying a list of
 918                   failed status codes to accept
 919                 - a callable accepting an actual failed status code and
 920                   returning True if it should be accepted
 921             Note that this argument does not affect success status codes (2xx)
 922             which are always accepted.
 923         """
 924
 925         success = False
 926         try_count = 0
 927         while success is False:
 928             try:
 929                 res = self._download_webpage_handle(
 930                     url_or_request, video_id, note, errnote, fatal,
 931                     encoding=encoding, data=data, headers=headers, query=query,
 932                     expected_status=expected_status)
 933                 success = True
 934             except compat_http_client.IncompleteRead as e:
 935                 try_count += 1
 936                 if try_count >= tries:
 937                     raise e
 938                 self._sleep(timeout, video_id)
 939         if res is False:
 940             return res
 941         else:
 942             content, _ = res
 943             return content
 944
 945     def _download_xml_handle(
 946             self, url_or_request, video_id, note='Downloading XML',
 947             errnote='Unable to download XML', transform_source=None,
 948             fatal=True, encoding=None, data=None, headers={}, query={},
 949             expected_status=None):
 950         """
 951         Return a tuple (xml as an compat_etree_Element, URL handle).
 952
 953         See _download_webpage docstring for arguments specification.
 954         """
 955         res = self._download_webpage_handle(
 956             url_or_request, video_id, note, errnote, fatal=fatal,
 957             encoding=encoding, data=data, headers=headers, query=query,
 958             expected_status=expected_status)
 959         if res is False:
 960             return res
 961         xml_string, urlh = res
 962         return self._parse_xml(
 963             xml_string, video_id, transform_source=transform_source,
 964             fatal=fatal), urlh
 965
 966     def _download_xml(
 967             self, url_or_request, video_id,
 968             note='Downloading XML', errnote='Unable to download XML',
 969             transform_source=None, fatal=True, encoding=None,
 970             data=None, headers={}, query={}, expected_status=None):
 971         """
 972         Return the xml as an compat_etree_Element.
 973
 974         See _download_webpage docstring for arguments specification.
 975         """
 976         res = self._download_xml_handle(
 977             url_or_request, video_id, note=note, errnote=errnote,
 978             transform_source=transform_source, fatal=fatal, encoding=encoding,
 979             data=data, headers=headers, query=query,
 980             expected_status=expected_status)
 981         return res if res is False else res[0]
 982
 983     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 984         if transform_source:
 985             xml_string = transform_source(xml_string)
 986         try:
 987             return compat_etree_fromstring(xml_string.encode('utf-8'))
 988         except compat_xml_parse_error as ve:
 989             errmsg = '%s: Failed to parse XML ' % video_id
 990             if fatal:
 991                 raise ExtractorError(errmsg, cause=ve)
 992             else:
 993                 self.report_warning(errmsg + str(ve))
 994
 995     def _download_json_handle(
 996             self, url_or_request, video_id, note='Downloading JSON metadata',
 997             errnote='Unable to download JSON metadata', transform_source=None,
 998             fatal=True, encoding=None, data=None, headers={}, query={},
 999             expected_status=None):
1000         """
1001         Return a tuple (JSON object, URL handle).
1002
1003         See _download_webpage docstring for arguments specification.
1004         """
1005         res = self._download_webpage_handle(
1006             url_or_request, video_id, note, errnote, fatal=fatal,
1007             encoding=encoding, data=data, headers=headers, query=query,
1008             expected_status=expected_status)
1009         if res is False:
1010             return res
1011         json_string, urlh = res
1012         return self._parse_json(
1013             json_string, video_id, transform_source=transform_source,
1014             fatal=fatal), urlh
1015
1016     def _download_json(
1017             self, url_or_request, video_id, note='Downloading JSON metadata',
1018             errnote='Unable to download JSON metadata', transform_source=None,
1019             fatal=True, encoding=None, data=None, headers={}, query={},
1020             expected_status=None):
1021         """
1022         Return the JSON object as a dict.
1023
1024         See _download_webpage docstring for arguments specification.
1025         """
1026         res = self._download_json_handle(
1027             url_or_request, video_id, note=note, errnote=errnote,
1028             transform_source=transform_source, fatal=fatal, encoding=encoding,
1029             data=data, headers=headers, query=query,
1030             expected_status=expected_status)
1031         return res if res is False else res[0]
1032
1033     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
1034         if transform_source:
1035             json_string = transform_source(json_string)
1036         try:
1037             return json.loads(json_string, strict=False)
1038         except ValueError as ve:
1039             errmsg = '%s: Failed to parse JSON ' % video_id
1040             if fatal:
1041                 raise ExtractorError(errmsg, cause=ve)
1042             else:
1043                 self.report_warning(errmsg + str(ve))
1044
1045     def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
1046         return self._parse_json(
1047             data[data.find('{'):data.rfind('}') + 1],
1048             video_id, transform_source, fatal)
1049
1050     def _download_socket_json_handle(
1051             self, url_or_request, video_id, note='Polling socket',
1052             errnote='Unable to poll socket', transform_source=None,
1053             fatal=True, encoding=None, data=None, headers={}, query={},
1054             expected_status=None):
1055         """
1056         Return a tuple (JSON object, URL handle).
1057
1058         See _download_webpage docstring for arguments specification.
1059         """
1060         res = self._download_webpage_handle(
1061             url_or_request, video_id, note, errnote, fatal=fatal,
1062             encoding=encoding, data=data, headers=headers, query=query,
1063             expected_status=expected_status)
1064         if res is False:
1065             return res
1066         webpage, urlh = res
1067         return self._parse_socket_response_as_json(
1068             webpage, video_id, transform_source=transform_source,
1069             fatal=fatal), urlh
1070
1071     def _download_socket_json(
1072             self, url_or_request, video_id, note='Polling socket',
1073             errnote='Unable to poll socket', transform_source=None,
1074             fatal=True, encoding=None, data=None, headers={}, query={},
1075             expected_status=None):
1076         """
1077         Return the JSON object as a dict.
1078
1079         See _download_webpage docstring for arguments specification.
1080         """
1081         res = self._download_socket_json_handle(
1082             url_or_request, video_id, note=note, errnote=errnote,
1083             transform_source=transform_source, fatal=fatal, encoding=encoding,
1084             data=data, headers=headers, query=query,
1085             expected_status=expected_status)
1086         return res if res is False else res[0]
1087
1088     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1089         idstr = format_field(video_id, template='%s: ')
1090         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1091         if only_once:
1092             if f'WARNING: {msg}' in self._printed_messages:
1093                 return
1094             self._printed_messages.add(f'WARNING: {msg}')
1095         self._downloader.report_warning(msg, *args, **kwargs)
1096
1097     def to_screen(self, msg, *args, **kwargs):
1098         """Print msg to screen, prefixing it with '[ie_name]'"""
1099         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1100
1101     def write_debug(self, msg, *args, **kwargs):
1102         self._downloader.write_debug('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1103
1104     def get_param(self, name, default=None, *args, **kwargs):
1105         if self._downloader:
1106             return self._downloader.params.get(name, default, *args, **kwargs)
1107         return default
1108
1109     def report_drm(self, video_id, partial=False):
1110         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1111
1112     def report_extraction(self, id_or_name):
1113         """Report information extraction."""
1114         self.to_screen('%s: Extracting information' % id_or_name)
1115
1116     def report_download_webpage(self, video_id):
1117         """Report webpage download."""
1118         self.to_screen('%s: Downloading webpage' % video_id)
1119
1120     def report_age_confirmation(self):
1121         """Report attempt to confirm age."""
1122         self.to_screen('Confirming age')
1123
1124     def report_login(self):
1125         """Report attempt to log in."""
1126         self.to_screen('Logging in')
1127
1128     def raise_login_required(
1129             self, msg='This video is only available for registered users',
1130             metadata_available=False, method=NO_DEFAULT):
1131         if metadata_available and (
1132                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1133             self.report_warning(msg)
1134             return
1135         if method is NO_DEFAULT:
1136             method = 'any' if self.supports_login() else 'cookies'
1137         if method is not None:
1138             assert method in self._LOGIN_HINTS, 'Invalid login method'
1139             msg = '%s. %s' % (msg, self._LOGIN_HINTS[method])
1140         raise ExtractorError(msg, expected=True)
1141
1142     def raise_geo_restricted(
1143             self, msg='This video is not available from your location due to geo restriction',
1144             countries=None, metadata_available=False):
1145         if metadata_available and (
1146                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1147             self.report_warning(msg)
1148         else:
1149             raise GeoRestrictedError(msg, countries=countries)
1150
1151     def raise_no_formats(self, msg, expected=False, video_id=None):
1152         if expected and (
1153                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1154             self.report_warning(msg, video_id)
1155         elif isinstance(msg, ExtractorError):
1156             raise msg
1157         else:
1158             raise ExtractorError(msg, expected=expected, video_id=video_id)
1159
1160     # Methods for following #608
1161     @staticmethod
1162     def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1163         """Returns a URL that points to a page that should be processed"""
1164         if ie is not None:
1165             kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1166         if video_id is not None:
1167             kwargs['id'] = video_id
1168         if video_title is not None:
1169             kwargs['title'] = video_title
1170         return {
1171             **kwargs,
1172             '_type': 'url_transparent' if url_transparent else 'url',
1173             'url': url,
1174         }
1175
1176     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None, video_kwargs=None, **kwargs):
1177         urls = (self.url_result(self._proto_relative_url(m), ie, **(video_kwargs or {}))
1178                 for m in orderedSet(map(getter, matches) if getter else matches))
1179         return self.playlist_result(urls, playlist_id, playlist_title, **kwargs)
1180
1181     @staticmethod
1182     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1183         """Returns a playlist"""
1184         if playlist_id:
1185             kwargs['id'] = playlist_id
1186         if playlist_title:
1187             kwargs['title'] = playlist_title
1188         if playlist_description is not None:
1189             kwargs['description'] = playlist_description
1190         return {
1191             **kwargs,
1192             '_type': 'multi_video' if multi_video else 'playlist',
1193             'entries': entries,
1194         }
1195
1196     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1197         """
1198         Perform a regex search on the given string, using a single or a list of
1199         patterns returning the first matching group.
1200         In case of failure return a default value or raise a WARNING or a
1201         RegexNotFoundError, depending on fatal, specifying the field name.
1202         """
1203         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
1204             mobj = re.search(pattern, string, flags)
1205         else:
1206             for p in pattern:
1207                 mobj = re.search(p, string, flags)
1208                 if mobj:
1209                     break
1210
1211         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1212
1213         if mobj:
1214             if group is None:
1215                 # return the first matching group
1216                 return next(g for g in mobj.groups() if g is not None)
1217             elif isinstance(group, (list, tuple)):
1218                 return tuple(mobj.group(g) for g in group)
1219             else:
1220                 return mobj.group(group)
1221         elif default is not NO_DEFAULT:
1222             return default
1223         elif fatal:
1224             raise RegexNotFoundError('Unable to extract %s' % _name)
1225         else:
1226             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1227             return None
1228
1229     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1230         """
1231         Like _search_regex, but strips HTML tags and unescapes entities.
1232         """
1233         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1234         if res:
1235             return clean_html(res).strip()
1236         else:
1237             return res
1238
1239     def _get_netrc_login_info(self, netrc_machine=None):
1240         username = None
1241         password = None
1242         netrc_machine = netrc_machine or self._NETRC_MACHINE
1243
1244         if self.get_param('usenetrc', False):
1245             try:
1246                 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1247                 if os.path.isdir(netrc_file):
1248                     netrc_file = os.path.join(netrc_file, '.netrc')
1249                 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1250                 if info is not None:
1251                     username = info[0]
1252                     password = info[2]
1253                 else:
1254                     raise netrc.NetrcParseError(
1255                         'No authenticators for %s' % netrc_machine)
1256             except (IOError, netrc.NetrcParseError) as err:
1257                 self.report_warning(
1258                     'parsing .netrc: %s' % error_to_compat_str(err))
1259
1260         return username, password
1261
1262     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1263         """
1264         Get the login info as (username, password)
1265         First look for the manually specified credentials using username_option
1266         and password_option as keys in params dictionary. If no such credentials
1267         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1268         value.
1269         If there's no info available, return (None, None)
1270         """
1271
1272         # Attempt to use provided username and password or .netrc data
1273         username = self.get_param(username_option)
1274         if username is not None:
1275             password = self.get_param(password_option)
1276         else:
1277             username, password = self._get_netrc_login_info(netrc_machine)
1278
1279         return username, password
1280
1281     def _get_tfa_info(self, note='two-factor verification code'):
1282         """
1283         Get the two-factor authentication info
1284         TODO - asking the user will be required for sms/phone verify
1285         currently just uses the command line option
1286         If there's no info available, return None
1287         """
1288
1289         tfa = self.get_param('twofactor')
1290         if tfa is not None:
1291             return tfa
1292
1293         return compat_getpass('Type %s and press [Return]: ' % note)
1294
1295     # Helper functions for extracting OpenGraph info
1296     @staticmethod
1297     def _og_regexes(prop):
1298         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1299         property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
1300                        % {'prop': re.escape(prop)})
1301         template = r'<meta[^>]+?%s[^>]+?%s'
1302         return [
1303             template % (property_re, content_re),
1304             template % (content_re, property_re),
1305         ]
1306
1307     @staticmethod
1308     def _meta_regex(prop):
1309         return r'''(?isx)<meta
1310                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1311                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1312
1313     def _og_search_property(self, prop, html, name=None, **kargs):
1314         prop = variadic(prop)
1315         if name is None:
1316             name = 'OpenGraph %s' % prop[0]
1317         og_regexes = []
1318         for p in prop:
1319             og_regexes.extend(self._og_regexes(p))
1320         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1321         if escaped is None:
1322             return None
1323         return unescapeHTML(escaped)
1324
1325     def _og_search_thumbnail(self, html, **kargs):
1326         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1327
1328     def _og_search_description(self, html, **kargs):
1329         return self._og_search_property('description', html, fatal=False, **kargs)
1330
1331     def _og_search_title(self, html, **kargs):
1332         kargs.setdefault('fatal', False)
1333         return self._og_search_property('title', html, **kargs)
1334
1335     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1336         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1337         if secure:
1338             regexes = self._og_regexes('video:secure_url') + regexes
1339         return self._html_search_regex(regexes, html, name, **kargs)
1340
1341     def _og_search_url(self, html, **kargs):
1342         return self._og_search_property('url', html, **kargs)
1343
1344     def _html_extract_title(self, html, name, **kwargs):
1345         return self._html_search_regex(
1346             r'(?s)<title>(.*?)</title>', html, name, **kwargs)
1347
1348     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1349         name = variadic(name)
1350         if display_name is None:
1351             display_name = name[0]
1352         return self._html_search_regex(
1353             [self._meta_regex(n) for n in name],
1354             html, display_name, fatal=fatal, group='content', **kwargs)
1355
1356     def _dc_search_uploader(self, html):
1357         return self._html_search_meta('dc.creator', html, 'uploader')
1358
1359     def _rta_search(self, html):
1360         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1361         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1362                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1363                      html):
1364             return 18
1365         return 0
1366
1367     def _media_rating_search(self, html):
1368         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1369         rating = self._html_search_meta('rating', html)
1370
1371         if not rating:
1372             return None
1373
1374         RATING_TABLE = {
1375             'safe for kids': 0,
1376             'general': 8,
1377             '14 years': 14,
1378             'mature': 17,
1379             'restricted': 19,
1380         }
1381         return RATING_TABLE.get(rating.lower())
1382
1383     def _family_friendly_search(self, html):
1384         # See http://schema.org/VideoObject
1385         family_friendly = self._html_search_meta(
1386             'isFamilyFriendly', html, default=None)
1387
1388         if not family_friendly:
1389             return None
1390
1391         RATING_TABLE = {
1392             '1': 0,
1393             'true': 0,
1394             '0': 18,
1395             'false': 18,
1396         }
1397         return RATING_TABLE.get(family_friendly.lower())
1398
1399     def _twitter_search_player(self, html):
1400         return self._html_search_meta('twitter:player', html,
1401                                       'twitter card player')
1402
1403     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1404         json_ld_list = list(re.finditer(JSON_LD_RE, html))
1405         default = kwargs.get('default', NO_DEFAULT)
1406         # JSON-LD may be malformed and thus `fatal` should be respected.
1407         # At the same time `default` may be passed that assumes `fatal=False`
1408         # for _search_regex. Let's simulate the same behavior here as well.
1409         fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
1410         json_ld = []
1411         for mobj in json_ld_list:
1412             json_ld_item = self._parse_json(
1413                 mobj.group('json_ld'), video_id, fatal=fatal)
1414             if not json_ld_item:
1415                 continue
1416             if isinstance(json_ld_item, dict):
1417                 json_ld.append(json_ld_item)
1418             elif isinstance(json_ld_item, (list, tuple)):
1419                 json_ld.extend(json_ld_item)
1420         if json_ld:
1421             json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1422         if json_ld:
1423             return json_ld
1424         if default is not NO_DEFAULT:
1425             return default
1426         elif fatal:
1427             raise RegexNotFoundError('Unable to extract JSON-LD')
1428         else:
1429             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1430             return {}
1431
1432     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1433         if isinstance(json_ld, compat_str):
1434             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1435         if not json_ld:
1436             return {}
1437         info = {}
1438         if not isinstance(json_ld, (list, tuple, dict)):
1439             return info
1440         if isinstance(json_ld, dict):
1441             json_ld = [json_ld]
1442
1443         INTERACTION_TYPE_MAP = {
1444             'CommentAction': 'comment',
1445             'AgreeAction': 'like',
1446             'DisagreeAction': 'dislike',
1447             'LikeAction': 'like',
1448             'DislikeAction': 'dislike',
1449             'ListenAction': 'view',
1450             'WatchAction': 'view',
1451             'ViewAction': 'view',
1452         }
1453
1454         def extract_interaction_type(e):
1455             interaction_type = e.get('interactionType')
1456             if isinstance(interaction_type, dict):
1457                 interaction_type = interaction_type.get('@type')
1458             return str_or_none(interaction_type)
1459
1460         def extract_interaction_statistic(e):
1461             interaction_statistic = e.get('interactionStatistic')
1462             if isinstance(interaction_statistic, dict):
1463                 interaction_statistic = [interaction_statistic]
1464             if not isinstance(interaction_statistic, list):
1465                 return
1466             for is_e in interaction_statistic:
1467                 if not isinstance(is_e, dict):
1468                     continue
1469                 if is_e.get('@type') != 'InteractionCounter':
1470                     continue
1471                 interaction_type = extract_interaction_type(is_e)
1472                 if not interaction_type:
1473                     continue
1474                 # For interaction count some sites provide string instead of
1475                 # an integer (as per spec) with non digit characters (e.g. ",")
1476                 # so extracting count with more relaxed str_to_int
1477                 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1478                 if interaction_count is None:
1479                     continue
1480                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1481                 if not count_kind:
1482                     continue
1483                 count_key = '%s_count' % count_kind
1484                 if info.get(count_key) is not None:
1485                     continue
1486                 info[count_key] = interaction_count
1487
1488         def extract_chapter_information(e):
1489             chapters = [{
1490                 'title': part.get('name'),
1491                 'start_time': part.get('startOffset'),
1492                 'end_time': part.get('endOffset'),
1493             } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
1494             for idx, (last_c, current_c, next_c) in enumerate(zip(
1495                     [{'end_time': 0}] + chapters, chapters, chapters[1:])):
1496                 current_c['end_time'] = current_c['end_time'] or next_c['start_time']
1497                 current_c['start_time'] = current_c['start_time'] or last_c['end_time']
1498                 if None in current_c.values():
1499                     self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
1500                     return
1501             if chapters:
1502                 chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
1503                 info['chapters'] = chapters
1504
1505         def extract_video_object(e):
1506             assert e['@type'] == 'VideoObject'
1507             author = e.get('author')
1508             info.update({
1509                 'url': url_or_none(e.get('contentUrl')),
1510                 'title': unescapeHTML(e.get('name')),
1511                 'description': unescapeHTML(e.get('description')),
1512                 'thumbnails': [{'url': url_or_none(url)}
1513                                for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))],
1514                 'duration': parse_duration(e.get('duration')),
1515                 'timestamp': unified_timestamp(e.get('uploadDate')),
1516                 # author can be an instance of 'Organization' or 'Person' types.
1517                 # both types can have 'name' property(inherited from 'Thing' type). [1]
1518                 # however some websites are using 'Text' type instead.
1519                 # 1. https://schema.org/VideoObject
1520                 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
1521                 'filesize': float_or_none(e.get('contentSize')),
1522                 'tbr': int_or_none(e.get('bitrate')),
1523                 'width': int_or_none(e.get('width')),
1524                 'height': int_or_none(e.get('height')),
1525                 'view_count': int_or_none(e.get('interactionCount')),
1526             })
1527             extract_interaction_statistic(e)
1528             extract_chapter_information(e)
1529
1530         def traverse_json_ld(json_ld, at_top_level=True):
1531             for e in json_ld:
1532                 if at_top_level and '@context' not in e:
1533                     continue
1534                 if at_top_level and set(e.keys()) == {'@context', '@graph'}:
1535                     traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
1536                     break
1537                 item_type = e.get('@type')
1538                 if expected_type is not None and expected_type != item_type:
1539                     continue
1540                 rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
1541                 if rating is not None:
1542                     info['average_rating'] = rating
1543                 if item_type in ('TVEpisode', 'Episode'):
1544                     episode_name = unescapeHTML(e.get('name'))
1545                     info.update({
1546                         'episode': episode_name,
1547                         'episode_number': int_or_none(e.get('episodeNumber')),
1548                         'description': unescapeHTML(e.get('description')),
1549                     })
1550                     if not info.get('title') and episode_name:
1551                         info['title'] = episode_name
1552                     part_of_season = e.get('partOfSeason')
1553                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1554                         info.update({
1555                             'season': unescapeHTML(part_of_season.get('name')),
1556                             'season_number': int_or_none(part_of_season.get('seasonNumber')),
1557                         })
1558                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1559                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1560                         info['series'] = unescapeHTML(part_of_series.get('name'))
1561                 elif item_type == 'Movie':
1562                     info.update({
1563                         'title': unescapeHTML(e.get('name')),
1564                         'description': unescapeHTML(e.get('description')),
1565                         'duration': parse_duration(e.get('duration')),
1566                         'timestamp': unified_timestamp(e.get('dateCreated')),
1567                     })
1568                 elif item_type in ('Article', 'NewsArticle'):
1569                     info.update({
1570                         'timestamp': parse_iso8601(e.get('datePublished')),
1571                         'title': unescapeHTML(e.get('headline')),
1572                         'description': unescapeHTML(e.get('articleBody') or e.get('description')),
1573                     })
1574                     if traverse_obj(e, ('video', 0, '@type')) == 'VideoObject':
1575                         extract_video_object(e['video'][0])
1576                 elif item_type == 'VideoObject':
1577                     extract_video_object(e)
1578                     if expected_type is None:
1579                         continue
1580                     else:
1581                         break
1582                 video = e.get('video')
1583                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1584                     extract_video_object(video)
1585                 if expected_type is None:
1586                     continue
1587                 else:
1588                     break
1589         traverse_json_ld(json_ld)
1590
1591         return dict((k, v) for k, v in info.items() if v is not None)
1592
1593     def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1594         return self._parse_json(
1595             self._search_regex(
1596                 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1597                 webpage, 'next.js data', fatal=fatal, **kw),
1598             video_id, transform_source=transform_source, fatal=fatal)
1599
1600     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__'):
1601         ''' Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function. '''
1602         # not all website do this, but it can be changed
1603         # https://stackoverflow.com/questions/67463109/how-to-change-or-hide-nuxt-and-nuxt-keyword-in-page-source
1604         rectx = re.escape(context_name)
1605         js, arg_keys, arg_vals = self._search_regex(
1606             (r'<script>window\.%s=\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.+?)\)\);?</script>' % rectx,
1607              r'%s\(.*?\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)' % rectx),
1608             webpage, context_name, group=['js', 'arg_keys', 'arg_vals'])
1609
1610         args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
1611
1612         for key, val in args.items():
1613             if val in ('undefined', 'void 0'):
1614                 args[key] = 'null'
1615
1616         return self._parse_json(js_to_json(js, args), video_id)['data'][0]
1617
1618     @staticmethod
1619     def _hidden_inputs(html):
1620         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1621         hidden_inputs = {}
1622         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1623             attrs = extract_attributes(input)
1624             if not input:
1625                 continue
1626             if attrs.get('type') not in ('hidden', 'submit'):
1627                 continue
1628             name = attrs.get('name') or attrs.get('id')
1629             value = attrs.get('value')
1630             if name and value is not None:
1631                 hidden_inputs[name] = value
1632         return hidden_inputs
1633
1634     def _form_hidden_inputs(self, form_id, html):
1635         form = self._search_regex(
1636             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1637             html, '%s form' % form_id, group='form')
1638         return self._hidden_inputs(form)
1639
1640     class FormatSort:
        # Grammar of one sort term: an optional '+' (reverse), the field name and,
        # optionally, a separator (':' or '~') followed by a limit.
        regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

        # Sort terms that evaluate_params appends after the user's and the
        # extractor's terms; its forced/priority fields are picked from here too
        default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
                   'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
                   'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
        # NOTE(review): presumably youtube-dl's sort order; not referenced in
        # this part of the file - confirm where it is used
        ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                        'height', 'width', 'proto', 'vext', 'abr', 'aext',
                        'fps', 'fs_approx', 'source', 'id')

        # Per-field configuration, read through _get_field_setting, which also
        # fills in defaults for missing keys such as 'type', 'field',
        # 'convert', 'visible' and 'order'.
        # This dict lives on the class and is modified in place by
        # _get_field_setting, _resolve_field_value and evaluate_params.
        settings = {
            # 'ordered' fields: 'order' (or 'order_free' when free formats are
            # preferred) lists values from most to least preferred; with
            # 'regex' set, the entries are matched as regular expressions
            'vcodec': {'type': 'ordered', 'regex': True,
                       'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
            'acodec': {'type': 'ordered', 'regex': True,
                       'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
            'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                    'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
            'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                      'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
            'vext': {'type': 'ordered', 'field': 'video_ext',
                     'order': ('mp4', 'webm', 'flv', '', 'none'),
                     'order_free': ('webm', 'mp4', 'flv', '', 'none')},
            'aext': {'type': 'ordered', 'field': 'audio_ext',
                     'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
                     'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
            # 'forced' fields always lead the sort order; 'priority' fields
            # follow them unless the 'format_sort_force' param is set
            'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
            'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                           'field': ('vcodec', 'acodec'),
                           'function': lambda it: int(any(v != 'none' for v in it))},
            'ie_pref': {'priority': True, 'type': 'extractor'},
            'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
            'quality': {'convert': 'float', 'default': -1},
            'filesize': {'convert': 'bytes'},
            'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
            'id': {'convert': 'string', 'field': 'format_id'},
            'height': {'convert': 'float_none'},
            'width': {'convert': 'float_none'},
            'fps': {'convert': 'float_none'},
            'tbr': {'convert': 'float_none'},
            'vbr': {'convert': 'float_none'},
            'abr': {'convert': 'float_none'},
            'asr': {'convert': 'float_none'},
            'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

            # 'combined' fields expand to the fields listed in 'field'; a limit
            # is split on ':' among them unless 'same_limit' is set
            'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
            'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
            'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
            'ext': {'type': 'combined', 'field': ('vext', 'aext')},
            'res': {'type': 'multiple', 'field': ('height', 'width'),
                    'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

            # For compatibility with youtube-dl
            'format_id': {'type': 'alias', 'field': 'id'},
            'preference': {'type': 'alias', 'field': 'ie_pref'},
            'language_preference': {'type': 'alias', 'field': 'lang'},
            'source_preference': {'type': 'alias', 'field': 'source'},
            'protocol': {'type': 'alias', 'field': 'proto'},
            'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},

            # Deprecated
            # (using one of these aliases triggers a deprecation warning in evaluate_params)
            'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
            'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
            'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
            'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
            'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
            'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
            'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
            'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
            'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
            'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
            'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
            'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
            'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
            'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
            'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
            'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
            'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
            'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
            'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
            'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
        }
1723
1724         def __init__(self, ie, field_preference):
1725             self._order = []
1726             self.ydl = ie._downloader
1727             self.evaluate_params(self.ydl.params, field_preference)
1728             if ie.get_param('verbose'):
1729                 self.print_verbose_info(self.ydl.write_debug)
1730
1731         def _get_field_setting(self, field, key):
1732             if field not in self.settings:
1733                 if key in ('forced', 'priority'):
1734                     return False
1735                 self.ydl.deprecation_warning(
1736                     f'Using arbitrary fields ({field}) for format sorting is deprecated '
1737                     'and may be removed in a future version')
1738                 self.settings[field] = {}
1739             propObj = self.settings[field]
1740             if key not in propObj:
1741                 type = propObj.get('type')
1742                 if key == 'field':
1743                     default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1744                 elif key == 'convert':
1745                     default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1746                 else:
1747                     default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1748                 propObj[key] = default
1749             return propObj[key]
1750
1751         def _resolve_field_value(self, field, value, convertNone=False):
1752             if value is None:
1753                 if not convertNone:
1754                     return None
1755             else:
1756                 value = value.lower()
1757             conversion = self._get_field_setting(field, 'convert')
1758             if conversion == 'ignore':
1759                 return None
1760             if conversion == 'string':
1761                 return value
1762             elif conversion == 'float_none':
1763                 return float_or_none(value)
1764             elif conversion == 'bytes':
1765                 return FileDownloader.parse_bytes(value)
1766             elif conversion == 'order':
1767                 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1768                 use_regex = self._get_field_setting(field, 'regex')
1769                 list_length = len(order_list)
1770                 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1771                 if use_regex and value is not None:
1772                     for i, regex in enumerate(order_list):
1773                         if regex and re.match(regex, value):
1774                             return list_length - i
1775                     return list_length - empty_pos  # not in list
1776                 else:  # not regex or  value = None
1777                     return list_length - (order_list.index(value) if value in order_list else empty_pos)
1778             else:
1779                 if value.isnumeric():
1780                     return float(value)
1781                 else:
1782                     self.settings[field]['convert'] = 'string'
1783                     return value
1784
        def evaluate_params(self, params, sort_extractor):
            """
            Work out the effective sort order.

            Fills self._order with the concrete field names in priority
            order and records each field's 'reverse', 'closest',
            'limit_text' and 'limit' in self.settings.

            params -- params dict; 'prefer_free_formats', 'format_sort' and
                      'format_sort_force' are read from it
            sort_extractor -- sort terms requested by the extractor

            Raises ExtractorError for a term that does not match self.regex.
            """
            self._use_free_order = params.get('prefer_free_formats', False)
            self._sort_user = params.get('format_sort', [])
            self._sort_extractor = sort_extractor

            def add_item(field, reverse, closest, limit_text):
                # Registers one concrete field; only its first occurrence in
                # the sort list counts
                field = field.lower()
                if field in self._order:
                    return
                self._order.append(field)
                limit = self._resolve_field_value(field, limit_text)
                data = {
                    'reverse': reverse,
                    # 'closest' is meaningless without a usable limit
                    'closest': False if limit is None else closest,
                    'limit_text': limit_text,
                    'limit': limit}
                if field in self.settings:
                    self.settings[field].update(data)
                else:
                    self.settings[field] = data

            # Precedence: forced defaults, then priority defaults (unless
            # 'format_sort_force' is set), the user's terms, the extractor's
            # terms and finally all defaults
            sort_list = (
                tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
                + (tuple() if params.get('format_sort_force', False)
                   else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
                + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

            for item in sort_list:
                match = re.match(self.regex, item)
                if match is None:
                    raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
                field = match.group('field')
                if field is None:
                    # The whole term is optional in the regex, so a blank item matches without a field
                    continue
                if self._get_field_setting(field, 'type') == 'alias':
                    # Continue with the field the alias points to
                    alias, field = field, self._get_field_setting(field, 'field')
                    if self._get_field_setting(alias, 'deprecated'):
                        self.ydl.deprecation_warning(
                            f'Format sorting alias {alias} is deprecated '
                            f'and may be removed in a future version. Please use {field} instead')
                reverse = match.group('reverse') is not None
                closest = match.group('separator') == '~'
                limit_text = match.group('limit')

                has_limit = limit_text is not None
                has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
                has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

                # A combined field expands to its member fields; its limit is
                # either split on ':' among them or shared by all of them
                fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
                limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
                limit_count = len(limits)
                for (i, f) in enumerate(fields):
                    # Members beyond the supplied limits get None, or the
                    # single shared limit when there is one
                    add_item(f, reverse, closest,
                             limits[i] if i < limit_count
                             else limits[0] if has_limit and not has_multiple_limits
                             else None)
1841
1842         def print_verbose_info(self, write_debug):
1843             if self._sort_user:
1844                 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1845             if self._sort_extractor:
1846                 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1847             write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1848                 '+' if self._get_field_setting(field, 'reverse') else '', field,
1849                 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1850                               self._get_field_setting(field, 'limit_text'),
1851                               self._get_field_setting(field, 'limit'))
1852                 if self._get_field_setting(field, 'limit_text') is not None else '')
1853                 for field in self._order if self._get_field_setting(field, 'visible')]))
1854
1855         def _calculate_field_preference_from_value(self, format, field, type, value):
1856             reverse = self._get_field_setting(field, 'reverse')
1857             closest = self._get_field_setting(field, 'closest')
1858             limit = self._get_field_setting(field, 'limit')
1859
1860             if type == 'extractor':
1861                 maximum = self._get_field_setting(field, 'max')
1862                 if value is None or (maximum is not None and value >= maximum):
1863                     value = -1
1864             elif type == 'boolean':
1865                 in_list = self._get_field_setting(field, 'in_list')
1866                 not_in_list = self._get_field_setting(field, 'not_in_list')
1867                 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1868             elif type == 'ordered':
1869                 value = self._resolve_field_value(field, value, True)
1870
1871             # try to convert to number
1872             val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1873             is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1874             if is_num:
1875                 value = val_num
1876
1877             return ((-10, 0) if value is None
1878                     else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
1879                     else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1880                     else (0, value, 0) if not reverse and (limit is None or value <= limit)
1881                     else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1882                     else (-1, value, 0))
1883
1884         def _calculate_field_preference(self, format, field):
1885             type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
1886             get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1887             if type == 'multiple':
1888                 type = 'field'  # Only 'field' is allowed in multiple for now
1889                 actual_fields = self._get_field_setting(field, 'field')
1890
1891                 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1892             else:
1893                 value = get_value(field)
1894             return self._calculate_field_preference_from_value(format, field, type, value)
1895
1896         def calculate_preference(self, format):
1897             # Determine missing protocol
1898             if not format.get('protocol'):
1899                 format['protocol'] = determine_protocol(format)
1900
1901             # Determine missing ext
1902             if not format.get('ext') and 'url' in format:
1903                 format['ext'] = determine_ext(format['url'])
1904             if format.get('vcodec') == 'none':
1905                 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1906                 format['video_ext'] = 'none'
1907             else:
1908                 format['video_ext'] = format['ext']
1909                 format['audio_ext'] = 'none'
1910             # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
1911             #    format['preference'] = -1000
1912
1913             # Determine missing bitrates
1914             if format.get('tbr') is None:
1915                 if format.get('vbr') is not None and format.get('abr') is not None:
1916                     format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1917             else:
1918                 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1919                     format['vbr'] = format.get('tbr') - format.get('abr', 0)
1920                 if format.get('acodec') != 'none' and format.get('abr') is None:
1921                     format['abr'] = format.get('tbr') - format.get('vbr', 0)
1922
1923             return tuple(self._calculate_field_preference(format, field) for field in self._order)
1924
1925     def _sort_formats(self, formats, field_preference=[]):
1926         if not formats:
1927             return
1928         format_sort = self.FormatSort(self, field_preference)
1929         formats.sort(key=lambda f: format_sort.calculate_preference(f))
1930
1931     def _check_formats(self, formats, video_id):
1932         if formats:
1933             formats[:] = filter(
1934                 lambda f: self._is_valid_url(
1935                     f['url'], video_id,
1936                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1937                 formats)
1938
1939     @staticmethod
1940     def _remove_duplicate_formats(formats):
1941         format_urls = set()
1942         unique_formats = []
1943         for f in formats:
1944             if f['url'] not in format_urls:
1945                 format_urls.add(f['url'])
1946                 unique_formats.append(f)
1947         formats[:] = unique_formats
1948
1949     def _is_valid_url(self, url, video_id, item='video', headers={}):
1950         url = self._proto_relative_url(url, scheme='http:')
1951         # For now assume non HTTP(S) URLs always valid
1952         if not (url.startswith('http://') or url.startswith('https://')):
1953             return True
1954         try:
1955             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1956             return True
1957         except ExtractorError as e:
1958             self.to_screen(
1959                 '%s: %s URL is invalid, skipping: %s'
1960                 % (video_id, item, error_to_compat_str(e.cause)))
1961             return False
1962
1963     def http_scheme(self):
1964         """ Either "http:" or "https:", depending on the user's preferences """
1965         return (
1966             'http:'
1967             if self.get_param('prefer_insecure', False)
1968             else 'https:')
1969
1970     def _proto_relative_url(self, url, scheme=None):
1971         if url is None:
1972             return url
1973         if url.startswith('//'):
1974             if scheme is None:
1975                 scheme = self.http_scheme()
1976             return scheme + url
1977         else:
1978             return url
1979
1980     def _sleep(self, timeout, video_id, msg_template=None):
1981         if msg_template is None:
1982             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1983         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1984         self.to_screen(msg)
1985         time.sleep(timeout)
1986
1987     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1988                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1989                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1990         manifest = self._download_xml(
1991             manifest_url, video_id, 'Downloading f4m manifest',
1992             'Unable to download f4m manifest',
1993             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1994             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1995             transform_source=transform_source,
1996             fatal=fatal, data=data, headers=headers, query=query)
1997
1998         if manifest is False:
1999             return []
2000
2001         return self._parse_f4m_formats(
2002             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2003             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
2004
2005     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
2006                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
2007                            fatal=True, m3u8_id=None):
2008         if not isinstance(manifest, compat_etree_Element) and not fatal:
2009             return []
2010
2011         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
2012         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
2013         if akamai_pv is not None and ';' in akamai_pv.text:
2014             playerVerificationChallenge = akamai_pv.text.split(';')[0]
2015             if playerVerificationChallenge.strip() != '':
2016                 return []
2017
2018         formats = []
2019         manifest_version = '1.0'
2020         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
2021         if not media_nodes:
2022             manifest_version = '2.0'
2023             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
2024         # Remove unsupported DRM protected media from final formats
2025         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
2026         media_nodes = remove_encrypted_media(media_nodes)
2027         if not media_nodes:
2028             return formats
2029
2030         manifest_base_url = get_base_url(manifest)
2031
2032         bootstrap_info = xpath_element(
2033             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
2034             'bootstrap info', default=None)
2035
2036         vcodec = None
2037         mime_type = xpath_text(
2038             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
2039             'base URL', default=None)
2040         if mime_type and mime_type.startswith('audio/'):
2041             vcodec = 'none'
2042
2043         for i, media_el in enumerate(media_nodes):
2044             tbr = int_or_none(media_el.attrib.get('bitrate'))
2045             width = int_or_none(media_el.attrib.get('width'))
2046             height = int_or_none(media_el.attrib.get('height'))
2047             format_id = join_nonempty(f4m_id, tbr or i)
2048             # If <bootstrapInfo> is present, the specified f4m is a
2049             # stream-level manifest, and only set-level manifests may refer to
2050             # external resources.  See section 11.4 and section 4 of F4M spec
2051             if bootstrap_info is None:
2052                 media_url = None
2053                 # @href is introduced in 2.0, see section 11.6 of F4M spec
2054                 if manifest_version == '2.0':
2055                     media_url = media_el.attrib.get('href')
2056                 if media_url is None:
2057                     media_url = media_el.attrib.get('url')
2058                 if not media_url:
2059                     continue
2060                 manifest_url = (
2061                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
2062                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
2063                 # If media_url is itself a f4m manifest do the recursive extraction
2064                 # since bitrates in parent manifest (this one) and media_url manifest
2065                 # may differ leading to inability to resolve the format by requested
2066                 # bitrate in f4m downloader
2067                 ext = determine_ext(manifest_url)
2068                 if ext == 'f4m':
2069                     f4m_formats = self._extract_f4m_formats(
2070                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2071                         transform_source=transform_source, fatal=fatal)
2072                     # Sometimes stream-level manifest contains single media entry that
2073                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
2074                     # At the same time parent's media entry in set-level manifest may
2075                     # contain it. We will copy it from parent in such cases.
2076                     if len(f4m_formats) == 1:
2077                         f = f4m_formats[0]
2078                         f.update({
2079                             'tbr': f.get('tbr') or tbr,
2080                             'width': f.get('width') or width,
2081                             'height': f.get('height') or height,
2082                             'format_id': f.get('format_id') if not tbr else format_id,
2083                             'vcodec': vcodec,
2084                         })
2085                     formats.extend(f4m_formats)
2086                     continue
2087                 elif ext == 'm3u8':
2088                     formats.extend(self._extract_m3u8_formats(
2089                         manifest_url, video_id, 'mp4', preference=preference,
2090                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
2091                     continue
2092             formats.append({
2093                 'format_id': format_id,
2094                 'url': manifest_url,
2095                 'manifest_url': manifest_url,
2096                 'ext': 'flv' if bootstrap_info is not None else None,
2097                 'protocol': 'f4m',
2098                 'tbr': tbr,
2099                 'width': width,
2100                 'height': height,
2101                 'vcodec': vcodec,
2102                 'preference': preference,
2103                 'quality': quality,
2104             })
2105         return formats
2106
2107     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
2108         return {
2109             'format_id': join_nonempty(m3u8_id, 'meta'),
2110             'url': m3u8_url,
2111             'ext': ext,
2112             'protocol': 'm3u8',
2113             'preference': preference - 100 if preference else -100,
2114             'quality': quality,
2115             'resolution': 'multiple',
2116             'format_note': 'Quality selection URL',
2117         }
2118
2119     def _report_ignoring_subs(self, name):
2120         self.report_warning(bug_reports_message(
2121             f'Ignoring subtitle tracks found in the {name} manifest; '
2122             'if any subtitle tracks are missing,'
2123         ), only_once=True)
2124
2125     def _extract_m3u8_formats(self, *args, **kwargs):
2126         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2127         if subs:
2128             self._report_ignoring_subs('HLS')
2129         return fmts
2130
2131     def _extract_m3u8_formats_and_subtitles(
2132             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2133             preference=None, quality=None, m3u8_id=None, note=None,
2134             errnote=None, fatal=True, live=False, data=None, headers={},
2135             query={}):
2136
2137         res = self._download_webpage_handle(
2138             m3u8_url, video_id,
2139             note='Downloading m3u8 information' if note is None else note,
2140             errnote='Failed to download m3u8 information' if errnote is None else errnote,
2141             fatal=fatal, data=data, headers=headers, query=query)
2142
2143         if res is False:
2144             return [], {}
2145
2146         m3u8_doc, urlh = res
2147         m3u8_url = urlh.geturl()
2148
2149         return self._parse_m3u8_formats_and_subtitles(
2150             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2151             preference=preference, quality=quality, m3u8_id=m3u8_id,
2152             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2153             headers=headers, query=query, video_id=video_id)
2154
2155     def _parse_m3u8_formats_and_subtitles(
2156             self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
2157             preference=None, quality=None, m3u8_id=None, live=False, note=None,
2158             errnote=None, fatal=True, data=None, headers={}, query={},
2159             video_id=None):
2160         formats, subtitles = [], {}
2161
2162         has_drm = re.search('|'.join([
2163             r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
2164             r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
2165         ]), m3u8_doc)
2166
2167         def format_url(url):
2168             return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
2169
2170         if self.get_param('hls_split_discontinuity', False):
2171             def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2172                 if not m3u8_doc:
2173                     if not manifest_url:
2174                         return []
2175                     m3u8_doc = self._download_webpage(
2176                         manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2177                         note=False, errnote='Failed to download m3u8 playlist information')
2178                     if m3u8_doc is False:
2179                         return []
2180                 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2181
2182         else:
2183             def _extract_m3u8_playlist_indices(*args, **kwargs):
2184                 return [None]
2185
2186         # References:
2187         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2188         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2189         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2190
2191         # We should try extracting formats only from master playlists [1, 4.3.4],
2192         # i.e. playlists that describe available qualities. On the other hand
2193         # media playlists [1, 4.3.3] should be returned as is since they contain
2194         # just the media without qualities renditions.
2195         # Fortunately, master playlist can be easily distinguished from media
2196         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
2197         # master playlist tags MUST NOT appear in a media playlist and vice versa.
2198         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2199         # media playlist and MUST NOT appear in master playlist thus we can
2200         # clearly detect media playlist with this criterion.
2201
2202         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
2203             formats = [{
2204                 'format_id': join_nonempty(m3u8_id, idx),
2205                 'format_index': idx,
2206                 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
2207                 'ext': ext,
2208                 'protocol': entry_protocol,
2209                 'preference': preference,
2210                 'quality': quality,
2211                 'has_drm': has_drm,
2212             } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2213
2214             return formats, subtitles
2215
2216         groups = {}
2217         last_stream_inf = {}
2218
2219         def extract_media(x_media_line):
2220             media = parse_m3u8_attributes(x_media_line)
2221             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2222             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2223             if not (media_type and group_id and name):
2224                 return
2225             groups.setdefault(group_id, []).append(media)
2226             # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2227             if media_type == 'SUBTITLES':
2228                 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2229                 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2230                 # However, lack of URI has been spotted in the wild.
2231                 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2232                 if not media.get('URI'):
2233                     return
2234                 url = format_url(media['URI'])
2235                 sub_info = {
2236                     'url': url,
2237                     'ext': determine_ext(url),
2238                 }
2239                 if sub_info['ext'] == 'm3u8':
2240                     # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2241                     # files may contain is WebVTT:
2242                     # <https://tools.ietf.org/html/rfc8216#section-3.1>
2243                     sub_info['ext'] = 'vtt'
2244                     sub_info['protocol'] = 'm3u8_native'
2245                 lang = media.get('LANGUAGE') or 'und'
2246                 subtitles.setdefault(lang, []).append(sub_info)
2247             if media_type not in ('VIDEO', 'AUDIO'):
2248                 return
2249             media_url = media.get('URI')
2250             if media_url:
2251                 manifest_url = format_url(media_url)
2252                 formats.extend({
2253                     'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2254                     'format_note': name,
2255                     'format_index': idx,
2256                     'url': manifest_url,
2257                     'manifest_url': m3u8_url,
2258                     'language': media.get('LANGUAGE'),
2259                     'ext': ext,
2260                     'protocol': entry_protocol,
2261                     'preference': preference,
2262                     'quality': quality,
2263                     'vcodec': 'none' if media_type == 'AUDIO' else None,
2264                 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2265
2266         def build_stream_name():
2267             # Despite specification does not mention NAME attribute for
2268             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2269             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2270             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2271             stream_name = last_stream_inf.get('NAME')
2272             if stream_name:
2273                 return stream_name
2274             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2275             # from corresponding rendition group
2276             stream_group_id = last_stream_inf.get('VIDEO')
2277             if not stream_group_id:
2278                 return
2279             stream_group = groups.get(stream_group_id)
2280             if not stream_group:
2281                 return stream_group_id
2282             rendition = stream_group[0]
2283             return rendition.get('NAME') or stream_group_id
2284
2285         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2286         # chance to detect video only formats when EXT-X-STREAM-INF tags
2287         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2288         for line in m3u8_doc.splitlines():
2289             if line.startswith('#EXT-X-MEDIA:'):
2290                 extract_media(line)
2291
2292         for line in m3u8_doc.splitlines():
2293             if line.startswith('#EXT-X-STREAM-INF:'):
2294                 last_stream_inf = parse_m3u8_attributes(line)
2295             elif line.startswith('#') or not line.strip():
2296                 continue
2297             else:
2298                 tbr = float_or_none(
2299                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2300                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2301                 manifest_url = format_url(line.strip())
2302
2303                 for idx in _extract_m3u8_playlist_indices(manifest_url):
2304                     format_id = [m3u8_id, None, idx]
2305                     # Bandwidth of live streams may differ over time thus making
2306                     # format_id unpredictable. So it's better to keep provided
2307                     # format_id intact.
2308                     if not live:
2309                         stream_name = build_stream_name()
2310                         format_id[1] = stream_name or '%d' % (tbr or len(formats))
2311                     f = {
2312                         'format_id': join_nonempty(*format_id),
2313                         'format_index': idx,
2314                         'url': manifest_url,
2315                         'manifest_url': m3u8_url,
2316                         'tbr': tbr,
2317                         'ext': ext,
2318                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2319                         'protocol': entry_protocol,
2320                         'preference': preference,
2321                         'quality': quality,
2322                     }
2323                     resolution = last_stream_inf.get('RESOLUTION')
2324                     if resolution:
2325                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2326                         if mobj:
2327                             f['width'] = int(mobj.group('width'))
2328                             f['height'] = int(mobj.group('height'))
2329                     # Unified Streaming Platform
2330                     mobj = re.search(
2331                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2332                     if mobj:
2333                         abr, vbr = mobj.groups()
2334                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2335                         f.update({
2336                             'vbr': vbr,
2337                             'abr': abr,
2338                         })
2339                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2340                     f.update(codecs)
2341                     audio_group_id = last_stream_inf.get('AUDIO')
2342                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2343                     # references a rendition group MUST have a CODECS attribute.
2344                     # However, this is not always respected, for example, [2]
2345                     # contains EXT-X-STREAM-INF tag which references AUDIO
2346                     # rendition group but does not have CODECS and despite
2347                     # referencing an audio group it represents a complete
2348                     # (with audio and video) format. So, for such cases we will
2349                     # ignore references to rendition groups and treat them
2350                     # as complete formats.
2351                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2352                         audio_group = groups.get(audio_group_id)
2353                         if audio_group and audio_group[0].get('URI'):
2354                             # TODO: update acodec for audio only formats with
2355                             # the same GROUP-ID
2356                             f['acodec'] = 'none'
2357                     if not f.get('ext'):
2358                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2359                     formats.append(f)
2360
2361                     # for DailyMotion
2362                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2363                     if progressive_uri:
2364                         http_f = f.copy()
2365                         del http_f['manifest_url']
2366                         http_f.update({
2367                             'format_id': f['format_id'].replace('hls-', 'http-'),
2368                             'protocol': 'http',
2369                             'url': progressive_uri,
2370                         })
2371                         formats.append(http_f)
2372
2373                 last_stream_inf = {}
2374         return formats, subtitles
2375
2376     def _extract_m3u8_vod_duration(
2377             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2378
2379         m3u8_vod = self._download_webpage(
2380             m3u8_vod_url, video_id,
2381             note='Downloading m3u8 VOD manifest' if note is None else note,
2382             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2383             fatal=False, data=data, headers=headers, query=query)
2384
2385         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2386
2387     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2388         if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2389             return None
2390
2391         return int(sum(
2392             float(line[len('#EXTINF:'):].split(',')[0])
2393             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2394
2395     @staticmethod
2396     def _xpath_ns(path, namespace=None):
2397         if not namespace:
2398             return path
2399         out = []
2400         for c in path.split('/'):
2401             if not c or c == '.':
2402                 out.append(c)
2403             else:
2404                 out.append('{%s}%s' % (namespace, c))
2405         return '/'.join(out)
2406
2407     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2408         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2409
2410         if smil is False:
2411             assert not fatal
2412             return [], {}
2413
2414         namespace = self._parse_smil_namespace(smil)
2415
2416         fmts = self._parse_smil_formats(
2417             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2418         subs = self._parse_smil_subtitles(
2419             smil, namespace=namespace)
2420
2421         return fmts, subs
2422
2423     def _extract_smil_formats(self, *args, **kwargs):
2424         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2425         if subs:
2426             self._report_ignoring_subs('SMIL')
2427         return fmts
2428
2429     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2430         smil = self._download_smil(smil_url, video_id, fatal=fatal)
2431         if smil is False:
2432             return {}
2433         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2434
2435     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2436         return self._download_xml(
2437             smil_url, video_id, 'Downloading SMIL file',
2438             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2439
2440     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2441         namespace = self._parse_smil_namespace(smil)
2442
2443         formats = self._parse_smil_formats(
2444             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2445         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2446
2447         video_id = os.path.splitext(url_basename(smil_url))[0]
2448         title = None
2449         description = None
2450         upload_date = None
2451         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2452             name = meta.attrib.get('name')
2453             content = meta.attrib.get('content')
2454             if not name or not content:
2455                 continue
2456             if not title and name == 'title':
2457                 title = content
2458             elif not description and name in ('description', 'abstract'):
2459                 description = content
2460             elif not upload_date and name == 'date':
2461                 upload_date = unified_strdate(content)
2462
2463         thumbnails = [{
2464             'id': image.get('type'),
2465             'url': image.get('src'),
2466             'width': int_or_none(image.get('width')),
2467             'height': int_or_none(image.get('height')),
2468         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2469
2470         return {
2471             'id': video_id,
2472             'title': title or video_id,
2473             'description': description,
2474             'upload_date': upload_date,
2475             'thumbnails': thumbnails,
2476             'formats': formats,
2477             'subtitles': subtitles,
2478         }
2479
2480     def _parse_smil_namespace(self, smil):
2481         return self._search_regex(
2482             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2483
2484     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2485         base = smil_url
2486         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2487             b = meta.get('base') or meta.get('httpBase')
2488             if b:
2489                 base = b
2490                 break
2491
2492         formats = []
2493         rtmp_count = 0
2494         http_count = 0
2495         m3u8_count = 0
2496         imgs_count = 0
2497
2498         srcs = set()
2499         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2500         for medium in media:
2501             src = medium.get('src')
2502             if not src or src in srcs:
2503                 continue
2504             srcs.add(src)
2505
2506             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2507             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2508             width = int_or_none(medium.get('width'))
2509             height = int_or_none(medium.get('height'))
2510             proto = medium.get('proto')
2511             ext = medium.get('ext')
2512             src_ext = determine_ext(src)
2513             streamer = medium.get('streamer') or base
2514
2515             if proto == 'rtmp' or streamer.startswith('rtmp'):
2516                 rtmp_count += 1
2517                 formats.append({
2518                     'url': streamer,
2519                     'play_path': src,
2520                     'ext': 'flv',
2521                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2522                     'tbr': bitrate,
2523                     'filesize': filesize,
2524                     'width': width,
2525                     'height': height,
2526                 })
2527                 if transform_rtmp_url:
2528                     streamer, src = transform_rtmp_url(streamer, src)
2529                     formats[-1].update({
2530                         'url': streamer,
2531                         'play_path': src,
2532                     })
2533                 continue
2534
2535             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2536             src_url = src_url.strip()
2537
2538             if proto == 'm3u8' or src_ext == 'm3u8':
2539                 m3u8_formats = self._extract_m3u8_formats(
2540                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2541                 if len(m3u8_formats) == 1:
2542                     m3u8_count += 1
2543                     m3u8_formats[0].update({
2544                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2545                         'tbr': bitrate,
2546                         'width': width,
2547                         'height': height,
2548                     })
2549                 formats.extend(m3u8_formats)
2550             elif src_ext == 'f4m':
2551                 f4m_url = src_url
2552                 if not f4m_params:
2553                     f4m_params = {
2554                         'hdcore': '3.2.0',
2555                         'plugin': 'flowplayer-3.2.0.1',
2556                     }
2557                 f4m_url += '&' if '?' in f4m_url else '?'
2558                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2559                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2560             elif src_ext == 'mpd':
2561                 formats.extend(self._extract_mpd_formats(
2562                     src_url, video_id, mpd_id='dash', fatal=False))
2563             elif re.search(r'\.ism/[Mm]anifest', src_url):
2564                 formats.extend(self._extract_ism_formats(
2565                     src_url, video_id, ism_id='mss', fatal=False))
2566             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2567                 http_count += 1
2568                 formats.append({
2569                     'url': src_url,
2570                     'ext': ext or src_ext or 'flv',
2571                     'format_id': 'http-%d' % (bitrate or http_count),
2572                     'tbr': bitrate,
2573                     'filesize': filesize,
2574                     'width': width,
2575                     'height': height,
2576                 })
2577
2578         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2579             src = medium.get('src')
2580             if not src or src in srcs:
2581                 continue
2582             srcs.add(src)
2583
2584             imgs_count += 1
2585             formats.append({
2586                 'format_id': 'imagestream-%d' % (imgs_count),
2587                 'url': src,
2588                 'ext': mimetype2ext(medium.get('type')),
2589                 'acodec': 'none',
2590                 'vcodec': 'none',
2591                 'width': int_or_none(medium.get('width')),
2592                 'height': int_or_none(medium.get('height')),
2593                 'format_note': 'SMIL storyboards',
2594             })
2595
2596         return formats
2597
2598     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2599         urls = []
2600         subtitles = {}
2601         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2602             src = textstream.get('src')
2603             if not src or src in urls:
2604                 continue
2605             urls.append(src)
2606             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2607             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2608             subtitles.setdefault(lang, []).append({
2609                 'url': src,
2610                 'ext': ext,
2611             })
2612         return subtitles
2613
2614     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2615         xspf = self._download_xml(
2616             xspf_url, playlist_id, 'Downloading xpsf playlist',
2617             'Unable to download xspf manifest', fatal=fatal)
2618         if xspf is False:
2619             return []
2620         return self._parse_xspf(
2621             xspf, playlist_id, xspf_url=xspf_url,
2622             xspf_base_url=base_url(xspf_url))
2623
2624     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2625         NS_MAP = {
2626             'xspf': 'http://xspf.org/ns/0/',
2627             's1': 'http://static.streamone.nl/player/ns/0',
2628         }
2629
2630         entries = []
2631         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2632             title = xpath_text(
2633                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2634             description = xpath_text(
2635                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2636             thumbnail = xpath_text(
2637                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2638             duration = float_or_none(
2639                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2640
2641             formats = []
2642             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2643                 format_url = urljoin(xspf_base_url, location.text)
2644                 if not format_url:
2645                     continue
2646                 formats.append({
2647                     'url': format_url,
2648                     'manifest_url': xspf_url,
2649                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2650                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2651                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2652                 })
2653             self._sort_formats(formats)
2654
2655             entries.append({
2656                 'id': playlist_id,
2657                 'title': title,
2658                 'description': description,
2659                 'thumbnail': thumbnail,
2660                 'duration': duration,
2661                 'formats': formats,
2662             })
2663         return entries
2664
2665     def _extract_mpd_formats(self, *args, **kwargs):
2666         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2667         if subs:
2668             self._report_ignoring_subs('DASH')
2669         return fmts
2670
2671     def _extract_mpd_formats_and_subtitles(
2672             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2673             fatal=True, data=None, headers={}, query={}):
2674         res = self._download_xml_handle(
2675             mpd_url, video_id,
2676             note='Downloading MPD manifest' if note is None else note,
2677             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2678             fatal=fatal, data=data, headers=headers, query=query)
2679         if res is False:
2680             return [], {}
2681         mpd_doc, urlh = res
2682         if mpd_doc is None:
2683             return [], {}
2684         mpd_base_url = base_url(urlh.geturl())
2685
2686         return self._parse_mpd_formats_and_subtitles(
2687             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2688
2689     def _parse_mpd_formats(self, *args, **kwargs):
2690         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2691         if subs:
2692             self._report_ignoring_subs('DASH')
2693         return fmts
2694
2695     def _parse_mpd_formats_and_subtitles(
2696             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2697         """
2698         Parse formats from MPD manifest.
2699         References:
2700          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2701             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2702          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2703         """
2704         if not self.get_param('dynamic_mpd', True):
2705             if mpd_doc.get('type') == 'dynamic':
2706                 return [], {}
2707
2708         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2709
2710         def _add_ns(path):
2711             return self._xpath_ns(path, namespace)
2712
2713         def is_drm_protected(element):
2714             return element.find(_add_ns('ContentProtection')) is not None
2715
2716         def extract_multisegment_info(element, ms_parent_info):
2717             ms_info = ms_parent_info.copy()
2718
2719             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2720             # common attributes and elements.  We will only extract relevant
2721             # for us.
2722             def extract_common(source):
2723                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2724                 if segment_timeline is not None:
2725                     s_e = segment_timeline.findall(_add_ns('S'))
2726                     if s_e:
2727                         ms_info['total_number'] = 0
2728                         ms_info['s'] = []
2729                         for s in s_e:
2730                             r = int(s.get('r', 0))
2731                             ms_info['total_number'] += 1 + r
2732                             ms_info['s'].append({
2733                                 't': int(s.get('t', 0)),
2734                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2735                                 'd': int(s.attrib['d']),
2736                                 'r': r,
2737                             })
2738                 start_number = source.get('startNumber')
2739                 if start_number:
2740                     ms_info['start_number'] = int(start_number)
2741                 timescale = source.get('timescale')
2742                 if timescale:
2743                     ms_info['timescale'] = int(timescale)
2744                 segment_duration = source.get('duration')
2745                 if segment_duration:
2746                     ms_info['segment_duration'] = float(segment_duration)
2747
2748             def extract_Initialization(source):
2749                 initialization = source.find(_add_ns('Initialization'))
2750                 if initialization is not None:
2751                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2752
2753             segment_list = element.find(_add_ns('SegmentList'))
2754             if segment_list is not None:
2755                 extract_common(segment_list)
2756                 extract_Initialization(segment_list)
2757                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2758                 if segment_urls_e:
2759                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2760             else:
2761                 segment_template = element.find(_add_ns('SegmentTemplate'))
2762                 if segment_template is not None:
2763                     extract_common(segment_template)
2764                     media = segment_template.get('media')
2765                     if media:
2766                         ms_info['media'] = media
2767                     initialization = segment_template.get('initialization')
2768                     if initialization:
2769                         ms_info['initialization'] = initialization
2770                     else:
2771                         extract_Initialization(segment_template)
2772             return ms_info
2773
2774         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2775         formats, subtitles = [], {}
2776         stream_numbers = collections.defaultdict(int)
2777         for period in mpd_doc.findall(_add_ns('Period')):
2778             period_duration = parse_duration(period.get('duration')) or mpd_duration
2779             period_ms_info = extract_multisegment_info(period, {
2780                 'start_number': 1,
2781                 'timescale': 1,
2782             })
2783             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2784                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2785                 for representation in adaptation_set.findall(_add_ns('Representation')):
2786                     representation_attrib = adaptation_set.attrib.copy()
2787                     representation_attrib.update(representation.attrib)
2788                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2789                     mime_type = representation_attrib['mimeType']
2790                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2791
2792                     codecs = parse_codecs(representation_attrib.get('codecs', ''))
2793                     if content_type not in ('video', 'audio', 'text'):
2794                         if mime_type == 'image/jpeg':
2795                             content_type = mime_type
2796                         elif codecs['vcodec'] != 'none':
2797                             content_type = 'video'
2798                         elif codecs['acodec'] != 'none':
2799                             content_type = 'audio'
2800                         elif codecs.get('tcodec', 'none') != 'none':
2801                             content_type = 'text'
2802                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2803                             content_type = 'text'
2804                         else:
2805                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2806                             continue
2807
2808                     base_url = ''
2809                     for element in (representation, adaptation_set, period, mpd_doc):
2810                         base_url_e = element.find(_add_ns('BaseURL'))
2811                         if base_url_e is not None:
2812                             base_url = base_url_e.text + base_url
2813                             if re.match(r'^https?://', base_url):
2814                                 break
2815                     if mpd_base_url and base_url.startswith('/'):
2816                         base_url = compat_urlparse.urljoin(mpd_base_url, base_url)
2817                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2818                         if not mpd_base_url.endswith('/'):
2819                             mpd_base_url += '/'
2820                         base_url = mpd_base_url + base_url
2821                     representation_id = representation_attrib.get('id')
2822                     lang = representation_attrib.get('lang')
2823                     url_el = representation.find(_add_ns('BaseURL'))
2824                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2825                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2826                     if representation_id is not None:
2827                         format_id = representation_id
2828                     else:
2829                         format_id = content_type
2830                     if mpd_id:
2831                         format_id = mpd_id + '-' + format_id
2832                     if content_type in ('video', 'audio'):
2833                         f = {
2834                             'format_id': format_id,
2835                             'manifest_url': mpd_url,
2836                             'ext': mimetype2ext(mime_type),
2837                             'width': int_or_none(representation_attrib.get('width')),
2838                             'height': int_or_none(representation_attrib.get('height')),
2839                             'tbr': float_or_none(bandwidth, 1000),
2840                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2841                             'fps': int_or_none(representation_attrib.get('frameRate')),
2842                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2843                             'format_note': 'DASH %s' % content_type,
2844                             'filesize': filesize,
2845                             'container': mimetype2ext(mime_type) + '_dash',
2846                             **codecs
2847                         }
2848                     elif content_type == 'text':
2849                         f = {
2850                             'ext': mimetype2ext(mime_type),
2851                             'manifest_url': mpd_url,
2852                             'filesize': filesize,
2853                         }
2854                     elif content_type == 'image/jpeg':
2855                         # See test case in VikiIE
2856                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2857                         f = {
2858                             'format_id': format_id,
2859                             'ext': 'mhtml',
2860                             'manifest_url': mpd_url,
2861                             'format_note': 'DASH storyboards (jpeg)',
2862                             'acodec': 'none',
2863                             'vcodec': 'none',
2864                         }
2865                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2866                         f['has_drm'] = True
2867                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2868
2869                     def prepare_template(template_name, identifiers):
2870                         tmpl = representation_ms_info[template_name]
2871                         # First of, % characters outside $...$ templates
2872                         # must be escaped by doubling for proper processing
2873                         # by % operator string formatting used further (see
2874                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2875                         t = ''
2876                         in_template = False
2877                         for c in tmpl:
2878                             t += c
2879                             if c == '$':
2880                                 in_template = not in_template
2881                             elif c == '%' and not in_template:
2882                                 t += c
2883                         # Next, $...$ templates are translated to their
2884                         # %(...) counterparts to be used with % operator
2885                         if representation_id is not None:
2886                             t = t.replace('$RepresentationID$', representation_id)
2887                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2888                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2889                         t.replace('$$', '$')
2890                         return t
2891
2892                     # @initialization is a regular template like @media one
2893                     # so it should be handled just the same way (see
2894                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2895                     if 'initialization' in representation_ms_info:
2896                         initialization_template = prepare_template(
2897                             'initialization',
2898                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2899                             # $Time$ shall not be included for @initialization thus
2900                             # only $Bandwidth$ remains
2901                             ('Bandwidth', ))
2902                         representation_ms_info['initialization_url'] = initialization_template % {
2903                             'Bandwidth': bandwidth,
2904                         }
2905
2906                     def location_key(location):
2907                         return 'url' if re.match(r'^https?://', location) else 'path'
2908
2909                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2910
2911                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2912                         media_location_key = location_key(media_template)
2913
2914                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2915                         # can't be used at the same time
2916                         if '%(Number' in media_template and 's' not in representation_ms_info:
2917                             segment_duration = None
2918                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2919                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2920                                 representation_ms_info['total_number'] = int(math.ceil(
2921                                     float_or_none(period_duration, segment_duration, default=0)))
2922                             representation_ms_info['fragments'] = [{
2923                                 media_location_key: media_template % {
2924                                     'Number': segment_number,
2925                                     'Bandwidth': bandwidth,
2926                                 },
2927                                 'duration': segment_duration,
2928                             } for segment_number in range(
2929                                 representation_ms_info['start_number'],
2930                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2931                         else:
2932                             # $Number*$ or $Time$ in media template with S list available
2933                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2934                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2935                             representation_ms_info['fragments'] = []
2936                             segment_time = 0
2937                             segment_d = None
2938                             segment_number = representation_ms_info['start_number']
2939
2940                             def add_segment_url():
2941                                 segment_url = media_template % {
2942                                     'Time': segment_time,
2943                                     'Bandwidth': bandwidth,
2944                                     'Number': segment_number,
2945                                 }
2946                                 representation_ms_info['fragments'].append({
2947                                     media_location_key: segment_url,
2948                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2949                                 })
2950
2951                             for num, s in enumerate(representation_ms_info['s']):
2952                                 segment_time = s.get('t') or segment_time
2953                                 segment_d = s['d']
2954                                 add_segment_url()
2955                                 segment_number += 1
2956                                 for r in range(s.get('r', 0)):
2957                                     segment_time += segment_d
2958                                     add_segment_url()
2959                                     segment_number += 1
2960                                 segment_time += segment_d
2961                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2962                         # No media template
2963                         # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2964                         # or any YouTube dashsegments video
2965                         fragments = []
2966                         segment_index = 0
2967                         timescale = representation_ms_info['timescale']
2968                         for s in representation_ms_info['s']:
2969                             duration = float_or_none(s['d'], timescale)
2970                             for r in range(s.get('r', 0) + 1):
2971                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
2972                                 fragments.append({
2973                                     location_key(segment_uri): segment_uri,
2974                                     'duration': duration,
2975                                 })
2976                                 segment_index += 1
2977                         representation_ms_info['fragments'] = fragments
2978                     elif 'segment_urls' in representation_ms_info:
2979                         # Segment URLs with no SegmentTimeline
2980                         # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2981                         # https://github.com/ytdl-org/youtube-dl/pull/14844
2982                         fragments = []
2983                         segment_duration = float_or_none(
2984                             representation_ms_info['segment_duration'],
2985                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2986                         for segment_url in representation_ms_info['segment_urls']:
2987                             fragment = {
2988                                 location_key(segment_url): segment_url,
2989                             }
2990                             if segment_duration:
2991                                 fragment['duration'] = segment_duration
2992                             fragments.append(fragment)
2993                         representation_ms_info['fragments'] = fragments
2994                     # If there is a fragments key available then we correctly recognized fragmented media.
2995                     # Otherwise we will assume unfragmented media with direct access. Technically, such
2996                     # assumption is not necessarily correct since we may simply have no support for
2997                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2998                     if 'fragments' in representation_ms_info:
2999                         f.update({
3000                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
3001                             'url': mpd_url or base_url,
3002                             'fragment_base_url': base_url,
3003                             'fragments': [],
3004                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
3005                         })
3006                         if 'initialization_url' in representation_ms_info:
3007                             initialization_url = representation_ms_info['initialization_url']
3008                             if not f.get('url'):
3009                                 f['url'] = initialization_url
3010                             f['fragments'].append({location_key(initialization_url): initialization_url})
3011                         f['fragments'].extend(representation_ms_info['fragments'])
3012                         if not period_duration:
3013                             period_duration = try_get(
3014                                 representation_ms_info,
3015                                 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
3016                     else:
3017                         # Assuming direct URL to unfragmented media.
3018                         f['url'] = base_url
3019                     if content_type in ('video', 'audio', 'image/jpeg'):
3020                         f['manifest_stream_number'] = stream_numbers[f['url']]
3021                         stream_numbers[f['url']] += 1
3022                         formats.append(f)
3023                     elif content_type == 'text':
3024                         subtitles.setdefault(lang or 'und', []).append(f)
3025
3026         return formats, subtitles
3027
3028     def _extract_ism_formats(self, *args, **kwargs):
3029         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
3030         if subs:
3031             self._report_ignoring_subs('ISM')
3032         return fmts
3033
3034     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
3035         res = self._download_xml_handle(
3036             ism_url, video_id,
3037             note='Downloading ISM manifest' if note is None else note,
3038             errnote='Failed to download ISM manifest' if errnote is None else errnote,
3039             fatal=fatal, data=data, headers=headers, query=query)
3040         if res is False:
3041             return [], {}
3042         ism_doc, urlh = res
3043         if ism_doc is None:
3044             return [], {}
3045
3046         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
3047
3048     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
3049         """
3050         Parse formats from ISM manifest.
3051         References:
3052          1. [MS-SSTR]: Smooth Streaming Protocol,
3053             https://msdn.microsoft.com/en-us/library/ff469518.aspx
3054         """
3055         if ism_doc.get('IsLive') == 'TRUE':
3056             return [], {}
3057
3058         duration = int(ism_doc.attrib['Duration'])
3059         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
3060
3061         formats = []
3062         subtitles = {}
3063         for stream in ism_doc.findall('StreamIndex'):
3064             stream_type = stream.get('Type')
3065             if stream_type not in ('video', 'audio', 'text'):
3066                 continue
3067             url_pattern = stream.attrib['Url']
3068             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
3069             stream_name = stream.get('Name')
3070             stream_language = stream.get('Language', 'und')
3071             for track in stream.findall('QualityLevel'):
3072                 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
3073                 # TODO: add support for WVC1 and WMAP
3074                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
3075                     self.report_warning('%s is not a supported codec' % fourcc)
3076                     continue
3077                 tbr = int(track.attrib['Bitrate']) // 1000
3078                 # [1] does not mention Width and Height attributes. However,
3079                 # they're often present while MaxWidth and MaxHeight are
3080                 # missing, so should be used as fallbacks
3081                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3082                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3083                 sampling_rate = int_or_none(track.get('SamplingRate'))
3084
3085                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3086                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
3087
3088                 fragments = []
3089                 fragment_ctx = {
3090                     'time': 0,
3091                 }
3092                 stream_fragments = stream.findall('c')
3093                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3094                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3095                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3096                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3097                     if not fragment_ctx['duration']:
3098                         try:
3099                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
3100                         except IndexError:
3101                             next_fragment_time = duration
3102                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3103                     for _ in range(fragment_repeat):
3104                         fragments.append({
3105                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
3106                             'duration': fragment_ctx['duration'] / stream_timescale,
3107                         })
3108                         fragment_ctx['time'] += fragment_ctx['duration']
3109
3110                 if stream_type == 'text':
3111                     subtitles.setdefault(stream_language, []).append({
3112                         'ext': 'ismt',
3113                         'protocol': 'ism',
3114                         'url': ism_url,
3115                         'manifest_url': ism_url,
3116                         'fragments': fragments,
3117                         '_download_params': {
3118                             'stream_type': stream_type,
3119                             'duration': duration,
3120                             'timescale': stream_timescale,
3121                             'fourcc': fourcc,
3122                             'language': stream_language,
3123                             'codec_private_data': track.get('CodecPrivateData'),
3124                         }
3125                     })
3126                 elif stream_type in ('video', 'audio'):
3127                     formats.append({
3128                         'format_id': join_nonempty(ism_id, stream_name, tbr),
3129                         'url': ism_url,
3130                         'manifest_url': ism_url,
3131                         'ext': 'ismv' if stream_type == 'video' else 'isma',
3132                         'width': width,
3133                         'height': height,
3134                         'tbr': tbr,
3135                         'asr': sampling_rate,
3136                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
3137                         'acodec': 'none' if stream_type == 'video' else fourcc,
3138                         'protocol': 'ism',
3139                         'fragments': fragments,
3140                         'has_drm': ism_doc.find('Protection') is not None,
3141                         '_download_params': {
3142                             'stream_type': stream_type,
3143                             'duration': duration,
3144                             'timescale': stream_timescale,
3145                             'width': width or 0,
3146                             'height': height or 0,
3147                             'fourcc': fourcc,
3148                             'language': stream_language,
3149                             'codec_private_data': track.get('CodecPrivateData'),
3150                             'sampling_rate': sampling_rate,
3151                             'channels': int_or_none(track.get('Channels', 2)),
3152                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3153                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3154                         },
3155                     })
3156         return formats, subtitles
3157
3158     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
3159         def absolute_url(item_url):
3160             return urljoin(base_url, item_url)
3161
3162         def parse_content_type(content_type):
3163             if not content_type:
3164                 return {}
3165             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3166             if ctr:
3167                 mimetype, codecs = ctr.groups()
3168                 f = parse_codecs(codecs)
3169                 f['ext'] = mimetype2ext(mimetype)
3170                 return f
3171             return {}
3172
3173         def _media_formats(src, cur_media_type, type_info={}):
3174             full_url = absolute_url(src)
3175             ext = type_info.get('ext') or determine_ext(full_url)
3176             if ext == 'm3u8':
3177                 is_plain_url = False
3178                 formats = self._extract_m3u8_formats(
3179                     full_url, video_id, ext='mp4',
3180                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3181                     preference=preference, quality=quality, fatal=False)
3182             elif ext == 'mpd':
3183                 is_plain_url = False
3184                 formats = self._extract_mpd_formats(
3185                     full_url, video_id, mpd_id=mpd_id, fatal=False)
3186             else:
3187                 is_plain_url = True
3188                 formats = [{
3189                     'url': full_url,
3190                     'vcodec': 'none' if cur_media_type == 'audio' else None,
3191                 }]
3192             return is_plain_url, formats
3193
3194         entries = []
3195         # amp-video and amp-audio are very similar to their HTML5 counterparts
3196         # so we wll include them right here (see
3197         # https://www.ampproject.org/docs/reference/components/amp-video)
3198         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3199         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3200         media_tags = [(media_tag, media_tag_name, media_type, '')
3201                       for media_tag, media_tag_name, media_type
3202                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3203         media_tags.extend(re.findall(
3204             # We only allow video|audio followed by a whitespace or '>'.
3205             # Allowing more characters may end up in significant slow down (see
3206             # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
3207             # http://www.porntrex.com/maps/videositemap.xml).
3208             r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3209         for media_tag, _, media_type, media_content in media_tags:
3210             media_info = {
3211                 'formats': [],
3212                 'subtitles': {},
3213             }
3214             media_attributes = extract_attributes(media_tag)
3215             src = strip_or_none(media_attributes.get('src'))
3216             if src:
3217                 _, formats = _media_formats(src, media_type)
3218                 media_info['formats'].extend(formats)
3219             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3220             if media_content:
3221                 for source_tag in re.findall(r'<source[^>]+>', media_content):
3222                     s_attr = extract_attributes(source_tag)
3223                     # data-video-src and data-src are non standard but seen
3224                     # several times in the wild
3225                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
3226                     if not src:
3227                         continue
3228                     f = parse_content_type(s_attr.get('type'))
3229                     is_plain_url, formats = _media_formats(src, media_type, f)
3230                     if is_plain_url:
3231                         # width, height, res, label and title attributes are
3232                         # all not standard but seen several times in the wild
3233                         labels = [
3234                             s_attr.get(lbl)
3235                             for lbl in ('label', 'title')
3236                             if str_or_none(s_attr.get(lbl))
3237                         ]
3238                         width = int_or_none(s_attr.get('width'))
3239                         height = (int_or_none(s_attr.get('height'))
3240                                   or int_or_none(s_attr.get('res')))
3241                         if not width or not height:
3242                             for lbl in labels:
3243                                 resolution = parse_resolution(lbl)
3244                                 if not resolution:
3245                                     continue
3246                                 width = width or resolution.get('width')
3247                                 height = height or resolution.get('height')
3248                         for lbl in labels:
3249                             tbr = parse_bitrate(lbl)
3250                             if tbr:
3251                                 break
3252                         else:
3253                             tbr = None
3254                         f.update({
3255                             'width': width,
3256                             'height': height,
3257                             'tbr': tbr,
3258                             'format_id': s_attr.get('label') or s_attr.get('title'),
3259                         })
3260                         f.update(formats[0])
3261                         media_info['formats'].append(f)
3262                     else:
3263                         media_info['formats'].extend(formats)
3264                 for track_tag in re.findall(r'<track[^>]+>', media_content):
3265                     track_attributes = extract_attributes(track_tag)
3266                     kind = track_attributes.get('kind')
3267                     if not kind or kind in ('subtitles', 'captions'):
3268                         src = strip_or_none(track_attributes.get('src'))
3269                         if not src:
3270                             continue
3271                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3272                         media_info['subtitles'].setdefault(lang, []).append({
3273                             'url': absolute_url(src),
3274                         })
3275             for f in media_info['formats']:
3276                 f.setdefault('http_headers', {})['Referer'] = base_url
3277             if media_info['formats'] or media_info['subtitles']:
3278                 entries.append(media_info)
3279         return entries
3280
3281     def _extract_akamai_formats(self, *args, **kwargs):
3282         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3283         if subs:
3284             self._report_ignoring_subs('akamai')
3285         return fmts
3286
3287     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3288         signed = 'hdnea=' in manifest_url
3289         if not signed:
3290             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3291             manifest_url = re.sub(
3292                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3293                 '', manifest_url).strip('?')
3294
3295         formats = []
3296         subtitles = {}
3297
3298         hdcore_sign = 'hdcore=3.7.0'
3299         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3300         hds_host = hosts.get('hds')
3301         if hds_host:
3302             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3303         if 'hdcore=' not in f4m_url:
3304             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3305         f4m_formats = self._extract_f4m_formats(
3306             f4m_url, video_id, f4m_id='hds', fatal=False)
3307         for entry in f4m_formats:
3308             entry.update({'extra_param_to_segment_url': hdcore_sign})
3309         formats.extend(f4m_formats)
3310
3311         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3312         hls_host = hosts.get('hls')
3313         if hls_host:
3314             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3315         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3316             m3u8_url, video_id, 'mp4', 'm3u8_native',
3317             m3u8_id='hls', fatal=False)
3318         formats.extend(m3u8_formats)
3319         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3320
3321         http_host = hosts.get('http')
3322         if http_host and m3u8_formats and not signed:
3323             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3324             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3325             qualities_length = len(qualities)
3326             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3327                 i = 0
3328                 for f in m3u8_formats:
3329                     if f['vcodec'] != 'none':
3330                         for protocol in ('http', 'https'):
3331                             http_f = f.copy()
3332                             del http_f['manifest_url']
3333                             http_url = re.sub(
3334                                 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
3335                             http_f.update({
3336                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3337                                 'url': http_url,
3338                                 'protocol': protocol,
3339                             })
3340                             formats.append(http_f)
3341                         i += 1
3342
3343         return formats, subtitles
3344
3345     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3346         query = compat_urlparse.urlparse(url).query
3347         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3348         mobj = re.search(
3349             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3350         url_base = mobj.group('url')
3351         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3352         formats = []
3353
3354         def manifest_url(manifest):
3355             m_url = '%s/%s' % (http_base_url, manifest)
3356             if query:
3357                 m_url += '?%s' % query
3358             return m_url
3359
3360         if 'm3u8' not in skip_protocols:
3361             formats.extend(self._extract_m3u8_formats(
3362                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3363                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3364         if 'f4m' not in skip_protocols:
3365             formats.extend(self._extract_f4m_formats(
3366                 manifest_url('manifest.f4m'),
3367                 video_id, f4m_id='hds', fatal=False))
3368         if 'dash' not in skip_protocols:
3369             formats.extend(self._extract_mpd_formats(
3370                 manifest_url('manifest.mpd'),
3371                 video_id, mpd_id='dash', fatal=False))
3372         if re.search(r'(?:/smil:|\.smil)', url_base):
3373             if 'smil' not in skip_protocols:
3374                 rtmp_formats = self._extract_smil_formats(
3375                     manifest_url('jwplayer.smil'),
3376                     video_id, fatal=False)
3377                 for rtmp_format in rtmp_formats:
3378                     rtsp_format = rtmp_format.copy()
3379                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3380                     del rtsp_format['play_path']
3381                     del rtsp_format['ext']
3382                     rtsp_format.update({
3383                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3384                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3385                         'protocol': 'rtsp',
3386                     })
3387                     formats.extend([rtmp_format, rtsp_format])
3388         else:
3389             for protocol in ('rtmp', 'rtsp'):
3390                 if protocol not in skip_protocols:
3391                     formats.append({
3392                         'url': '%s:%s' % (protocol, url_base),
3393                         'format_id': protocol,
3394                         'protocol': protocol,
3395                     })
3396         return formats
3397
3398     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3399         mobj = re.search(
3400             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3401             webpage)
3402         if mobj:
3403             try:
3404                 jwplayer_data = self._parse_json(mobj.group('options'),
3405                                                  video_id=video_id,
3406                                                  transform_source=transform_source)
3407             except ExtractorError:
3408                 pass
3409             else:
3410                 if isinstance(jwplayer_data, dict):
3411                     return jwplayer_data
3412
3413     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3414         jwplayer_data = self._find_jwplayer_data(
3415             webpage, video_id, transform_source=js_to_json)
3416         return self._parse_jwplayer_data(
3417             jwplayer_data, video_id, *args, **kwargs)
3418
3419     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3420                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3421         # JWPlayer backward compatibility: flattened playlists
3422         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3423         if 'playlist' not in jwplayer_data:
3424             jwplayer_data = {'playlist': [jwplayer_data]}
3425
3426         entries = []
3427
3428         # JWPlayer backward compatibility: single playlist item
3429         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3430         if not isinstance(jwplayer_data['playlist'], list):
3431             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3432
3433         for video_data in jwplayer_data['playlist']:
3434             # JWPlayer backward compatibility: flattened sources
3435             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3436             if 'sources' not in video_data:
3437                 video_data['sources'] = [video_data]
3438
3439             this_video_id = video_id or video_data['mediaid']
3440
3441             formats = self._parse_jwplayer_formats(
3442                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3443                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3444
3445             subtitles = {}
3446             tracks = video_data.get('tracks')
3447             if tracks and isinstance(tracks, list):
3448                 for track in tracks:
3449                     if not isinstance(track, dict):
3450                         continue
3451                     track_kind = track.get('kind')
3452                     if not track_kind or not isinstance(track_kind, compat_str):
3453                         continue
3454                     if track_kind.lower() not in ('captions', 'subtitles'):
3455                         continue
3456                     track_url = urljoin(base_url, track.get('file'))
3457                     if not track_url:
3458                         continue
3459                     subtitles.setdefault(track.get('label') or 'en', []).append({
3460                         'url': self._proto_relative_url(track_url)
3461                     })
3462
3463             entry = {
3464                 'id': this_video_id,
3465                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3466                 'description': clean_html(video_data.get('description')),
3467                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3468                 'timestamp': int_or_none(video_data.get('pubdate')),
3469                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3470                 'subtitles': subtitles,
3471             }
3472             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3473             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3474                 entry.update({
3475                     '_type': 'url_transparent',
3476                     'url': formats[0]['url'],
3477                 })
3478             else:
3479                 self._sort_formats(formats)
3480                 entry['formats'] = formats
3481             entries.append(entry)
3482         if len(entries) == 1:
3483             return entries[0]
3484         else:
3485             return self.playlist_result(entries)
3486
3487     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3488                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3489         urls = []
3490         formats = []
3491         for source in jwplayer_sources_data:
3492             if not isinstance(source, dict):
3493                 continue
3494             source_url = urljoin(
3495                 base_url, self._proto_relative_url(source.get('file')))
3496             if not source_url or source_url in urls:
3497                 continue
3498             urls.append(source_url)
3499             source_type = source.get('type') or ''
3500             ext = mimetype2ext(source_type) or determine_ext(source_url)
3501             if source_type == 'hls' or ext == 'm3u8':
3502                 formats.extend(self._extract_m3u8_formats(
3503                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3504                     m3u8_id=m3u8_id, fatal=False))
3505             elif source_type == 'dash' or ext == 'mpd':
3506                 formats.extend(self._extract_mpd_formats(
3507                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3508             elif ext == 'smil':
3509                 formats.extend(self._extract_smil_formats(
3510                     source_url, video_id, fatal=False))
3511             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3512             elif source_type.startswith('audio') or ext in (
3513                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3514                 formats.append({
3515                     'url': source_url,
3516                     'vcodec': 'none',
3517                     'ext': ext,
3518                 })
3519             else:
3520                 height = int_or_none(source.get('height'))
3521                 if height is None:
3522                     # Often no height is provided but there is a label in
3523                     # format like "1080p", "720p SD", or 1080.
3524                     height = int_or_none(self._search_regex(
3525                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3526                         'height', default=None))
3527                 a_format = {
3528                     'url': source_url,
3529                     'width': int_or_none(source.get('width')),
3530                     'height': height,
3531                     'tbr': int_or_none(source.get('bitrate')),
3532                     'ext': ext,
3533                 }
3534                 if source_url.startswith('rtmp'):
3535                     a_format['ext'] = 'flv'
3536                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3537                     # of jwplayer.flash.swf
3538                     rtmp_url_parts = re.split(
3539                         r'((?:mp4|mp3|flv):)', source_url, 1)
3540                     if len(rtmp_url_parts) == 3:
3541                         rtmp_url, prefix, play_path = rtmp_url_parts
3542                         a_format.update({
3543                             'url': rtmp_url,
3544                             'play_path': prefix + play_path,
3545                         })
3546                     if rtmp_params:
3547                         a_format.update(rtmp_params)
3548                 formats.append(a_format)
3549         return formats
3550
3551     def _live_title(self, name):
3552         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3553         return name
3554
3555     def _int(self, v, name, fatal=False, **kwargs):
3556         res = int_or_none(v, **kwargs)
3557         if res is None:
3558             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3559             if fatal:
3560                 raise ExtractorError(msg)
3561             else:
3562                 self.report_warning(msg)
3563         return res
3564
3565     def _float(self, v, name, fatal=False, **kwargs):
3566         res = float_or_none(v, **kwargs)
3567         if res is None:
3568             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3569             if fatal:
3570                 raise ExtractorError(msg)
3571             else:
3572                 self.report_warning(msg)
3573         return res
3574
3575     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3576                     path='/', secure=False, discard=False, rest={}, **kwargs):
3577         cookie = compat_cookiejar_Cookie(
3578             0, name, value, port, port is not None, domain, True,
3579             domain.startswith('.'), path, True, secure, expire_time,
3580             discard, None, None, rest)
3581         self._downloader.cookiejar.set_cookie(cookie)
3582
3583     def _get_cookies(self, url):
3584         """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3585         req = sanitized_Request(url)
3586         self._downloader.cookiejar.add_cookie_header(req)
3587         return compat_cookies_SimpleCookie(req.get_header('Cookie'))
3588
3589     def _apply_first_set_cookie_header(self, url_handle, cookie):
3590         """
3591         Apply first Set-Cookie header instead of the last. Experimental.
3592
3593         Some sites (e.g. [1-3]) may serve two cookies under the same name
3594         in Set-Cookie header and expect the first (old) one to be set rather
3595         than second (new). However, as of RFC6265 the newer one cookie
3596         should be set into cookie store what actually happens.
3597         We will workaround this issue by resetting the cookie to
3598         the first one manually.
3599         1. https://new.vk.com/
3600         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3601         3. https://learning.oreilly.com/
3602         """
3603         for header, cookies in url_handle.headers.items():
3604             if header.lower() != 'set-cookie':
3605                 continue
3606             if sys.version_info[0] >= 3:
3607                 cookies = cookies.encode('iso-8859-1')
3608             cookies = cookies.decode('utf-8')
3609             cookie_value = re.search(
3610                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3611             if cookie_value:
3612                 value, domain = cookie_value.groups()
3613                 self._set_cookie(domain, cookie, value)
3614                 break
3615
3616     def get_testcases(self, include_onlymatching=False):
3617         t = getattr(self, '_TEST', None)
3618         if t:
3619             assert not hasattr(self, '_TESTS'), \
3620                 '%s has _TEST and _TESTS' % type(self).__name__
3621             tests = [t]
3622         else:
3623             tests = getattr(self, '_TESTS', [])
3624         for t in tests:
3625             if not include_onlymatching and t.get('only_matching', False):
3626                 continue
3627             t['name'] = type(self).__name__[:-len('IE')]
3628             yield t
3629
3630     def is_suitable(self, age_limit):
3631         """ Test whether the extractor is generally suitable for the given
3632         age limit (i.e. pornographic sites are not, all others usually are) """
3633
3634         any_restricted = False
3635         for tc in self.get_testcases(include_onlymatching=False):
3636             if tc.get('playlist', []):
3637                 tc = tc['playlist'][0]
3638             is_restricted = age_restricted(
3639                 tc.get('info_dict', {}).get('age_limit'), age_limit)
3640             if not is_restricted:
3641                 return True
3642             any_restricted = any_restricted or is_restricted
3643         return not any_restricted
3644
3645     def extract_subtitles(self, *args, **kwargs):
3646         if (self.get_param('writesubtitles', False)
3647                 or self.get_param('listsubtitles')):
3648             return self._get_subtitles(*args, **kwargs)
3649         return {}
3650
3651     def _get_subtitles(self, *args, **kwargs):
3652         raise NotImplementedError('This method must be implemented by subclasses')
3653
3654     def extract_comments(self, *args, **kwargs):
3655         if not self.get_param('getcomments'):
3656             return None
3657         generator = self._get_comments(*args, **kwargs)
3658
3659         def extractor():
3660             comments = []
3661             interrupted = True
3662             try:
3663                 while True:
3664                     comments.append(next(generator))
3665             except StopIteration:
3666                 interrupted = False
3667             except KeyboardInterrupt:
3668                 self.to_screen('Interrupted by user')
3669             except Exception as e:
3670                 if self.get_param('ignoreerrors') is not True:
3671                     raise
3672                 self._downloader.report_error(e)
3673             comment_count = len(comments)
3674             self.to_screen(f'Extracted {comment_count} comments')
3675             return {
3676                 'comments': comments,
3677                 'comment_count': None if interrupted else comment_count
3678             }
3679         return extractor
3680
3681     def _get_comments(self, *args, **kwargs):
3682         raise NotImplementedError('This method must be implemented by subclasses')
3683
3684     @staticmethod
3685     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3686         """ Merge subtitle items for one language. Items with duplicated URLs/data
3687         will be dropped. """
3688         list1_data = set([item.get('url') or item['data'] for item in subtitle_list1])
3689         ret = list(subtitle_list1)
3690         ret.extend([item for item in subtitle_list2 if (item.get('url') or item['data']) not in list1_data])
3691         return ret
3692
3693     @classmethod
3694     def _merge_subtitles(cls, *dicts, target=None):
3695         """ Merge subtitle dictionaries, language by language. """
3696         if target is None:
3697             target = {}
3698         for d in dicts:
3699             for lang, subs in d.items():
3700                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3701         return target
3702
3703     def extract_automatic_captions(self, *args, **kwargs):
3704         if (self.get_param('writeautomaticsub', False)
3705                 or self.get_param('listsubtitles')):
3706             return self._get_automatic_captions(*args, **kwargs)
3707         return {}
3708
3709     def _get_automatic_captions(self, *args, **kwargs):
3710         raise NotImplementedError('This method must be implemented by subclasses')
3711
3712     def mark_watched(self, *args, **kwargs):
3713         if not self.get_param('mark_watched', False):
3714             return
3715         if (self.supports_login() and self._get_login_info()[0] is not None
3716                 or self.get_param('cookiefile') or self.get_param('cookiesfrombrowser')):
3717             self._mark_watched(*args, **kwargs)
3718
3719     def _mark_watched(self, *args, **kwargs):
3720         raise NotImplementedError('This method must be implemented by subclasses')
3721
3722     def geo_verification_headers(self):
3723         headers = {}
3724         geo_verification_proxy = self.get_param('geo_verification_proxy')
3725         if geo_verification_proxy:
3726             headers['Ytdl-request-proxy'] = geo_verification_proxy
3727         return headers
3728
3729     def _generic_id(self, url):
3730         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3731
3732     def _generic_title(self, url):
3733         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3734
3735     @staticmethod
3736     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3737         all_known = all(map(
3738             lambda x: x is not None,
3739             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3740         return (
3741             'private' if is_private
3742             else 'premium_only' if needs_premium
3743             else 'subscriber_only' if needs_subscription
3744             else 'needs_auth' if needs_auth
3745             else 'unlisted' if is_unlisted
3746             else 'public' if all_known
3747             else None)
3748
3749     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3750         '''
3751         @returns            A list of values for the extractor argument given by "key"
3752                             or "default" if no such key is present
3753         @param default      The default value to return when the key is not present (default: [])
3754         @param casesense    When false, the values are converted to lower case
3755         '''
3756         val = traverse_obj(
3757             self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key))
3758         if val is None:
3759             return [] if default is NO_DEFAULT else default
3760         return list(val) if casesense else [x.lower() for x in val]
3761
3762     def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3763         if not playlist_id or not video_id:
3764             return not video_id
3765
3766         no_playlist = (smuggled_data or {}).get('force_noplaylist')
3767         if no_playlist is not None:
3768             return not no_playlist
3769
3770         video_id = '' if video_id is True else f' {video_id}'
3771         playlist_id = '' if playlist_id is True else f' {playlist_id}'
3772         if self.get_param('noplaylist'):
3773             self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3774             return False
3775         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3776         return True
3777
3778
class SearchInfoExtractor(InfoExtractor):
    """
    Common base for extractors that handle paged search queries.

    Queries have the form "<_SEARCH_KEY><prefix>:<query>", where the prefix
    is empty (one result), a positive integer (that many results) or "all"
    (up to _MAX_RESULTS results).
    Subclasses should define _SEARCH_KEY and optionally _MAX_RESULTS.
    """

    # Upper bound on the number of results; unlimited unless overridden
    _MAX_RESULTS = float('inf')

    @classmethod
    def _make_valid_url(cls):
        """Return the regex matching search queries for this extractor's _SEARCH_KEY."""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    def _real_extract(self, query):
        """Parse the query's prefix into a result count and fetch that many results."""
        prefix, query = self._match_valid_url(query).group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        requested = int(prefix)
        if requested <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (requested, query))
        if requested > self._MAX_RESULTS:
            # Clamp to what the service can deliver and tell the user
            self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, requested))
            requested = self._MAX_RESULTS
        return self._get_n_results(query, requested)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        # An infinite n means "no limit", which islice expresses as None
        stop = None if n == float('inf') else n
        entries = itertools.islice(self._search_results(query), 0, stop)
        return self.playlist_result(entries, query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        """Read-only access to the extractor's _SEARCH_KEY."""
        return self._SEARCH_KEY