yt_dlp/extractor/common.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import base64
   5 import collections
   6 import hashlib
   7 import itertools
   8 import json
   9 import netrc
  10 import os
  11 import random
  12 import re
  13 import sys
  14 import time
  15 import math
  16
  17 from ..compat import (
  18     compat_cookiejar_Cookie,
  19     compat_cookies_SimpleCookie,
  20     compat_etree_Element,
  21     compat_etree_fromstring,
  22     compat_expanduser,
  23     compat_getpass,
  24     compat_http_client,
  25     compat_os_name,
  26     compat_str,
  27     compat_urllib_error,
  28     compat_urllib_parse_unquote,
  29     compat_urllib_parse_urlencode,
  30     compat_urllib_request,
  31     compat_urlparse,
  32     compat_xml_parse_error,
  33 )
  34 from ..downloader import FileDownloader
  35 from ..downloader.f4m import (
  36     get_base_url,
  37     remove_encrypted_media,
  38 )
  39 from ..utils import (
  40     age_restricted,
  41     base_url,
  42     bug_reports_message,
  43     clean_html,
  44     compiled_regex_type,
  45     determine_ext,
  46     determine_protocol,
  47     dict_get,
  48     encode_data_uri,
  49     error_to_compat_str,
  50     extract_attributes,
  51     ExtractorError,
  52     fix_xml_ampersands,
  53     float_or_none,
  54     format_field,
  55     GeoRestrictedError,
  56     GeoUtils,
  57     int_or_none,
  58     join_nonempty,
  59     js_to_json,
  60     JSON_LD_RE,
  61     mimetype2ext,
  62     network_exceptions,
  63     NO_DEFAULT,
  64     orderedSet,
  65     parse_bitrate,
  66     parse_codecs,
  67     parse_duration,
  68     parse_iso8601,
  69     parse_m3u8_attributes,
  70     parse_resolution,
  71     RegexNotFoundError,
  72     sanitize_filename,
  73     sanitized_Request,
  74     str_or_none,
  75     str_to_int,
  76     strip_or_none,
  77     traverse_obj,
  78     unescapeHTML,
  79     UnsupportedError,
  80     unified_strdate,
  81     unified_timestamp,
  82     update_Request,
  83     update_url_query,
  84     url_basename,
  85     url_or_none,
  86     urljoin,
  87     variadic,
  88     xpath_element,
  89     xpath_text,
  90     xpath_with_ns,
  91 )
  92
  93
  94 class InfoExtractor(object):
  95     """Information Extractor class.
  96
  97     Information extractors are the classes that, given a URL, extract
  98     information about the video (or videos) the URL refers to. This
  99     information includes the real video URL, the video title, author and
 100     others. The information is stored in a dictionary which is then
 101     passed to the YoutubeDL. The YoutubeDL processes this
 102     information possibly downloading the video to the file system, among
 103     other possible outcomes.
 104
 105     The type field determines the type of the result.
 106     By far the most common value (and the default if _type is missing) is
 107     "video", which indicates a single video.
 108
 109     For a video, the dictionaries must include the following fields:
 110
 111     id:             Video identifier.
 112     title:          Video title, unescaped.
 113
 114     Additionally, it must contain either a formats entry or a url one:
 115
 116     formats:        A list of dictionaries for each format available, ordered
 117                     from worst to best quality.
 118
 119                     Potential fields:
 120                     * url        The mandatory URL representing the media:
 121                                    for plain file media - HTTP URL of this file,
 122                                    for RTMP - RTMP URL,
 123                                    for HLS - URL of the M3U8 media playlist,
 124                                    for HDS - URL of the F4M manifest,
 125                                    for DASH
 126                                      - HTTP URL to plain file media (in case of
 127                                        unfragmented media)
 128                                      - URL of the MPD manifest or base URL
 129                                        representing the media if MPD manifest
 130                                        is parsed from a string (in case of
 131                                        fragmented media)
 132                                    for MSS - URL of the ISM manifest.
 133                     * manifest_url
 134                                  The URL of the manifest file in case of
 135                                  fragmented media:
 136                                    for HLS - URL of the M3U8 master playlist,
 137                                    for HDS - URL of the F4M manifest,
 138                                    for DASH - URL of the MPD manifest,
 139                                    for MSS - URL of the ISM manifest.
 140                     * ext        Will be calculated from URL if missing
 141                     * format     A human-readable description of the format
 142                                  ("mp4 container with h264/opus").
 143                                  Calculated from the format_id, width, height.
 144                                  and format_note fields if missing.
 145                     * format_id  A short description of the format
 146                                  ("mp4_h264_opus" or "19").
 147                                 Technically optional, but strongly recommended.
 148                     * format_note Additional info about the format
 149                                  ("3D" or "DASH video")
 150                     * width      Width of the video, if known
 151                     * height     Height of the video, if known
 152                     * resolution Textual description of width and height
 153                     * dynamic_range The dynamic range of the video. One of:
 154                                  "SDR" (None), "HDR10", "HDR10+, "HDR12", "HLG, "DV"
 155                     * tbr        Average bitrate of audio and video in KBit/s
 156                     * abr        Average audio bitrate in KBit/s
 157                     * acodec     Name of the audio codec in use
 158                     * asr        Audio sampling rate in Hertz
 159                     * vbr        Average video bitrate in KBit/s
 160                     * fps        Frame rate
 161                     * vcodec     Name of the video codec in use
 162                     * container  Name of the container format
 163                     * filesize   The number of bytes, if known in advance
 164                     * filesize_approx  An estimate for the number of bytes
 165                     * player_url SWF Player URL (used for rtmpdump).
 166                     * protocol   The protocol that will be used for the actual
 167                                  download, lower-case. One of "http", "https" or
 168                                  one of the protocols defined in downloader.PROTOCOL_MAP
 169                     * fragment_base_url
 170                                  Base URL for fragments. Each fragment's path
 171                                  value (if present) will be relative to
 172                                  this URL.
 173                     * fragments  A list of fragments of a fragmented media.
 174                                  Each fragment entry must contain either an url
 175                                  or a path. If an url is present it should be
 176                                  considered by a client. Otherwise both path and
 177                                  fragment_base_url must be present. Here is
 178                                  the list of all potential fields:
 179                                  * "url" - fragment's URL
 180                                  * "path" - fragment's path relative to
 181                                             fragment_base_url
 182                                  * "duration" (optional, int or float)
 183                                  * "filesize" (optional, int)
 184                     * is_from_start  Is a live format that can be downloaded
 185                                 from the start. Boolean
 186                     * preference Order number of this format. If this field is
 187                                  present and not None, the formats get sorted
 188                                  by this field, regardless of all other values.
 189                                  -1 for default (order by other properties),
 190                                  -2 or smaller for less than default.
 191                                  < -1000 to hide the format (if there is
 192                                     another one which is strictly better)
 193                     * language   Language code, e.g. "de" or "en-US".
 194                     * language_preference  Is this in the language mentioned in
 195                                  the URL?
 196                                  10 if it's what the URL is about,
 197                                  -1 for default (don't know),
 198                                  -10 otherwise, other values reserved for now.
 199                     * quality    Order number of the video quality of this
 200                                  format, irrespective of the file format.
 201                                  -1 for default (order by other properties),
 202                                  -2 or smaller for less than default.
 203                     * source_preference  Order number for this video source
 204                                   (quality takes higher priority)
 205                                  -1 for default (order by other properties),
 206                                  -2 or smaller for less than default.
 207                     * http_headers  A dictionary of additional HTTP headers
 208                                  to add to the request.
 209                     * stretched_ratio  If given and not 1, indicates that the
 210                                  video's pixels are not square.
 211                                  width : height ratio as float.
 212                     * no_resume  The server does not support resuming the
 213                                  (HTTP or RTMP) download. Boolean.
 214                     * has_drm    The format has DRM and cannot be downloaded. Boolean
 215                     * downloader_options  A dictionary of downloader options as
 216                                  described in FileDownloader
 217                     RTMP formats can also have the additional fields: page_url,
 218                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 219                     rtmp_protocol, rtmp_real_time
 220
 221     url:            Final video URL.
 222     ext:            Video filename extension.
 223     format:         The video format, defaults to ext (used for --get-format)
 224     player_url:     SWF Player URL (used for rtmpdump).
 225
 226     The following fields are optional:
 227
 228     alt_title:      A secondary title of the video.
 229     display_id      An alternative identifier for the video, not necessarily
 230                     unique, but available before title. Typically, id is
 231                     something like "4234987", title "Dancing naked mole rats",
 232                     and display_id "dancing-naked-mole-rats"
 233     thumbnails:     A list of dictionaries, with the following entries:
 234                         * "id" (optional, string) - Thumbnail format ID
 235                         * "url"
 236                         * "preference" (optional, int) - quality of the image
 237                         * "width" (optional, int)
 238                         * "height" (optional, int)
 239                         * "resolution" (optional, string "{width}x{height}",
 240                                         deprecated)
 241                         * "filesize" (optional, int)
 242                         * "http_headers" (dict) - HTTP headers for the request
 243     thumbnail:      Full URL to a video thumbnail image.
 244     description:    Full video description.
 245     uploader:       Full name of the video uploader.
 246     license:        License name the video is licensed under.
 247     creator:        The creator of the video.
 248     timestamp:      UNIX timestamp of the moment the video was uploaded
 249     upload_date:    Video upload date (YYYYMMDD).
 250                     If not explicitly set, calculated from timestamp
 251     release_timestamp: UNIX timestamp of the moment the video was released.
 252                     If it is not clear whether to use timestamp or this, use the former
 253     release_date:   The date (YYYYMMDD) when the video was released.
 254                     If not explicitly set, calculated from release_timestamp
 255     modified_timestamp: UNIX timestamp of the moment the video was last modified.
 256     modified_date:   The date (YYYYMMDD) when the video was last modified.
 257                     If not explicitly set, calculated from modified_timestamp
 258     uploader_id:    Nickname or id of the video uploader.
 259     uploader_url:   Full URL to a personal webpage of the video uploader.
 260     channel:        Full name of the channel the video is uploaded on.
 261                     Note that channel fields may or may not repeat uploader
 262                     fields. This depends on a particular extractor.
 263     channel_id:     Id of the channel.
 264     channel_url:    Full URL to a channel webpage.
 265     channel_follower_count: Number of followers of the channel.
 266     location:       Physical location where the video was filmed.
 267     subtitles:      The available subtitles as a dictionary in the format
 268                     {tag: subformats}. "tag" is usually a language code, and
 269                     "subformats" is a list sorted from lower to higher
 270                     preference, each element is a dictionary with the "ext"
 271                     entry and one of:
 272                         * "data": The subtitles file contents
 273                         * "url": A URL pointing to the subtitles file
 274                     It can optionally also have:
 275                         * "name": Name or description of the subtitles
 276                         * http_headers: A dictionary of additional HTTP headers
 277                                   to add to the request.
 278                     "ext" will be calculated from URL if missing
 279     automatic_captions: Like 'subtitles'; contains automatically generated
 280                     captions instead of normal subtitles
 281     duration:       Length of the video in seconds, as an integer or float.
 282     view_count:     How many users have watched the video on the platform.
 283     like_count:     Number of positive ratings of the video
 284     dislike_count:  Number of negative ratings of the video
 285     repost_count:   Number of reposts of the video
 286     average_rating: Average rating give by users, the scale used depends on the webpage
 287     comment_count:  Number of comments on the video
 288     comments:       A list of comments, each with one or more of the following
 289                     properties (all but one of text or html optional):
 290                         * "author" - human-readable name of the comment author
 291                         * "author_id" - user ID of the comment author
 292                         * "author_thumbnail" - The thumbnail of the comment author
 293                         * "id" - Comment ID
 294                         * "html" - Comment as HTML
 295                         * "text" - Plain text of the comment
 296                         * "timestamp" - UNIX timestamp of comment
 297                         * "parent" - ID of the comment this one is replying to.
 298                                      Set to "root" to indicate that this is a
 299                                      comment to the original video.
 300                         * "like_count" - Number of positive ratings of the comment
 301                         * "dislike_count" - Number of negative ratings of the comment
 302                         * "is_favorited" - Whether the comment is marked as
 303                                            favorite by the video uploader
 304                         * "author_is_uploader" - Whether the comment is made by
 305                                                  the video uploader
 306     age_limit:      Age restriction for the video, as an integer (years)
 307     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 308                     should allow to get the same result again. (It will be set
 309                     by YoutubeDL if it's missing)
 310     categories:     A list of categories that the video falls in, for example
 311                     ["Sports", "Berlin"]
 312     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 313     cast:           A list of the video cast
 314     is_live:        True, False, or None (=unknown). Whether this video is a
 315                     live stream that goes on instead of a fixed-length video.
 316     was_live:       True, False, or None (=unknown). Whether this video was
 317                     originally a live stream.
 318     live_status:    'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
 319                     If absent, automatically set from is_live, was_live
 320     start_time:     Time in seconds where the reproduction should start, as
 321                     specified in the URL.
 322     end_time:       Time in seconds where the reproduction should end, as
 323                     specified in the URL.
 324     chapters:       A list of dictionaries, with the following entries:
 325                         * "start_time" - The start time of the chapter in seconds
 326                         * "end_time" - The end time of the chapter in seconds
 327                         * "title" (optional, string)
 328     playable_in_embed: Whether this video is allowed to play in embedded
 329                     players on other sites. Can be True (=always allowed),
 330                     False (=never allowed), None (=unknown), or a string
 331                     specifying the criteria for embedability (Eg: 'whitelist')
 332     availability:   Under what condition the video is available. One of
 333                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 334                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 335                     to set it
 336     __post_extractor: A function to be called just before the metadata is
 337                     written to either disk, logger or console. The function
 338                     must return a dict which will be added to the info_dict.
 339                     This is usefull for additional information that is
 340                     time-consuming to extract. Note that the fields thus
 341                     extracted will not be available to output template and
 342                     match_filter. So, only "comments" and "comment_count" are
 343                     currently allowed to be extracted via this method.
 344
 345     The following fields should only be used when the video belongs to some logical
 346     chapter or section:
 347
 348     chapter:        Name or title of the chapter the video belongs to.
 349     chapter_number: Number of the chapter the video belongs to, as an integer.
 350     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 351
 352     The following fields should only be used when the video is an episode of some
 353     series, programme or podcast:
 354
 355     series:         Title of the series or programme the video episode belongs to.
 356     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 357     season:         Title of the season the video episode belongs to.
 358     season_number:  Number of the season the video episode belongs to, as an integer.
 359     season_id:      Id of the season the video episode belongs to, as a unicode string.
 360     episode:        Title of the video episode. Unlike mandatory video title field,
 361                     this field should denote the exact title of the video episode
 362                     without any kind of decoration.
 363     episode_number: Number of the video episode within a season, as an integer.
 364     episode_id:     Id of the video episode, as a unicode string.
 365
 366     The following fields should only be used when the media is a track or a part of
 367     a music album:
 368
 369     track:          Title of the track.
 370     track_number:   Number of the track within an album or a disc, as an integer.
 371     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 372                     as a unicode string.
 373     artist:         Artist(s) of the track.
 374     genre:          Genre(s) of the track.
 375     album:          Title of the album the track belongs to.
 376     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 377     album_artist:   List of all artists appeared on the album (e.g.
 378                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 379                     and compilations).
 380     disc_number:    Number of the disc or other physical medium the track belongs to,
 381                     as an integer.
 382     release_year:   Year (YYYY) when the album was released.
 383     composer:       Composer of the piece
 384
 385     Unless mentioned otherwise, the fields should be Unicode strings.
 386
 387     Unless mentioned otherwise, None is equivalent to absence of information.
 388
 389
 390     _type "playlist" indicates multiple videos.
 391     There must be a key "entries", which is a list, an iterable, or a PagedList
 392     object, each element of which is a valid dictionary by this specification.
 393
 394     Additionally, playlists can have "id", "title", and any other relevent
 395     attributes with the same semantics as videos (see above).
 396
 397     It can also have the following optional fields:
 398
 399     playlist_count: The total number of videos in a playlist. If not given,
 400                     YoutubeDL tries to calculate it from "entries"
 401
 402
 403     _type "multi_video" indicates that there are multiple videos that
 404     form a single show, for examples multiple acts of an opera or TV episode.
 405     It must have an entries key like a playlist and contain all the keys
 406     required for a video at the same time.
 407
 408
 409     _type "url" indicates that the video must be extracted from another
 410     location, possibly by a different extractor. Its only required key is:
 411     "url" - the next URL to extract.
 412     The key "ie_key" can be set to the class name (minus the trailing "IE",
 413     e.g. "Youtube") if the extractor class is known in advance.
 414     Additionally, the dictionary may have any properties of the resolved entity
 415     known in advance, for example "title" if the title of the referred video is
 416     known ahead of time.
 417
 418
 419     _type "url_transparent" entities have the same specification as "url", but
 420     indicate that the given additional information is more precise than the one
 421     associated with the resolved URL.
 422     This is useful when a site employs a video service that hosts the video and
 423     its technical metadata, but that video service does not embed a useful
 424     title, description etc.
 425
 426
 427     Subclasses of this one should re-define the _real_initialize() and
 428     _real_extract() methods and define a _VALID_URL regexp.
 429     Probably, they should also be added to the list of extractors.
 430
 431     Subclasses may also override suitable() if necessary, but ensure the function
 432     signature is preserved and that this function imports everything it needs
 433     (except other extractors), so that lazy_extractors works correctly
 434
 435     _GEO_BYPASS attribute may be set to False in order to disable
 436     geo restriction bypass mechanisms for a particular extractor.
 437     Though it won't disable explicit geo restriction bypass based on
 438     country code provided with geo_bypass_country.
 439
 440     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 441     countries for this extractor. One of these countries will be used by
 442     geo restriction bypass mechanism right away in order to bypass
 443     geo restriction, of course, if the mechanism is not disabled.
 444
 445     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 446     IP blocks in CIDR notation for this extractor. One of these IP blocks
 447     will be used by geo restriction bypass mechanism similarly
 448     to _GEO_COUNTRIES.
 449
 450     The _WORKING attribute should be set to False for broken IEs
 451     in order to warn the users and skip the tests.
 452     """
 453
 454     _ready = False
 455     _downloader = None
 456     _x_forwarded_for_ip = None
 457     _GEO_BYPASS = True
 458     _GEO_COUNTRIES = None
 459     _GEO_IP_BLOCKS = None
 460     _WORKING = True
 461
 462     _LOGIN_HINTS = {
 463         'any': 'Use --cookies, --username and --password, or --netrc to provide account credentials',
 464         'cookies': (
 465             'Use --cookies-from-browser or --cookies for the authentication. '
 466             'See  https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl  for how to manually pass cookies'),
 467         'password': 'Use --username and --password, or --netrc to provide account credentials',
 468     }
 469
 470     def __init__(self, downloader=None):
 471         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 472         If a downloader is not passed during initialization,
 473         it must be set using "set_downloader()" before "extract()" is called"""
 474         self._ready = False
 475         self._x_forwarded_for_ip = None
 476         self._printed_messages = set()
 477         self.set_downloader(downloader)
 478
 479     @classmethod
 480     def _match_valid_url(cls, url):
 481         # This does not use has/getattr intentionally - we want to know whether
 482         # we have cached the regexp for *this* class, whereas getattr would also
 483         # match the superclass
 484         if '_VALID_URL_RE' not in cls.__dict__:
 485             if '_VALID_URL' not in cls.__dict__:
 486                 cls._VALID_URL = cls._make_valid_url()
 487             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 488         return cls._VALID_URL_RE.match(url)
 489
 490     @classmethod
 491     def suitable(cls, url):
 492         """Receives a URL and returns True if suitable for this IE."""
 493         # This function must import everything it needs (except other extractors),
 494         # so that lazy_extractors works correctly
 495         return cls._match_valid_url(url) is not None
 496
 497     @classmethod
 498     def _match_id(cls, url):
 499         return cls._match_valid_url(url).group('id')
 500
 501     @classmethod
 502     def get_temp_id(cls, url):
 503         try:
 504             return cls._match_id(url)
 505         except (IndexError, AttributeError):
 506             return None
 507
 508     @classmethod
 509     def working(cls):
 510         """Getter method for _WORKING."""
 511         return cls._WORKING
 512
 513     def initialize(self):
 514         """Initializes an instance (authentication, etc)."""
 515         self._printed_messages = set()
 516         self._initialize_geo_bypass({
 517             'countries': self._GEO_COUNTRIES,
 518             'ip_blocks': self._GEO_IP_BLOCKS,
 519         })
 520         if not self._ready:
 521             self._real_initialize()
 522             self._ready = True
 523
 524     def _initialize_geo_bypass(self, geo_bypass_context):
 525         """
 526         Initialize geo restriction bypass mechanism.
 527
 528         This method is used to initialize geo bypass mechanism based on faking
 529         X-Forwarded-For HTTP header. A random country from provided country list
 530         is selected and a random IP belonging to this country is generated. This
 531         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 532         HTTP requests.
 533
 534         This method will be used for initial geo bypass mechanism initialization
 535         during the instance initialization with _GEO_COUNTRIES and
 536         _GEO_IP_BLOCKS.
 537
 538         You may also manually call it from extractor's code if geo bypass
 539         information is not available beforehand (e.g. obtained during
 540         extraction) or due to some other reason. In this case you should pass
 541         this information in geo bypass context passed as first argument. It may
 542         contain following fields:
 543
 544         countries:  List of geo unrestricted countries (similar
 545                     to _GEO_COUNTRIES)
 546         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 547                     (similar to _GEO_IP_BLOCKS)
 548
 549         """
 550         if not self._x_forwarded_for_ip:
 551
 552             # Geo bypass mechanism is explicitly disabled by user
 553             if not self.get_param('geo_bypass', True):
 554                 return
 555
 556             if not geo_bypass_context:
 557                 geo_bypass_context = {}
 558
 559             # Backward compatibility: previously _initialize_geo_bypass
 560             # expected a list of countries, some 3rd party code may still use
 561             # it this way
 562             if isinstance(geo_bypass_context, (list, tuple)):
 563                 geo_bypass_context = {
 564                     'countries': geo_bypass_context,
 565                 }
 566
 567             # The whole point of geo bypass mechanism is to fake IP
 568             # as X-Forwarded-For HTTP header based on some IP block or
 569             # country code.
 570
 571             # Path 1: bypassing based on IP block in CIDR notation
 572
 573             # Explicit IP block specified by user, use it right away
 574             # regardless of whether extractor is geo bypassable or not
 575             ip_block = self.get_param('geo_bypass_ip_block', None)
 576
 577             # Otherwise use random IP block from geo bypass context but only
 578             # if extractor is known as geo bypassable
 579             if not ip_block:
 580                 ip_blocks = geo_bypass_context.get('ip_blocks')
 581                 if self._GEO_BYPASS and ip_blocks:
 582                     ip_block = random.choice(ip_blocks)
 583
 584             if ip_block:
 585                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 586                 self._downloader.write_debug(
 587                     '[debug] Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
 588                 return
 589
 590             # Path 2: bypassing based on country code
 591
 592             # Explicit country code specified by user, use it right away
 593             # regardless of whether extractor is geo bypassable or not
 594             country = self.get_param('geo_bypass_country', None)
 595
 596             # Otherwise use random country code from geo bypass context but
 597             # only if extractor is known as geo bypassable
 598             if not country:
 599                 countries = geo_bypass_context.get('countries')
 600                 if self._GEO_BYPASS and countries:
 601                     country = random.choice(countries)
 602
 603             if country:
 604                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 605                 self._downloader.write_debug(
 606                     'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))
 607
 608     def extract(self, url):
 609         """Extracts URL information and returns it in list of dicts."""
 610         try:
 611             for _ in range(2):
 612                 try:
 613                     self.initialize()
 614                     self.write_debug('Extracting URL: %s' % url)
 615                     ie_result = self._real_extract(url)
 616                     if ie_result is None:
 617                         return None
 618                     if self._x_forwarded_for_ip:
 619                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 620                     subtitles = ie_result.get('subtitles')
 621                     if (subtitles and 'live_chat' in subtitles
 622                             and 'no-live-chat' in self.get_param('compat_opts', [])):
 623                         del subtitles['live_chat']
 624                     return ie_result
 625                 except GeoRestrictedError as e:
 626                     if self.__maybe_fake_ip_and_retry(e.countries):
 627                         continue
 628                     raise
 629         except UnsupportedError:
 630             raise
 631         except ExtractorError as e:
 632             kwargs = {
 633                 'video_id': e.video_id or self.get_temp_id(url),
 634                 'ie': self.IE_NAME,
 635                 'tb': e.traceback or sys.exc_info()[2],
 636                 'expected': e.expected,
 637                 'cause': e.cause
 638             }
 639             if hasattr(e, 'countries'):
 640                 kwargs['countries'] = e.countries
 641             raise type(e)(e.msg, **kwargs)
 642         except compat_http_client.IncompleteRead as e:
 643             raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
 644         except (KeyError, StopIteration) as e:
 645             raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 646
 647     def __maybe_fake_ip_and_retry(self, countries):
 648         if (not self.get_param('geo_bypass_country', None)
 649                 and self._GEO_BYPASS
 650                 and self.get_param('geo_bypass', True)
 651                 and not self._x_forwarded_for_ip
 652                 and countries):
 653             country_code = random.choice(countries)
 654             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 655             if self._x_forwarded_for_ip:
 656                 self.report_warning(
 657                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 658                     % (self._x_forwarded_for_ip, country_code.upper()))
 659                 return True
 660         return False
 661
 662     def set_downloader(self, downloader):
 663         """Sets the downloader for this IE."""
 664         self._downloader = downloader
 665
 666     def _real_initialize(self):
 667         """Real initialization process. Redefine in subclasses."""
 668         pass
 669
 670     def _real_extract(self, url):
 671         """Real extraction process. Redefine in subclasses."""
 672         pass
 673
 674     @classmethod
 675     def ie_key(cls):
 676         """A string for getting the InfoExtractor with get_info_extractor"""
 677         return cls.__name__[:-2]
 678
 679     @property
 680     def IE_NAME(self):
 681         return compat_str(type(self).__name__[:-2])
 682
 683     @staticmethod
 684     def __can_accept_status_code(err, expected_status):
 685         assert isinstance(err, compat_urllib_error.HTTPError)
 686         if expected_status is None:
 687             return False
 688         elif callable(expected_status):
 689             return expected_status(err.code) is True
 690         else:
 691             return err.code in variadic(expected_status)
 692
 693     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
 694         """
 695         Return the response handle.
 696
 697         See _download_webpage docstring for arguments specification.
 698         """
 699         if not self._downloader._first_webpage_request:
 700             sleep_interval = self.get_param('sleep_interval_requests') or 0
 701             if sleep_interval > 0:
 702                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 703                 time.sleep(sleep_interval)
 704         else:
 705             self._downloader._first_webpage_request = False
 706
 707         if note is None:
 708             self.report_download_webpage(video_id)
 709         elif note is not False:
 710             if video_id is None:
 711                 self.to_screen('%s' % (note,))
 712             else:
 713                 self.to_screen('%s: %s' % (video_id, note))
 714
 715         # Some sites check X-Forwarded-For HTTP header in order to figure out
 716         # the origin of the client behind proxy. This allows bypassing geo
 717         # restriction by faking this header's value to IP that belongs to some
 718         # geo unrestricted country. We will do so once we encounter any
 719         # geo restriction error.
 720         if self._x_forwarded_for_ip:
 721             if 'X-Forwarded-For' not in headers:
 722                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 723
 724         if isinstance(url_or_request, compat_urllib_request.Request):
 725             url_or_request = update_Request(
 726                 url_or_request, data=data, headers=headers, query=query)
 727         else:
 728             if query:
 729                 url_or_request = update_url_query(url_or_request, query)
 730             if data is not None or headers:
 731                 url_or_request = sanitized_Request(url_or_request, data, headers)
 732         try:
 733             return self._downloader.urlopen(url_or_request)
 734         except network_exceptions as err:
 735             if isinstance(err, compat_urllib_error.HTTPError):
 736                 if self.__can_accept_status_code(err, expected_status):
 737                     # Retain reference to error to prevent file object from
 738                     # being closed before it can be read. Works around the
 739                     # effects of <https://bugs.python.org/issue15002>
 740                     # introduced in Python 3.4.1.
 741                     err.fp._error = err
 742                     return err.fp
 743
 744             if errnote is False:
 745                 return False
 746             if errnote is None:
 747                 errnote = 'Unable to download webpage'
 748
 749             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 750             if fatal:
 751                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 752             else:
 753                 self.report_warning(errmsg)
 754                 return False
 755
 756     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
 757         """
 758         Return a tuple (page content as string, URL handle).
 759
 760         See _download_webpage docstring for arguments specification.
 761         """
 762         # Strip hashes from the URL (#1038)
 763         if isinstance(url_or_request, (compat_str, str)):
 764             url_or_request = url_or_request.partition('#')[0]
 765
 766         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 767         if urlh is False:
 768             assert not fatal
 769             return False
 770         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 771         return (content, urlh)
 772
 773     @staticmethod
 774     def _guess_encoding_from_content(content_type, webpage_bytes):
 775         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 776         if m:
 777             encoding = m.group(1)
 778         else:
 779             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 780                           webpage_bytes[:1024])
 781             if m:
 782                 encoding = m.group(1).decode('ascii')
 783             elif webpage_bytes.startswith(b'\xff\xfe'):
 784                 encoding = 'utf-16'
 785             else:
 786                 encoding = 'utf-8'
 787
 788         return encoding
 789
 790     def __check_blocked(self, content):
 791         first_block = content[:512]
 792         if ('<title>Access to this site is blocked</title>' in content
 793                 and 'Websense' in first_block):
 794             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 795             blocked_iframe = self._html_search_regex(
 796                 r'<iframe src="([^"]+)"', content,
 797                 'Websense information URL', default=None)
 798             if blocked_iframe:
 799                 msg += ' Visit %s for more details' % blocked_iframe
 800             raise ExtractorError(msg, expected=True)
 801         if '<title>The URL you requested has been blocked</title>' in first_block:
 802             msg = (
 803                 'Access to this webpage has been blocked by Indian censorship. '
 804                 'Use a VPN or proxy server (with --proxy) to route around it.')
 805             block_msg = self._html_search_regex(
 806                 r'</h1><p>(.*?)</p>',
 807                 content, 'block message', default=None)
 808             if block_msg:
 809                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 810             raise ExtractorError(msg, expected=True)
 811         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 812                 and 'blocklist.rkn.gov.ru' in content):
 813             raise ExtractorError(
 814                 'Access to this webpage has been blocked by decision of the Russian government. '
 815                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 816                 expected=True)
 817
 818     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 819         content_type = urlh.headers.get('Content-Type', '')
 820         webpage_bytes = urlh.read()
 821         if prefix is not None:
 822             webpage_bytes = prefix + webpage_bytes
 823         if not encoding:
 824             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 825         if self.get_param('dump_intermediate_pages', False):
 826             self.to_screen('Dumping request to ' + urlh.geturl())
 827             dump = base64.b64encode(webpage_bytes).decode('ascii')
 828             self._downloader.to_screen(dump)
 829         if self.get_param('write_pages', False):
 830             basen = '%s_%s' % (video_id, urlh.geturl())
 831             trim_length = self.get_param('trim_file_name') or 240
 832             if len(basen) > trim_length:
 833                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 834                 basen = basen[:trim_length - len(h)] + h
 835             raw_filename = basen + '.dump'
 836             filename = sanitize_filename(raw_filename, restricted=True)
 837             self.to_screen('Saving request to ' + filename)
 838             # Working around MAX_PATH limitation on Windows (see
 839             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 840             if compat_os_name == 'nt':
 841                 absfilepath = os.path.abspath(filename)
 842                 if len(absfilepath) > 259:
 843                     filename = '\\\\?\\' + absfilepath
 844             with open(filename, 'wb') as outf:
 845                 outf.write(webpage_bytes)
 846
 847         try:
 848             content = webpage_bytes.decode(encoding, 'replace')
 849         except LookupError:
 850             content = webpage_bytes.decode('utf-8', 'replace')
 851
 852         self.__check_blocked(content)
 853
 854         return content
 855
 856     def _download_webpage(
 857             self, url_or_request, video_id, note=None, errnote=None,
 858             fatal=True, tries=1, timeout=5, encoding=None, data=None,
 859             headers={}, query={}, expected_status=None):
 860         """
 861         Return the data of the page as a string.
 862
 863         Arguments:
 864         url_or_request -- plain text URL as a string or
 865             a compat_urllib_request.Requestobject
 866         video_id -- Video/playlist/item identifier (string)
 867
 868         Keyword arguments:
 869         note -- note printed before downloading (string)
 870         errnote -- note printed in case of an error (string)
 871         fatal -- flag denoting whether error should be considered fatal,
 872             i.e. whether it should cause ExtractionError to be raised,
 873             otherwise a warning will be reported and extraction continued
 874         tries -- number of tries
 875         timeout -- sleep interval between tries
 876         encoding -- encoding for a page content decoding, guessed automatically
 877             when not explicitly specified
 878         data -- POST data (bytes)
 879         headers -- HTTP headers (dict)
 880         query -- URL query (dict)
 881         expected_status -- allows to accept failed HTTP requests (non 2xx
 882             status code) by explicitly specifying a set of accepted status
 883             codes. Can be any of the following entities:
 884                 - an integer type specifying an exact failed status code to
 885                   accept
 886                 - a list or a tuple of integer types specifying a list of
 887                   failed status codes to accept
 888                 - a callable accepting an actual failed status code and
 889                   returning True if it should be accepted
 890             Note that this argument does not affect success status codes (2xx)
 891             which are always accepted.
 892         """
 893
 894         success = False
 895         try_count = 0
 896         while success is False:
 897             try:
 898                 res = self._download_webpage_handle(
 899                     url_or_request, video_id, note, errnote, fatal,
 900                     encoding=encoding, data=data, headers=headers, query=query,
 901                     expected_status=expected_status)
 902                 success = True
 903             except compat_http_client.IncompleteRead as e:
 904                 try_count += 1
 905                 if try_count >= tries:
 906                     raise e
 907                 self._sleep(timeout, video_id)
 908         if res is False:
 909             return res
 910         else:
 911             content, _ = res
 912             return content
 913
 914     def _download_xml_handle(
 915             self, url_or_request, video_id, note='Downloading XML',
 916             errnote='Unable to download XML', transform_source=None,
 917             fatal=True, encoding=None, data=None, headers={}, query={},
 918             expected_status=None):
 919         """
 920         Return a tuple (xml as an compat_etree_Element, URL handle).
 921
 922         See _download_webpage docstring for arguments specification.
 923         """
 924         res = self._download_webpage_handle(
 925             url_or_request, video_id, note, errnote, fatal=fatal,
 926             encoding=encoding, data=data, headers=headers, query=query,
 927             expected_status=expected_status)
 928         if res is False:
 929             return res
 930         xml_string, urlh = res
 931         return self._parse_xml(
 932             xml_string, video_id, transform_source=transform_source,
 933             fatal=fatal), urlh
 934
 935     def _download_xml(
 936             self, url_or_request, video_id,
 937             note='Downloading XML', errnote='Unable to download XML',
 938             transform_source=None, fatal=True, encoding=None,
 939             data=None, headers={}, query={}, expected_status=None):
 940         """
 941         Return the xml as an compat_etree_Element.
 942
 943         See _download_webpage docstring for arguments specification.
 944         """
 945         res = self._download_xml_handle(
 946             url_or_request, video_id, note=note, errnote=errnote,
 947             transform_source=transform_source, fatal=fatal, encoding=encoding,
 948             data=data, headers=headers, query=query,
 949             expected_status=expected_status)
 950         return res if res is False else res[0]
 951
 952     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 953         if transform_source:
 954             xml_string = transform_source(xml_string)
 955         try:
 956             return compat_etree_fromstring(xml_string.encode('utf-8'))
 957         except compat_xml_parse_error as ve:
 958             errmsg = '%s: Failed to parse XML ' % video_id
 959             if fatal:
 960                 raise ExtractorError(errmsg, cause=ve)
 961             else:
 962                 self.report_warning(errmsg + str(ve))
 963
 964     def _download_json_handle(
 965             self, url_or_request, video_id, note='Downloading JSON metadata',
 966             errnote='Unable to download JSON metadata', transform_source=None,
 967             fatal=True, encoding=None, data=None, headers={}, query={},
 968             expected_status=None):
 969         """
 970         Return a tuple (JSON object, URL handle).
 971
 972         See _download_webpage docstring for arguments specification.
 973         """
 974         res = self._download_webpage_handle(
 975             url_or_request, video_id, note, errnote, fatal=fatal,
 976             encoding=encoding, data=data, headers=headers, query=query,
 977             expected_status=expected_status)
 978         if res is False:
 979             return res
 980         json_string, urlh = res
 981         return self._parse_json(
 982             json_string, video_id, transform_source=transform_source,
 983             fatal=fatal), urlh
 984
 985     def _download_json(
 986             self, url_or_request, video_id, note='Downloading JSON metadata',
 987             errnote='Unable to download JSON metadata', transform_source=None,
 988             fatal=True, encoding=None, data=None, headers={}, query={},
 989             expected_status=None):
 990         """
 991         Return the JSON object as a dict.
 992
 993         See _download_webpage docstring for arguments specification.
 994         """
 995         res = self._download_json_handle(
 996             url_or_request, video_id, note=note, errnote=errnote,
 997             transform_source=transform_source, fatal=fatal, encoding=encoding,
 998             data=data, headers=headers, query=query,
 999             expected_status=expected_status)
1000         return res if res is False else res[0]
1001
1002     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
1003         if transform_source:
1004             json_string = transform_source(json_string)
1005         try:
1006             return json.loads(json_string)
1007         except ValueError as ve:
1008             errmsg = '%s: Failed to parse JSON ' % video_id
1009             if fatal:
1010                 raise ExtractorError(errmsg, cause=ve)
1011             else:
1012                 self.report_warning(errmsg + str(ve))
1013
1014     def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
1015         return self._parse_json(
1016             data[data.find('{'):data.rfind('}') + 1],
1017             video_id, transform_source, fatal)
1018
1019     def _download_socket_json_handle(
1020             self, url_or_request, video_id, note='Polling socket',
1021             errnote='Unable to poll socket', transform_source=None,
1022             fatal=True, encoding=None, data=None, headers={}, query={},
1023             expected_status=None):
1024         """
1025         Return a tuple (JSON object, URL handle).
1026
1027         See _download_webpage docstring for arguments specification.
1028         """
1029         res = self._download_webpage_handle(
1030             url_or_request, video_id, note, errnote, fatal=fatal,
1031             encoding=encoding, data=data, headers=headers, query=query,
1032             expected_status=expected_status)
1033         if res is False:
1034             return res
1035         webpage, urlh = res
1036         return self._parse_socket_response_as_json(
1037             webpage, video_id, transform_source=transform_source,
1038             fatal=fatal), urlh
1039
1040     def _download_socket_json(
1041             self, url_or_request, video_id, note='Polling socket',
1042             errnote='Unable to poll socket', transform_source=None,
1043             fatal=True, encoding=None, data=None, headers={}, query={},
1044             expected_status=None):
1045         """
1046         Return the JSON object as a dict.
1047
1048         See _download_webpage docstring for arguments specification.
1049         """
1050         res = self._download_socket_json_handle(
1051             url_or_request, video_id, note=note, errnote=errnote,
1052             transform_source=transform_source, fatal=fatal, encoding=encoding,
1053             data=data, headers=headers, query=query,
1054             expected_status=expected_status)
1055         return res if res is False else res[0]
1056
1057     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1058         idstr = format_field(video_id, template='%s: ')
1059         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1060         if only_once:
1061             if f'WARNING: {msg}' in self._printed_messages:
1062                 return
1063             self._printed_messages.add(f'WARNING: {msg}')
1064         self._downloader.report_warning(msg, *args, **kwargs)
1065
1066     def to_screen(self, msg, *args, **kwargs):
1067         """Print msg to screen, prefixing it with '[ie_name]'"""
1068         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1069
1070     def write_debug(self, msg, *args, **kwargs):
1071         self._downloader.write_debug('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1072
1073     def get_param(self, name, default=None, *args, **kwargs):
1074         if self._downloader:
1075             return self._downloader.params.get(name, default, *args, **kwargs)
1076         return default
1077
1078     def report_drm(self, video_id, partial=False):
1079         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1080
1081     def report_extraction(self, id_or_name):
1082         """Report information extraction."""
1083         self.to_screen('%s: Extracting information' % id_or_name)
1084
1085     def report_download_webpage(self, video_id):
1086         """Report webpage download."""
1087         self.to_screen('%s: Downloading webpage' % video_id)
1088
1089     def report_age_confirmation(self):
1090         """Report attempt to confirm age."""
1091         self.to_screen('Confirming age')
1092
1093     def report_login(self):
1094         """Report attempt to log in."""
1095         self.to_screen('Logging in')
1096
1097     def raise_login_required(
1098             self, msg='This video is only available for registered users',
1099             metadata_available=False, method='any'):
1100         if metadata_available and (
1101                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1102             self.report_warning(msg)
1103         if method is not None:
1104             msg = '%s. %s' % (msg, self._LOGIN_HINTS[method])
1105         raise ExtractorError(msg, expected=True)
1106
1107     def raise_geo_restricted(
1108             self, msg='This video is not available from your location due to geo restriction',
1109             countries=None, metadata_available=False):
1110         if metadata_available and (
1111                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1112             self.report_warning(msg)
1113         else:
1114             raise GeoRestrictedError(msg, countries=countries)
1115
1116     def raise_no_formats(self, msg, expected=False, video_id=None):
1117         if expected and (
1118                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1119             self.report_warning(msg, video_id)
1120         elif isinstance(msg, ExtractorError):
1121             raise msg
1122         else:
1123             raise ExtractorError(msg, expected=expected, video_id=video_id)
1124
1125     # Methods for following #608
1126     @staticmethod
1127     def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1128         """Returns a URL that points to a page that should be processed"""
1129         if ie is not None:
1130             kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1131         if video_id is not None:
1132             kwargs['id'] = video_id
1133         if video_title is not None:
1134             kwargs['title'] = video_title
1135         return {
1136             **kwargs,
1137             '_type': 'url_transparent' if url_transparent else 'url',
1138             'url': url,
1139         }
1140
1141     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None, **kwargs):
1142         urls = (self.url_result(self._proto_relative_url(m), ie)
1143                 for m in orderedSet(map(getter, matches) if getter else matches))
1144         return self.playlist_result(urls, playlist_id, playlist_title, **kwargs)
1145
1146     @staticmethod
1147     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1148         """Returns a playlist"""
1149         if playlist_id:
1150             kwargs['id'] = playlist_id
1151         if playlist_title:
1152             kwargs['title'] = playlist_title
1153         if playlist_description is not None:
1154             kwargs['description'] = playlist_description
1155         return {
1156             **kwargs,
1157             '_type': 'multi_video' if multi_video else 'playlist',
1158             'entries': entries,
1159         }
1160
1161     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1162         """
1163         Perform a regex search on the given string, using a single or a list of
1164         patterns returning the first matching group.
1165         In case of failure return a default value or raise a WARNING or a
1166         RegexNotFoundError, depending on fatal, specifying the field name.
1167         """
1168         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
1169             mobj = re.search(pattern, string, flags)
1170         else:
1171             for p in pattern:
1172                 mobj = re.search(p, string, flags)
1173                 if mobj:
1174                     break
1175
1176         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1177
1178         if mobj:
1179             if group is None:
1180                 # return the first matching group
1181                 return next(g for g in mobj.groups() if g is not None)
1182             elif isinstance(group, (list, tuple)):
1183                 return tuple(mobj.group(g) for g in group)
1184             else:
1185                 return mobj.group(group)
1186         elif default is not NO_DEFAULT:
1187             return default
1188         elif fatal:
1189             raise RegexNotFoundError('Unable to extract %s' % _name)
1190         else:
1191             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1192             return None
1193
1194     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1195         """
1196         Like _search_regex, but strips HTML tags and unescapes entities.
1197         """
1198         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1199         if res:
1200             return clean_html(res).strip()
1201         else:
1202             return res
1203
1204     def _get_netrc_login_info(self, netrc_machine=None):
1205         username = None
1206         password = None
1207         netrc_machine = netrc_machine or self._NETRC_MACHINE
1208
1209         if self.get_param('usenetrc', False):
1210             try:
1211                 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1212                 if os.path.isdir(netrc_file):
1213                     netrc_file = os.path.join(netrc_file, '.netrc')
1214                 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1215                 if info is not None:
1216                     username = info[0]
1217                     password = info[2]
1218                 else:
1219                     raise netrc.NetrcParseError(
1220                         'No authenticators for %s' % netrc_machine)
1221             except (IOError, netrc.NetrcParseError) as err:
1222                 self.report_warning(
1223                     'parsing .netrc: %s' % error_to_compat_str(err))
1224
1225         return username, password
1226
1227     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1228         """
1229         Get the login info as (username, password)
1230         First look for the manually specified credentials using username_option
1231         and password_option as keys in params dictionary. If no such credentials
1232         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1233         value.
1234         If there's no info available, return (None, None)
1235         """
1236
1237         # Attempt to use provided username and password or .netrc data
1238         username = self.get_param(username_option)
1239         if username is not None:
1240             password = self.get_param(password_option)
1241         else:
1242             username, password = self._get_netrc_login_info(netrc_machine)
1243
1244         return username, password
1245
1246     def _get_tfa_info(self, note='two-factor verification code'):
1247         """
1248         Get the two-factor authentication info
1249         TODO - asking the user will be required for sms/phone verify
1250         currently just uses the command line option
1251         If there's no info available, return None
1252         """
1253
1254         tfa = self.get_param('twofactor')
1255         if tfa is not None:
1256             return tfa
1257
1258         return compat_getpass('Type %s and press [Return]: ' % note)
1259
1260     # Helper functions for extracting OpenGraph info
1261     @staticmethod
1262     def _og_regexes(prop):
1263         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1264         property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
1265                        % {'prop': re.escape(prop)})
1266         template = r'<meta[^>]+?%s[^>]+?%s'
1267         return [
1268             template % (property_re, content_re),
1269             template % (content_re, property_re),
1270         ]
1271
1272     @staticmethod
1273     def _meta_regex(prop):
1274         return r'''(?isx)<meta
1275                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1276                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1277
1278     def _og_search_property(self, prop, html, name=None, **kargs):
1279         prop = variadic(prop)
1280         if name is None:
1281             name = 'OpenGraph %s' % prop[0]
1282         og_regexes = []
1283         for p in prop:
1284             og_regexes.extend(self._og_regexes(p))
1285         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1286         if escaped is None:
1287             return None
1288         return unescapeHTML(escaped)
1289
1290     def _og_search_thumbnail(self, html, **kargs):
1291         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1292
1293     def _og_search_description(self, html, **kargs):
1294         return self._og_search_property('description', html, fatal=False, **kargs)
1295
1296     def _og_search_title(self, html, **kargs):
1297         kargs.setdefault('fatal', False)
1298         return self._og_search_property('title', html, **kargs)
1299
1300     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1301         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1302         if secure:
1303             regexes = self._og_regexes('video:secure_url') + regexes
1304         return self._html_search_regex(regexes, html, name, **kargs)
1305
1306     def _og_search_url(self, html, **kargs):
1307         return self._og_search_property('url', html, **kargs)
1308
1309     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1310         name = variadic(name)
1311         if display_name is None:
1312             display_name = name[0]
1313         return self._html_search_regex(
1314             [self._meta_regex(n) for n in name],
1315             html, display_name, fatal=fatal, group='content', **kwargs)
1316
1317     def _dc_search_uploader(self, html):
1318         return self._html_search_meta('dc.creator', html, 'uploader')
1319
1320     def _rta_search(self, html):
1321         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1322         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1323                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1324                      html):
1325             return 18
1326         return 0
1327
1328     def _media_rating_search(self, html):
1329         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1330         rating = self._html_search_meta('rating', html)
1331
1332         if not rating:
1333             return None
1334
1335         RATING_TABLE = {
1336             'safe for kids': 0,
1337             'general': 8,
1338             '14 years': 14,
1339             'mature': 17,
1340             'restricted': 19,
1341         }
1342         return RATING_TABLE.get(rating.lower())
1343
1344     def _family_friendly_search(self, html):
1345         # See http://schema.org/VideoObject
1346         family_friendly = self._html_search_meta(
1347             'isFamilyFriendly', html, default=None)
1348
1349         if not family_friendly:
1350             return None
1351
1352         RATING_TABLE = {
1353             '1': 0,
1354             'true': 0,
1355             '0': 18,
1356             'false': 18,
1357         }
1358         return RATING_TABLE.get(family_friendly.lower())
1359
1360     def _twitter_search_player(self, html):
1361         return self._html_search_meta('twitter:player', html,
1362                                       'twitter card player')
1363
1364     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1365         json_ld_list = list(re.finditer(JSON_LD_RE, html))
1366         default = kwargs.get('default', NO_DEFAULT)
1367         # JSON-LD may be malformed and thus `fatal` should be respected.
1368         # At the same time `default` may be passed that assumes `fatal=False`
1369         # for _search_regex. Let's simulate the same behavior here as well.
1370         fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
1371         json_ld = []
1372         for mobj in json_ld_list:
1373             json_ld_item = self._parse_json(
1374                 mobj.group('json_ld'), video_id, fatal=fatal)
1375             if not json_ld_item:
1376                 continue
1377             if isinstance(json_ld_item, dict):
1378                 json_ld.append(json_ld_item)
1379             elif isinstance(json_ld_item, (list, tuple)):
1380                 json_ld.extend(json_ld_item)
1381         if json_ld:
1382             json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1383         if json_ld:
1384             return json_ld
1385         if default is not NO_DEFAULT:
1386             return default
1387         elif fatal:
1388             raise RegexNotFoundError('Unable to extract JSON-LD')
1389         else:
1390             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1391             return {}
1392
1393     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1394         if isinstance(json_ld, compat_str):
1395             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1396         if not json_ld:
1397             return {}
1398         info = {}
1399         if not isinstance(json_ld, (list, tuple, dict)):
1400             return info
1401         if isinstance(json_ld, dict):
1402             json_ld = [json_ld]
1403
1404         INTERACTION_TYPE_MAP = {
1405             'CommentAction': 'comment',
1406             'AgreeAction': 'like',
1407             'DisagreeAction': 'dislike',
1408             'LikeAction': 'like',
1409             'DislikeAction': 'dislike',
1410             'ListenAction': 'view',
1411             'WatchAction': 'view',
1412             'ViewAction': 'view',
1413         }
1414
1415         def extract_interaction_type(e):
1416             interaction_type = e.get('interactionType')
1417             if isinstance(interaction_type, dict):
1418                 interaction_type = interaction_type.get('@type')
1419             return str_or_none(interaction_type)
1420
1421         def extract_interaction_statistic(e):
1422             interaction_statistic = e.get('interactionStatistic')
1423             if isinstance(interaction_statistic, dict):
1424                 interaction_statistic = [interaction_statistic]
1425             if not isinstance(interaction_statistic, list):
1426                 return
1427             for is_e in interaction_statistic:
1428                 if not isinstance(is_e, dict):
1429                     continue
1430                 if is_e.get('@type') != 'InteractionCounter':
1431                     continue
1432                 interaction_type = extract_interaction_type(is_e)
1433                 if not interaction_type:
1434                     continue
1435                 # For interaction count some sites provide string instead of
1436                 # an integer (as per spec) with non digit characters (e.g. ",")
1437                 # so extracting count with more relaxed str_to_int
1438                 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1439                 if interaction_count is None:
1440                     continue
1441                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1442                 if not count_kind:
1443                     continue
1444                 count_key = '%s_count' % count_kind
1445                 if info.get(count_key) is not None:
1446                     continue
1447                 info[count_key] = interaction_count
1448
1449         def extract_chapter_information(e):
1450             chapters = [{
1451                 'title': part.get('name'),
1452                 'start_time': part.get('startOffset'),
1453                 'end_time': part.get('endOffset'),
1454             } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
1455             for idx, (last_c, current_c, next_c) in enumerate(zip(
1456                     [{'end_time': 0}] + chapters, chapters, chapters[1:])):
1457                 current_c['end_time'] = current_c['end_time'] or next_c['start_time']
1458                 current_c['start_time'] = current_c['start_time'] or last_c['end_time']
1459                 if None in current_c.values():
1460                     self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
1461                     return
1462             if chapters:
1463                 chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
1464                 info['chapters'] = chapters
1465
1466         def extract_video_object(e):
1467             assert e['@type'] == 'VideoObject'
1468             author = e.get('author')
1469             info.update({
1470                 'url': url_or_none(e.get('contentUrl')),
1471                 'title': unescapeHTML(e.get('name')),
1472                 'description': unescapeHTML(e.get('description')),
1473                 'thumbnails': [{'url': url_or_none(url)}
1474                                for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))],
1475                 'duration': parse_duration(e.get('duration')),
1476                 'timestamp': unified_timestamp(e.get('uploadDate')),
1477                 # author can be an instance of 'Organization' or 'Person' types.
1478                 # both types can have 'name' property(inherited from 'Thing' type). [1]
1479                 # however some websites are using 'Text' type instead.
1480                 # 1. https://schema.org/VideoObject
1481                 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
1482                 'filesize': float_or_none(e.get('contentSize')),
1483                 'tbr': int_or_none(e.get('bitrate')),
1484                 'width': int_or_none(e.get('width')),
1485                 'height': int_or_none(e.get('height')),
1486                 'view_count': int_or_none(e.get('interactionCount')),
1487             })
1488             extract_interaction_statistic(e)
1489             extract_chapter_information(e)
1490
1491         def traverse_json_ld(json_ld, at_top_level=True):
1492             for e in json_ld:
1493                 if at_top_level and '@context' not in e:
1494                     continue
1495                 if at_top_level and set(e.keys()) == {'@context', '@graph'}:
1496                     traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
1497                     break
1498                 item_type = e.get('@type')
1499                 if expected_type is not None and expected_type != item_type:
1500                     continue
1501                 rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
1502                 if rating is not None:
1503                     info['average_rating'] = rating
1504                 if item_type in ('TVEpisode', 'Episode'):
1505                     episode_name = unescapeHTML(e.get('name'))
1506                     info.update({
1507                         'episode': episode_name,
1508                         'episode_number': int_or_none(e.get('episodeNumber')),
1509                         'description': unescapeHTML(e.get('description')),
1510                     })
1511                     if not info.get('title') and episode_name:
1512                         info['title'] = episode_name
1513                     part_of_season = e.get('partOfSeason')
1514                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1515                         info.update({
1516                             'season': unescapeHTML(part_of_season.get('name')),
1517                             'season_number': int_or_none(part_of_season.get('seasonNumber')),
1518                         })
1519                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1520                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1521                         info['series'] = unescapeHTML(part_of_series.get('name'))
1522                 elif item_type == 'Movie':
1523                     info.update({
1524                         'title': unescapeHTML(e.get('name')),
1525                         'description': unescapeHTML(e.get('description')),
1526                         'duration': parse_duration(e.get('duration')),
1527                         'timestamp': unified_timestamp(e.get('dateCreated')),
1528                     })
1529                 elif item_type in ('Article', 'NewsArticle'):
1530                     info.update({
1531                         'timestamp': parse_iso8601(e.get('datePublished')),
1532                         'title': unescapeHTML(e.get('headline')),
1533                         'description': unescapeHTML(e.get('articleBody') or e.get('description')),
1534                     })
1535                     if traverse_obj(e, ('video', 0, '@type')) == 'VideoObject':
1536                         extract_video_object(e['video'][0])
1537                 elif item_type == 'VideoObject':
1538                     extract_video_object(e)
1539                     if expected_type is None:
1540                         continue
1541                     else:
1542                         break
1543                 video = e.get('video')
1544                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1545                     extract_video_object(video)
1546                 if expected_type is None:
1547                     continue
1548                 else:
1549                     break
1550         traverse_json_ld(json_ld)
1551
1552         return dict((k, v) for k, v in info.items() if v is not None)
1553
1554     def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1555         return self._parse_json(
1556             self._search_regex(
1557                 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1558                 webpage, 'next.js data', fatal=fatal, **kw),
1559             video_id, transform_source=transform_source, fatal=fatal)
1560
1561     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__'):
1562         ''' Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function. '''
1563         # not all website do this, but it can be changed
1564         # https://stackoverflow.com/questions/67463109/how-to-change-or-hide-nuxt-and-nuxt-keyword-in-page-source
1565         rectx = re.escape(context_name)
1566         js, arg_keys, arg_vals = self._search_regex(
1567             (r'<script>window\.%s=\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.+?)\)\);?</script>' % rectx,
1568              r'%s\(.*?\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)' % rectx),
1569             webpage, context_name, group=['js', 'arg_keys', 'arg_vals'])
1570
1571         args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
1572
1573         for key, val in args.items():
1574             if val in ('undefined', 'void 0'):
1575                 args[key] = 'null'
1576
1577         return self._parse_json(js_to_json(js, args), video_id)['data'][0]
1578
1579     @staticmethod
1580     def _hidden_inputs(html):
1581         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1582         hidden_inputs = {}
1583         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1584             attrs = extract_attributes(input)
1585             if not input:
1586                 continue
1587             if attrs.get('type') not in ('hidden', 'submit'):
1588                 continue
1589             name = attrs.get('name') or attrs.get('id')
1590             value = attrs.get('value')
1591             if name and value is not None:
1592                 hidden_inputs[name] = value
1593         return hidden_inputs
1594
1595     def _form_hidden_inputs(self, form_id, html):
1596         form = self._search_regex(
1597             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1598             html, '%s form' % form_id, group='form')
1599         return self._hidden_inputs(form)
1600
1601     class FormatSort:
1602         regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
1603
1604         default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
1605                    'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
1606                    'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
1607         ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
1608                         'height', 'width', 'proto', 'vext', 'abr', 'aext',
1609                         'fps', 'fs_approx', 'source', 'id')
1610
1611         settings = {
1612             'vcodec': {'type': 'ordered', 'regex': True,
1613                        'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1614             'acodec': {'type': 'ordered', 'regex': True,
1615                        'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
1616             'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
1617                     'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
1618             'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1619                       'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
1620             'vext': {'type': 'ordered', 'field': 'video_ext',
1621                      'order': ('mp4', 'webm', 'flv', '', 'none'),
1622                      'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1623             'aext': {'type': 'ordered', 'field': 'audio_ext',
1624                      'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1625                      'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1626             'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1627             'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
1628                            'field': ('vcodec', 'acodec'),
1629                            'function': lambda it: int(any(v != 'none' for v in it))},
1630             'ie_pref': {'priority': True, 'type': 'extractor'},
1631             'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1632             'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1633             'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
1634             'quality': {'convert': 'float', 'default': -1},
1635             'filesize': {'convert': 'bytes'},
1636             'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1637             'id': {'convert': 'string', 'field': 'format_id'},
1638             'height': {'convert': 'float_none'},
1639             'width': {'convert': 'float_none'},
1640             'fps': {'convert': 'float_none'},
1641             'tbr': {'convert': 'float_none'},
1642             'vbr': {'convert': 'float_none'},
1643             'abr': {'convert': 'float_none'},
1644             'asr': {'convert': 'float_none'},
1645             'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
1646
1647             'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1648             'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1649             'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1650             'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1651             'res': {'type': 'multiple', 'field': ('height', 'width'),
1652                     'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
1653
1654             # For compatibility with youtube-dl
1655             'format_id': {'type': 'alias', 'field': 'id'},
1656             'preference': {'type': 'alias', 'field': 'ie_pref'},
1657             'language_preference': {'type': 'alias', 'field': 'lang'},
1658
1659             # Deprecated
1660             'dimension': {'type': 'alias', 'field': 'res'},
1661             'resolution': {'type': 'alias', 'field': 'res'},
1662             'extension': {'type': 'alias', 'field': 'ext'},
1663             'bitrate': {'type': 'alias', 'field': 'br'},
1664             'total_bitrate': {'type': 'alias', 'field': 'tbr'},
1665             'video_bitrate': {'type': 'alias', 'field': 'vbr'},
1666             'audio_bitrate': {'type': 'alias', 'field': 'abr'},
1667             'framerate': {'type': 'alias', 'field': 'fps'},
1668             'protocol': {'type': 'alias', 'field': 'proto'},
1669             'source_preference': {'type': 'alias', 'field': 'source'},
1670             'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1671             'filesize_estimate': {'type': 'alias', 'field': 'size'},
1672             'samplerate': {'type': 'alias', 'field': 'asr'},
1673             'video_ext': {'type': 'alias', 'field': 'vext'},
1674             'audio_ext': {'type': 'alias', 'field': 'aext'},
1675             'video_codec': {'type': 'alias', 'field': 'vcodec'},
1676             'audio_codec': {'type': 'alias', 'field': 'acodec'},
1677             'video': {'type': 'alias', 'field': 'hasvid'},
1678             'has_video': {'type': 'alias', 'field': 'hasvid'},
1679             'audio': {'type': 'alias', 'field': 'hasaud'},
1680             'has_audio': {'type': 'alias', 'field': 'hasaud'},
1681             'extractor': {'type': 'alias', 'field': 'ie_pref'},
1682             'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
1683         }
1684
1685         def __init__(self, ie, field_preference):
1686             self._order = []
1687             self.ydl = ie._downloader
1688             self.evaluate_params(self.ydl.params, field_preference)
1689             if ie.get_param('verbose'):
1690                 self.print_verbose_info(self.ydl.write_debug)
1691
1692         def _get_field_setting(self, field, key):
1693             if field not in self.settings:
1694                 if key in ('forced', 'priority'):
1695                     return False
1696                 self.ydl.deprecation_warning(
1697                     f'Using arbitrary fields ({field}) for format sorting is deprecated '
1698                     'and may be removed in a future version')
1699                 self.settings[field] = {}
1700             propObj = self.settings[field]
1701             if key not in propObj:
1702                 type = propObj.get('type')
1703                 if key == 'field':
1704                     default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1705                 elif key == 'convert':
1706                     default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1707                 else:
1708                     default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1709                 propObj[key] = default
1710             return propObj[key]
1711
1712         def _resolve_field_value(self, field, value, convertNone=False):
1713             if value is None:
1714                 if not convertNone:
1715                     return None
1716             else:
1717                 value = value.lower()
1718             conversion = self._get_field_setting(field, 'convert')
1719             if conversion == 'ignore':
1720                 return None
1721             if conversion == 'string':
1722                 return value
1723             elif conversion == 'float_none':
1724                 return float_or_none(value)
1725             elif conversion == 'bytes':
1726                 return FileDownloader.parse_bytes(value)
1727             elif conversion == 'order':
1728                 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1729                 use_regex = self._get_field_setting(field, 'regex')
1730                 list_length = len(order_list)
1731                 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1732                 if use_regex and value is not None:
1733                     for i, regex in enumerate(order_list):
1734                         if regex and re.match(regex, value):
1735                             return list_length - i
1736                     return list_length - empty_pos  # not in list
1737                 else:  # not regex or  value = None
1738                     return list_length - (order_list.index(value) if value in order_list else empty_pos)
1739             else:
1740                 if value.isnumeric():
1741                     return float(value)
1742                 else:
1743                     self.settings[field]['convert'] = 'string'
1744                     return value
1745
1746         def evaluate_params(self, params, sort_extractor):
1747             self._use_free_order = params.get('prefer_free_formats', False)
1748             self._sort_user = params.get('format_sort', [])
1749             self._sort_extractor = sort_extractor
1750
1751             def add_item(field, reverse, closest, limit_text):
1752                 field = field.lower()
1753                 if field in self._order:
1754                     return
1755                 self._order.append(field)
1756                 limit = self._resolve_field_value(field, limit_text)
1757                 data = {
1758                     'reverse': reverse,
1759                     'closest': False if limit is None else closest,
1760                     'limit_text': limit_text,
1761                     'limit': limit}
1762                 if field in self.settings:
1763                     self.settings[field].update(data)
1764                 else:
1765                     self.settings[field] = data
1766
1767             sort_list = (
1768                 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1769                 + (tuple() if params.get('format_sort_force', False)
1770                    else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1771                 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1772
1773             for item in sort_list:
1774                 match = re.match(self.regex, item)
1775                 if match is None:
1776                     raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1777                 field = match.group('field')
1778                 if field is None:
1779                     continue
1780                 if self._get_field_setting(field, 'type') == 'alias':
1781                     alias, field = field, self._get_field_setting(field, 'field')
1782                     if alias not in ('format_id', 'preference', 'language_preference'):
1783                         self.ydl.deprecation_warning(
1784                             f'Format sorting alias {alias} is deprecated '
1785                             f'and may be removed in a future version. Please use {field} instead')
1786                 reverse = match.group('reverse') is not None
1787                 closest = match.group('separator') == '~'
1788                 limit_text = match.group('limit')
1789
1790                 has_limit = limit_text is not None
1791                 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1792                 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1793
1794                 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1795                 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
1796                 limit_count = len(limits)
1797                 for (i, f) in enumerate(fields):
1798                     add_item(f, reverse, closest,
1799                              limits[i] if i < limit_count
1800                              else limits[0] if has_limit and not has_multiple_limits
1801                              else None)
1802
1803         def print_verbose_info(self, write_debug):
1804             if self._sort_user:
1805                 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1806             if self._sort_extractor:
1807                 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1808             write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1809                 '+' if self._get_field_setting(field, 'reverse') else '', field,
1810                 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1811                               self._get_field_setting(field, 'limit_text'),
1812                               self._get_field_setting(field, 'limit'))
1813                 if self._get_field_setting(field, 'limit_text') is not None else '')
1814                 for field in self._order if self._get_field_setting(field, 'visible')]))
1815
1816         def _calculate_field_preference_from_value(self, format, field, type, value):
1817             reverse = self._get_field_setting(field, 'reverse')
1818             closest = self._get_field_setting(field, 'closest')
1819             limit = self._get_field_setting(field, 'limit')
1820
1821             if type == 'extractor':
1822                 maximum = self._get_field_setting(field, 'max')
1823                 if value is None or (maximum is not None and value >= maximum):
1824                     value = -1
1825             elif type == 'boolean':
1826                 in_list = self._get_field_setting(field, 'in_list')
1827                 not_in_list = self._get_field_setting(field, 'not_in_list')
1828                 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1829             elif type == 'ordered':
1830                 value = self._resolve_field_value(field, value, True)
1831
1832             # try to convert to number
1833             val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1834             is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1835             if is_num:
1836                 value = val_num
1837
1838             return ((-10, 0) if value is None
1839                     else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
1840                     else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1841                     else (0, value, 0) if not reverse and (limit is None or value <= limit)
1842                     else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1843                     else (-1, value, 0))
1844
1845         def _calculate_field_preference(self, format, field):
1846             type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
1847             get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1848             if type == 'multiple':
1849                 type = 'field'  # Only 'field' is allowed in multiple for now
1850                 actual_fields = self._get_field_setting(field, 'field')
1851
1852                 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1853             else:
1854                 value = get_value(field)
1855             return self._calculate_field_preference_from_value(format, field, type, value)
1856
1857         def calculate_preference(self, format):
1858             # Determine missing protocol
1859             if not format.get('protocol'):
1860                 format['protocol'] = determine_protocol(format)
1861
1862             # Determine missing ext
1863             if not format.get('ext') and 'url' in format:
1864                 format['ext'] = determine_ext(format['url'])
1865             if format.get('vcodec') == 'none':
1866                 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1867                 format['video_ext'] = 'none'
1868             else:
1869                 format['video_ext'] = format['ext']
1870                 format['audio_ext'] = 'none'
1871             # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
1872             #    format['preference'] = -1000
1873
1874             # Determine missing bitrates
1875             if format.get('tbr') is None:
1876                 if format.get('vbr') is not None and format.get('abr') is not None:
1877                     format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1878             else:
1879                 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1880                     format['vbr'] = format.get('tbr') - format.get('abr', 0)
1881                 if format.get('acodec') != 'none' and format.get('abr') is None:
1882                     format['abr'] = format.get('tbr') - format.get('vbr', 0)
1883
1884             return tuple(self._calculate_field_preference(format, field) for field in self._order)
1885
1886     def _sort_formats(self, formats, field_preference=[]):
1887         if not formats:
1888             return
1889         format_sort = self.FormatSort(self, field_preference)
1890         formats.sort(key=lambda f: format_sort.calculate_preference(f))
1891
1892     def _check_formats(self, formats, video_id):
1893         if formats:
1894             formats[:] = filter(
1895                 lambda f: self._is_valid_url(
1896                     f['url'], video_id,
1897                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1898                 formats)
1899
1900     @staticmethod
1901     def _remove_duplicate_formats(formats):
1902         format_urls = set()
1903         unique_formats = []
1904         for f in formats:
1905             if f['url'] not in format_urls:
1906                 format_urls.add(f['url'])
1907                 unique_formats.append(f)
1908         formats[:] = unique_formats
1909
1910     def _is_valid_url(self, url, video_id, item='video', headers={}):
1911         url = self._proto_relative_url(url, scheme='http:')
1912         # For now assume non HTTP(S) URLs always valid
1913         if not (url.startswith('http://') or url.startswith('https://')):
1914             return True
1915         try:
1916             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1917             return True
1918         except ExtractorError as e:
1919             self.to_screen(
1920                 '%s: %s URL is invalid, skipping: %s'
1921                 % (video_id, item, error_to_compat_str(e.cause)))
1922             return False
1923
1924     def http_scheme(self):
1925         """ Either "http:" or "https:", depending on the user's preferences """
1926         return (
1927             'http:'
1928             if self.get_param('prefer_insecure', False)
1929             else 'https:')
1930
1931     def _proto_relative_url(self, url, scheme=None):
1932         if url is None:
1933             return url
1934         if url.startswith('//'):
1935             if scheme is None:
1936                 scheme = self.http_scheme()
1937             return scheme + url
1938         else:
1939             return url
1940
1941     def _sleep(self, timeout, video_id, msg_template=None):
1942         if msg_template is None:
1943             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1944         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1945         self.to_screen(msg)
1946         time.sleep(timeout)
1947
1948     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1949                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1950                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1951         manifest = self._download_xml(
1952             manifest_url, video_id, 'Downloading f4m manifest',
1953             'Unable to download f4m manifest',
1954             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1955             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1956             transform_source=transform_source,
1957             fatal=fatal, data=data, headers=headers, query=query)
1958
1959         if manifest is False:
1960             return []
1961
1962         return self._parse_f4m_formats(
1963             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1964             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1965
1966     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1967                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1968                            fatal=True, m3u8_id=None):
1969         if not isinstance(manifest, compat_etree_Element) and not fatal:
1970             return []
1971
1972         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1973         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1974         if akamai_pv is not None and ';' in akamai_pv.text:
1975             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1976             if playerVerificationChallenge.strip() != '':
1977                 return []
1978
1979         formats = []
1980         manifest_version = '1.0'
1981         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1982         if not media_nodes:
1983             manifest_version = '2.0'
1984             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1985         # Remove unsupported DRM protected media from final formats
1986         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1987         media_nodes = remove_encrypted_media(media_nodes)
1988         if not media_nodes:
1989             return formats
1990
1991         manifest_base_url = get_base_url(manifest)
1992
1993         bootstrap_info = xpath_element(
1994             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1995             'bootstrap info', default=None)
1996
1997         vcodec = None
1998         mime_type = xpath_text(
1999             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
2000             'base URL', default=None)
2001         if mime_type and mime_type.startswith('audio/'):
2002             vcodec = 'none'
2003
2004         for i, media_el in enumerate(media_nodes):
2005             tbr = int_or_none(media_el.attrib.get('bitrate'))
2006             width = int_or_none(media_el.attrib.get('width'))
2007             height = int_or_none(media_el.attrib.get('height'))
2008             format_id = join_nonempty(f4m_id, tbr or i)
2009             # If <bootstrapInfo> is present, the specified f4m is a
2010             # stream-level manifest, and only set-level manifests may refer to
2011             # external resources.  See section 11.4 and section 4 of F4M spec
2012             if bootstrap_info is None:
2013                 media_url = None
2014                 # @href is introduced in 2.0, see section 11.6 of F4M spec
2015                 if manifest_version == '2.0':
2016                     media_url = media_el.attrib.get('href')
2017                 if media_url is None:
2018                     media_url = media_el.attrib.get('url')
2019                 if not media_url:
2020                     continue
2021                 manifest_url = (
2022                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
2023                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
2024                 # If media_url is itself a f4m manifest do the recursive extraction
2025                 # since bitrates in parent manifest (this one) and media_url manifest
2026                 # may differ leading to inability to resolve the format by requested
2027                 # bitrate in f4m downloader
2028                 ext = determine_ext(manifest_url)
2029                 if ext == 'f4m':
2030                     f4m_formats = self._extract_f4m_formats(
2031                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2032                         transform_source=transform_source, fatal=fatal)
2033                     # Sometimes stream-level manifest contains single media entry that
2034                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
2035                     # At the same time parent's media entry in set-level manifest may
2036                     # contain it. We will copy it from parent in such cases.
2037                     if len(f4m_formats) == 1:
2038                         f = f4m_formats[0]
2039                         f.update({
2040                             'tbr': f.get('tbr') or tbr,
2041                             'width': f.get('width') or width,
2042                             'height': f.get('height') or height,
2043                             'format_id': f.get('format_id') if not tbr else format_id,
2044                             'vcodec': vcodec,
2045                         })
2046                     formats.extend(f4m_formats)
2047                     continue
2048                 elif ext == 'm3u8':
2049                     formats.extend(self._extract_m3u8_formats(
2050                         manifest_url, video_id, 'mp4', preference=preference,
2051                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
2052                     continue
2053             formats.append({
2054                 'format_id': format_id,
2055                 'url': manifest_url,
2056                 'manifest_url': manifest_url,
2057                 'ext': 'flv' if bootstrap_info is not None else None,
2058                 'protocol': 'f4m',
2059                 'tbr': tbr,
2060                 'width': width,
2061                 'height': height,
2062                 'vcodec': vcodec,
2063                 'preference': preference,
2064                 'quality': quality,
2065             })
2066         return formats
2067
2068     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
2069         return {
2070             'format_id': join_nonempty(m3u8_id, 'meta'),
2071             'url': m3u8_url,
2072             'ext': ext,
2073             'protocol': 'm3u8',
2074             'preference': preference - 100 if preference else -100,
2075             'quality': quality,
2076             'resolution': 'multiple',
2077             'format_note': 'Quality selection URL',
2078         }
2079
2080     def _report_ignoring_subs(self, name):
2081         self.report_warning(bug_reports_message(
2082             f'Ignoring subtitle tracks found in the {name} manifest; '
2083             'if any subtitle tracks are missing,'
2084         ), only_once=True)
2085
2086     def _extract_m3u8_formats(self, *args, **kwargs):
2087         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2088         if subs:
2089             self._report_ignoring_subs('HLS')
2090         return fmts
2091
2092     def _extract_m3u8_formats_and_subtitles(
2093             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2094             preference=None, quality=None, m3u8_id=None, note=None,
2095             errnote=None, fatal=True, live=False, data=None, headers={},
2096             query={}):
2097
2098         res = self._download_webpage_handle(
2099             m3u8_url, video_id,
2100             note='Downloading m3u8 information' if note is None else note,
2101             errnote='Failed to download m3u8 information' if errnote is None else errnote,
2102             fatal=fatal, data=data, headers=headers, query=query)
2103
2104         if res is False:
2105             return [], {}
2106
2107         m3u8_doc, urlh = res
2108         m3u8_url = urlh.geturl()
2109
2110         return self._parse_m3u8_formats_and_subtitles(
2111             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2112             preference=preference, quality=quality, m3u8_id=m3u8_id,
2113             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2114             headers=headers, query=query, video_id=video_id)
2115
2116     def _parse_m3u8_formats_and_subtitles(
2117             self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
2118             preference=None, quality=None, m3u8_id=None, live=False, note=None,
2119             errnote=None, fatal=True, data=None, headers={}, query={},
2120             video_id=None):
2121         formats, subtitles = [], {}
2122
2123         has_drm = re.search('|'.join([
2124             r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
2125             r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
2126         ]), m3u8_doc)
2127
2128         def format_url(url):
2129             return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
2130
2131         if self.get_param('hls_split_discontinuity', False):
2132             def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2133                 if not m3u8_doc:
2134                     if not manifest_url:
2135                         return []
2136                     m3u8_doc = self._download_webpage(
2137                         manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2138                         note=False, errnote='Failed to download m3u8 playlist information')
2139                     if m3u8_doc is False:
2140                         return []
2141                 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2142
2143         else:
2144             def _extract_m3u8_playlist_indices(*args, **kwargs):
2145                 return [None]
2146
2147         # References:
2148         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2149         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2150         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2151
2152         # We should try extracting formats only from master playlists [1, 4.3.4],
2153         # i.e. playlists that describe available qualities. On the other hand
2154         # media playlists [1, 4.3.3] should be returned as is since they contain
2155         # just the media without qualities renditions.
2156         # Fortunately, master playlist can be easily distinguished from media
2157         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
2158         # master playlist tags MUST NOT appear in a media playlist and vice versa.
2159         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2160         # media playlist and MUST NOT appear in master playlist thus we can
2161         # clearly detect media playlist with this criterion.
2162
2163         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
2164             formats = [{
2165                 'format_id': join_nonempty(m3u8_id, idx),
2166                 'format_index': idx,
2167                 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
2168                 'ext': ext,
2169                 'protocol': entry_protocol,
2170                 'preference': preference,
2171                 'quality': quality,
2172                 'has_drm': has_drm,
2173             } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2174
2175             return formats, subtitles
2176
2177         groups = {}
2178         last_stream_inf = {}
2179
2180         def extract_media(x_media_line):
2181             media = parse_m3u8_attributes(x_media_line)
2182             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2183             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2184             if not (media_type and group_id and name):
2185                 return
2186             groups.setdefault(group_id, []).append(media)
2187             # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2188             if media_type == 'SUBTITLES':
2189                 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2190                 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2191                 # However, lack of URI has been spotted in the wild.
2192                 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2193                 if not media.get('URI'):
2194                     return
2195                 url = format_url(media['URI'])
2196                 sub_info = {
2197                     'url': url,
2198                     'ext': determine_ext(url),
2199                 }
2200                 if sub_info['ext'] == 'm3u8':
2201                     # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2202                     # files may contain is WebVTT:
2203                     # <https://tools.ietf.org/html/rfc8216#section-3.1>
2204                     sub_info['ext'] = 'vtt'
2205                     sub_info['protocol'] = 'm3u8_native'
2206                 lang = media.get('LANGUAGE') or 'und'
2207                 subtitles.setdefault(lang, []).append(sub_info)
2208             if media_type not in ('VIDEO', 'AUDIO'):
2209                 return
2210             media_url = media.get('URI')
2211             if media_url:
2212                 manifest_url = format_url(media_url)
2213                 formats.extend({
2214                     'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2215                     'format_note': name,
2216                     'format_index': idx,
2217                     'url': manifest_url,
2218                     'manifest_url': m3u8_url,
2219                     'language': media.get('LANGUAGE'),
2220                     'ext': ext,
2221                     'protocol': entry_protocol,
2222                     'preference': preference,
2223                     'quality': quality,
2224                     'vcodec': 'none' if media_type == 'AUDIO' else None,
2225                 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2226
2227         def build_stream_name():
2228             # Despite specification does not mention NAME attribute for
2229             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2230             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2231             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2232             stream_name = last_stream_inf.get('NAME')
2233             if stream_name:
2234                 return stream_name
2235             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2236             # from corresponding rendition group
2237             stream_group_id = last_stream_inf.get('VIDEO')
2238             if not stream_group_id:
2239                 return
2240             stream_group = groups.get(stream_group_id)
2241             if not stream_group:
2242                 return stream_group_id
2243             rendition = stream_group[0]
2244             return rendition.get('NAME') or stream_group_id
2245
2246         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2247         # chance to detect video only formats when EXT-X-STREAM-INF tags
2248         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2249         for line in m3u8_doc.splitlines():
2250             if line.startswith('#EXT-X-MEDIA:'):
2251                 extract_media(line)
2252
2253         for line in m3u8_doc.splitlines():
2254             if line.startswith('#EXT-X-STREAM-INF:'):
2255                 last_stream_inf = parse_m3u8_attributes(line)
2256             elif line.startswith('#') or not line.strip():
2257                 continue
2258             else:
2259                 tbr = float_or_none(
2260                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2261                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2262                 manifest_url = format_url(line.strip())
2263
2264                 for idx in _extract_m3u8_playlist_indices(manifest_url):
2265                     format_id = [m3u8_id, None, idx]
2266                     # Bandwidth of live streams may differ over time thus making
2267                     # format_id unpredictable. So it's better to keep provided
2268                     # format_id intact.
2269                     if not live:
2270                         stream_name = build_stream_name()
2271                         format_id[1] = stream_name or '%d' % (tbr or len(formats))
2272                     f = {
2273                         'format_id': join_nonempty(*format_id),
2274                         'format_index': idx,
2275                         'url': manifest_url,
2276                         'manifest_url': m3u8_url,
2277                         'tbr': tbr,
2278                         'ext': ext,
2279                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2280                         'protocol': entry_protocol,
2281                         'preference': preference,
2282                         'quality': quality,
2283                     }
2284                     resolution = last_stream_inf.get('RESOLUTION')
2285                     if resolution:
2286                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2287                         if mobj:
2288                             f['width'] = int(mobj.group('width'))
2289                             f['height'] = int(mobj.group('height'))
2290                     # Unified Streaming Platform
2291                     mobj = re.search(
2292                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2293                     if mobj:
2294                         abr, vbr = mobj.groups()
2295                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2296                         f.update({
2297                             'vbr': vbr,
2298                             'abr': abr,
2299                         })
2300                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2301                     f.update(codecs)
2302                     audio_group_id = last_stream_inf.get('AUDIO')
2303                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2304                     # references a rendition group MUST have a CODECS attribute.
2305                     # However, this is not always respected, for example, [2]
2306                     # contains EXT-X-STREAM-INF tag which references AUDIO
2307                     # rendition group but does not have CODECS and despite
2308                     # referencing an audio group it represents a complete
2309                     # (with audio and video) format. So, for such cases we will
2310                     # ignore references to rendition groups and treat them
2311                     # as complete formats.
2312                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2313                         audio_group = groups.get(audio_group_id)
2314                         if audio_group and audio_group[0].get('URI'):
2315                             # TODO: update acodec for audio only formats with
2316                             # the same GROUP-ID
2317                             f['acodec'] = 'none'
2318                     if not f.get('ext'):
2319                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2320                     formats.append(f)
2321
2322                     # for DailyMotion
2323                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2324                     if progressive_uri:
2325                         http_f = f.copy()
2326                         del http_f['manifest_url']
2327                         http_f.update({
2328                             'format_id': f['format_id'].replace('hls-', 'http-'),
2329                             'protocol': 'http',
2330                             'url': progressive_uri,
2331                         })
2332                         formats.append(http_f)
2333
2334                 last_stream_inf = {}
2335         return formats, subtitles
2336
2337     def _extract_m3u8_vod_duration(
2338             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2339
2340         m3u8_vod = self._download_webpage(
2341             m3u8_vod_url, video_id,
2342             note='Downloading m3u8 VOD manifest' if note is None else note,
2343             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2344             fatal=False, data=data, headers=headers, query=query)
2345
2346         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2347
2348     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2349         if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2350             return None
2351
2352         return int(sum(
2353             float(line[len('#EXTINF:'):].split(',')[0])
2354             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2355
2356     @staticmethod
2357     def _xpath_ns(path, namespace=None):
2358         if not namespace:
2359             return path
2360         out = []
2361         for c in path.split('/'):
2362             if not c or c == '.':
2363                 out.append(c)
2364             else:
2365                 out.append('{%s}%s' % (namespace, c))
2366         return '/'.join(out)
2367
2368     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2369         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2370
2371         if smil is False:
2372             assert not fatal
2373             return [], {}
2374
2375         namespace = self._parse_smil_namespace(smil)
2376
2377         fmts = self._parse_smil_formats(
2378             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2379         subs = self._parse_smil_subtitles(
2380             smil, namespace=namespace)
2381
2382         return fmts, subs
2383
2384     def _extract_smil_formats(self, *args, **kwargs):
2385         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2386         if subs:
2387             self._report_ignoring_subs('SMIL')
2388         return fmts
2389
2390     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2391         smil = self._download_smil(smil_url, video_id, fatal=fatal)
2392         if smil is False:
2393             return {}
2394         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2395
2396     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2397         return self._download_xml(
2398             smil_url, video_id, 'Downloading SMIL file',
2399             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2400
2401     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2402         namespace = self._parse_smil_namespace(smil)
2403
2404         formats = self._parse_smil_formats(
2405             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2406         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2407
2408         video_id = os.path.splitext(url_basename(smil_url))[0]
2409         title = None
2410         description = None
2411         upload_date = None
2412         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2413             name = meta.attrib.get('name')
2414             content = meta.attrib.get('content')
2415             if not name or not content:
2416                 continue
2417             if not title and name == 'title':
2418                 title = content
2419             elif not description and name in ('description', 'abstract'):
2420                 description = content
2421             elif not upload_date and name == 'date':
2422                 upload_date = unified_strdate(content)
2423
2424         thumbnails = [{
2425             'id': image.get('type'),
2426             'url': image.get('src'),
2427             'width': int_or_none(image.get('width')),
2428             'height': int_or_none(image.get('height')),
2429         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2430
2431         return {
2432             'id': video_id,
2433             'title': title or video_id,
2434             'description': description,
2435             'upload_date': upload_date,
2436             'thumbnails': thumbnails,
2437             'formats': formats,
2438             'subtitles': subtitles,
2439         }
2440
2441     def _parse_smil_namespace(self, smil):
2442         return self._search_regex(
2443             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2444
2445     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2446         base = smil_url
2447         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2448             b = meta.get('base') or meta.get('httpBase')
2449             if b:
2450                 base = b
2451                 break
2452
2453         formats = []
2454         rtmp_count = 0
2455         http_count = 0
2456         m3u8_count = 0
2457         imgs_count = 0
2458
2459         srcs = set()
2460         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2461         for medium in media:
2462             src = medium.get('src')
2463             if not src or src in srcs:
2464                 continue
2465             srcs.add(src)
2466
2467             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2468             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2469             width = int_or_none(medium.get('width'))
2470             height = int_or_none(medium.get('height'))
2471             proto = medium.get('proto')
2472             ext = medium.get('ext')
2473             src_ext = determine_ext(src)
2474             streamer = medium.get('streamer') or base
2475
2476             if proto == 'rtmp' or streamer.startswith('rtmp'):
2477                 rtmp_count += 1
2478                 formats.append({
2479                     'url': streamer,
2480                     'play_path': src,
2481                     'ext': 'flv',
2482                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2483                     'tbr': bitrate,
2484                     'filesize': filesize,
2485                     'width': width,
2486                     'height': height,
2487                 })
2488                 if transform_rtmp_url:
2489                     streamer, src = transform_rtmp_url(streamer, src)
2490                     formats[-1].update({
2491                         'url': streamer,
2492                         'play_path': src,
2493                     })
2494                 continue
2495
2496             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2497             src_url = src_url.strip()
2498
2499             if proto == 'm3u8' or src_ext == 'm3u8':
2500                 m3u8_formats = self._extract_m3u8_formats(
2501                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2502                 if len(m3u8_formats) == 1:
2503                     m3u8_count += 1
2504                     m3u8_formats[0].update({
2505                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2506                         'tbr': bitrate,
2507                         'width': width,
2508                         'height': height,
2509                     })
2510                 formats.extend(m3u8_formats)
2511             elif src_ext == 'f4m':
2512                 f4m_url = src_url
2513                 if not f4m_params:
2514                     f4m_params = {
2515                         'hdcore': '3.2.0',
2516                         'plugin': 'flowplayer-3.2.0.1',
2517                     }
2518                 f4m_url += '&' if '?' in f4m_url else '?'
2519                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2520                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2521             elif src_ext == 'mpd':
2522                 formats.extend(self._extract_mpd_formats(
2523                     src_url, video_id, mpd_id='dash', fatal=False))
2524             elif re.search(r'\.ism/[Mm]anifest', src_url):
2525                 formats.extend(self._extract_ism_formats(
2526                     src_url, video_id, ism_id='mss', fatal=False))
2527             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2528                 http_count += 1
2529                 formats.append({
2530                     'url': src_url,
2531                     'ext': ext or src_ext or 'flv',
2532                     'format_id': 'http-%d' % (bitrate or http_count),
2533                     'tbr': bitrate,
2534                     'filesize': filesize,
2535                     'width': width,
2536                     'height': height,
2537                 })
2538
2539         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2540             src = medium.get('src')
2541             if not src or src in srcs:
2542                 continue
2543             srcs.add(src)
2544
2545             imgs_count += 1
2546             formats.append({
2547                 'format_id': 'imagestream-%d' % (imgs_count),
2548                 'url': src,
2549                 'ext': mimetype2ext(medium.get('type')),
2550                 'acodec': 'none',
2551                 'vcodec': 'none',
2552                 'width': int_or_none(medium.get('width')),
2553                 'height': int_or_none(medium.get('height')),
2554                 'format_note': 'SMIL storyboards',
2555             })
2556
2557         return formats
2558
2559     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2560         urls = []
2561         subtitles = {}
2562         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2563             src = textstream.get('src')
2564             if not src or src in urls:
2565                 continue
2566             urls.append(src)
2567             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2568             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2569             subtitles.setdefault(lang, []).append({
2570                 'url': src,
2571                 'ext': ext,
2572             })
2573         return subtitles
2574
2575     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2576         xspf = self._download_xml(
2577             xspf_url, playlist_id, 'Downloading xpsf playlist',
2578             'Unable to download xspf manifest', fatal=fatal)
2579         if xspf is False:
2580             return []
2581         return self._parse_xspf(
2582             xspf, playlist_id, xspf_url=xspf_url,
2583             xspf_base_url=base_url(xspf_url))
2584
2585     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2586         NS_MAP = {
2587             'xspf': 'http://xspf.org/ns/0/',
2588             's1': 'http://static.streamone.nl/player/ns/0',
2589         }
2590
2591         entries = []
2592         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2593             title = xpath_text(
2594                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2595             description = xpath_text(
2596                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2597             thumbnail = xpath_text(
2598                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2599             duration = float_or_none(
2600                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2601
2602             formats = []
2603             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2604                 format_url = urljoin(xspf_base_url, location.text)
2605                 if not format_url:
2606                     continue
2607                 formats.append({
2608                     'url': format_url,
2609                     'manifest_url': xspf_url,
2610                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2611                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2612                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2613                 })
2614             self._sort_formats(formats)
2615
2616             entries.append({
2617                 'id': playlist_id,
2618                 'title': title,
2619                 'description': description,
2620                 'thumbnail': thumbnail,
2621                 'duration': duration,
2622                 'formats': formats,
2623             })
2624         return entries
2625
2626     def _extract_mpd_formats(self, *args, **kwargs):
2627         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2628         if subs:
2629             self._report_ignoring_subs('DASH')
2630         return fmts
2631
2632     def _extract_mpd_formats_and_subtitles(
2633             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2634             fatal=True, data=None, headers={}, query={}):
2635         res = self._download_xml_handle(
2636             mpd_url, video_id,
2637             note='Downloading MPD manifest' if note is None else note,
2638             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2639             fatal=fatal, data=data, headers=headers, query=query)
2640         if res is False:
2641             return [], {}
2642         mpd_doc, urlh = res
2643         if mpd_doc is None:
2644             return [], {}
2645         mpd_base_url = base_url(urlh.geturl())
2646
2647         return self._parse_mpd_formats_and_subtitles(
2648             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2649
2650     def _parse_mpd_formats(self, *args, **kwargs):
2651         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2652         if subs:
2653             self._report_ignoring_subs('DASH')
2654         return fmts
2655
2656     def _parse_mpd_formats_and_subtitles(
2657             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2658         """
2659         Parse formats from MPD manifest.
2660         References:
2661          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2662             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2663          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2664         """
2665         if not self.get_param('dynamic_mpd', True):
2666             if mpd_doc.get('type') == 'dynamic':
2667                 return [], {}
2668
2669         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2670
2671         def _add_ns(path):
2672             return self._xpath_ns(path, namespace)
2673
2674         def is_drm_protected(element):
2675             return element.find(_add_ns('ContentProtection')) is not None
2676
2677         def extract_multisegment_info(element, ms_parent_info):
2678             ms_info = ms_parent_info.copy()
2679
2680             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2681             # common attributes and elements.  We will only extract relevant
2682             # for us.
2683             def extract_common(source):
2684                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2685                 if segment_timeline is not None:
2686                     s_e = segment_timeline.findall(_add_ns('S'))
2687                     if s_e:
2688                         ms_info['total_number'] = 0
2689                         ms_info['s'] = []
2690                         for s in s_e:
2691                             r = int(s.get('r', 0))
2692                             ms_info['total_number'] += 1 + r
2693                             ms_info['s'].append({
2694                                 't': int(s.get('t', 0)),
2695                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2696                                 'd': int(s.attrib['d']),
2697                                 'r': r,
2698                             })
2699                 start_number = source.get('startNumber')
2700                 if start_number:
2701                     ms_info['start_number'] = int(start_number)
2702                 timescale = source.get('timescale')
2703                 if timescale:
2704                     ms_info['timescale'] = int(timescale)
2705                 segment_duration = source.get('duration')
2706                 if segment_duration:
2707                     ms_info['segment_duration'] = float(segment_duration)
2708
2709             def extract_Initialization(source):
2710                 initialization = source.find(_add_ns('Initialization'))
2711                 if initialization is not None:
2712                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2713
2714             segment_list = element.find(_add_ns('SegmentList'))
2715             if segment_list is not None:
2716                 extract_common(segment_list)
2717                 extract_Initialization(segment_list)
2718                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2719                 if segment_urls_e:
2720                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2721             else:
2722                 segment_template = element.find(_add_ns('SegmentTemplate'))
2723                 if segment_template is not None:
2724                     extract_common(segment_template)
2725                     media = segment_template.get('media')
2726                     if media:
2727                         ms_info['media'] = media
2728                     initialization = segment_template.get('initialization')
2729                     if initialization:
2730                         ms_info['initialization'] = initialization
2731                     else:
2732                         extract_Initialization(segment_template)
2733             return ms_info
2734
2735         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2736         formats, subtitles = [], {}
2737         stream_numbers = collections.defaultdict(int)
2738         for period in mpd_doc.findall(_add_ns('Period')):
2739             period_duration = parse_duration(period.get('duration')) or mpd_duration
2740             period_ms_info = extract_multisegment_info(period, {
2741                 'start_number': 1,
2742                 'timescale': 1,
2743             })
2744             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2745                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2746                 for representation in adaptation_set.findall(_add_ns('Representation')):
2747                     representation_attrib = adaptation_set.attrib.copy()
2748                     representation_attrib.update(representation.attrib)
2749                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2750                     mime_type = representation_attrib['mimeType']
2751                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2752
2753                     codecs = parse_codecs(representation_attrib.get('codecs', ''))
2754                     if content_type not in ('video', 'audio', 'text'):
2755                         if mime_type == 'image/jpeg':
2756                             content_type = mime_type
2757                         elif codecs['vcodec'] != 'none':
2758                             content_type = 'video'
2759                         elif codecs['acodec'] != 'none':
2760                             content_type = 'audio'
2761                         elif codecs.get('tcodec', 'none') != 'none':
2762                             content_type = 'text'
2763                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2764                             content_type = 'text'
2765                         else:
2766                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2767                             continue
2768
2769                     base_url = ''
2770                     for element in (representation, adaptation_set, period, mpd_doc):
2771                         base_url_e = element.find(_add_ns('BaseURL'))
2772                         if base_url_e is not None:
2773                             base_url = base_url_e.text + base_url
2774                             if re.match(r'^https?://', base_url):
2775                                 break
2776                     if mpd_base_url and base_url.startswith('/'):
2777                         base_url = compat_urlparse.urljoin(mpd_base_url, base_url)
2778                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2779                         if not mpd_base_url.endswith('/'):
2780                             mpd_base_url += '/'
2781                         base_url = mpd_base_url + base_url
2782                     representation_id = representation_attrib.get('id')
2783                     lang = representation_attrib.get('lang')
2784                     url_el = representation.find(_add_ns('BaseURL'))
2785                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2786                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2787                     if representation_id is not None:
2788                         format_id = representation_id
2789                     else:
2790                         format_id = content_type
2791                     if mpd_id:
2792                         format_id = mpd_id + '-' + format_id
2793                     if content_type in ('video', 'audio'):
2794                         f = {
2795                             'format_id': format_id,
2796                             'manifest_url': mpd_url,
2797                             'ext': mimetype2ext(mime_type),
2798                             'width': int_or_none(representation_attrib.get('width')),
2799                             'height': int_or_none(representation_attrib.get('height')),
2800                             'tbr': float_or_none(bandwidth, 1000),
2801                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2802                             'fps': int_or_none(representation_attrib.get('frameRate')),
2803                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2804                             'format_note': 'DASH %s' % content_type,
2805                             'filesize': filesize,
2806                             'container': mimetype2ext(mime_type) + '_dash',
2807                             **codecs
2808                         }
2809                     elif content_type == 'text':
2810                         f = {
2811                             'ext': mimetype2ext(mime_type),
2812                             'manifest_url': mpd_url,
2813                             'filesize': filesize,
2814                         }
2815                     elif content_type == 'image/jpeg':
2816                         # See test case in VikiIE
2817                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2818                         f = {
2819                             'format_id': format_id,
2820                             'ext': 'mhtml',
2821                             'manifest_url': mpd_url,
2822                             'format_note': 'DASH storyboards (jpeg)',
2823                             'acodec': 'none',
2824                             'vcodec': 'none',
2825                         }
2826                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2827                         f['has_drm'] = True
2828                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2829
2830                     def prepare_template(template_name, identifiers):
2831                         tmpl = representation_ms_info[template_name]
2832                         # First of, % characters outside $...$ templates
2833                         # must be escaped by doubling for proper processing
2834                         # by % operator string formatting used further (see
2835                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2836                         t = ''
2837                         in_template = False
2838                         for c in tmpl:
2839                             t += c
2840                             if c == '$':
2841                                 in_template = not in_template
2842                             elif c == '%' and not in_template:
2843                                 t += c
2844                         # Next, $...$ templates are translated to their
2845                         # %(...) counterparts to be used with % operator
2846                         if representation_id is not None:
2847                             t = t.replace('$RepresentationID$', representation_id)
2848                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2849                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2850                         t.replace('$$', '$')
2851                         return t
2852
2853                     # @initialization is a regular template like @media one
2854                     # so it should be handled just the same way (see
2855                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2856                     if 'initialization' in representation_ms_info:
2857                         initialization_template = prepare_template(
2858                             'initialization',
2859                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2860                             # $Time$ shall not be included for @initialization thus
2861                             # only $Bandwidth$ remains
2862                             ('Bandwidth', ))
2863                         representation_ms_info['initialization_url'] = initialization_template % {
2864                             'Bandwidth': bandwidth,
2865                         }
2866
2867                     def location_key(location):
2868                         return 'url' if re.match(r'^https?://', location) else 'path'
2869
2870                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2871
2872                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2873                         media_location_key = location_key(media_template)
2874
2875                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2876                         # can't be used at the same time
2877                         if '%(Number' in media_template and 's' not in representation_ms_info:
2878                             segment_duration = None
2879                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2880                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2881                                 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2882                             representation_ms_info['fragments'] = [{
2883                                 media_location_key: media_template % {
2884                                     'Number': segment_number,
2885                                     'Bandwidth': bandwidth,
2886                                 },
2887                                 'duration': segment_duration,
2888                             } for segment_number in range(
2889                                 representation_ms_info['start_number'],
2890                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2891                         else:
2892                             # $Number*$ or $Time$ in media template with S list available
2893                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2894                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2895                             representation_ms_info['fragments'] = []
2896                             segment_time = 0
2897                             segment_d = None
2898                             segment_number = representation_ms_info['start_number']
2899
2900                             def add_segment_url():
2901                                 segment_url = media_template % {
2902                                     'Time': segment_time,
2903                                     'Bandwidth': bandwidth,
2904                                     'Number': segment_number,
2905                                 }
2906                                 representation_ms_info['fragments'].append({
2907                                     media_location_key: segment_url,
2908                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2909                                 })
2910
2911                             for num, s in enumerate(representation_ms_info['s']):
2912                                 segment_time = s.get('t') or segment_time
2913                                 segment_d = s['d']
2914                                 add_segment_url()
2915                                 segment_number += 1
2916                                 for r in range(s.get('r', 0)):
2917                                     segment_time += segment_d
2918                                     add_segment_url()
2919                                     segment_number += 1
2920                                 segment_time += segment_d
2921                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2922                         # No media template
2923                         # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2924                         # or any YouTube dashsegments video
2925                         fragments = []
2926                         segment_index = 0
2927                         timescale = representation_ms_info['timescale']
2928                         for s in representation_ms_info['s']:
2929                             duration = float_or_none(s['d'], timescale)
2930                             for r in range(s.get('r', 0) + 1):
2931                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
2932                                 fragments.append({
2933                                     location_key(segment_uri): segment_uri,
2934                                     'duration': duration,
2935                                 })
2936                                 segment_index += 1
2937                         representation_ms_info['fragments'] = fragments
2938                     elif 'segment_urls' in representation_ms_info:
2939                         # Segment URLs with no SegmentTimeline
2940                         # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2941                         # https://github.com/ytdl-org/youtube-dl/pull/14844
2942                         fragments = []
2943                         segment_duration = float_or_none(
2944                             representation_ms_info['segment_duration'],
2945                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2946                         for segment_url in representation_ms_info['segment_urls']:
2947                             fragment = {
2948                                 location_key(segment_url): segment_url,
2949                             }
2950                             if segment_duration:
2951                                 fragment['duration'] = segment_duration
2952                             fragments.append(fragment)
2953                         representation_ms_info['fragments'] = fragments
2954                     # If there is a fragments key available then we correctly recognized fragmented media.
2955                     # Otherwise we will assume unfragmented media with direct access. Technically, such
2956                     # assumption is not necessarily correct since we may simply have no support for
2957                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2958                     if 'fragments' in representation_ms_info:
2959                         f.update({
2960                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
2961                             'url': mpd_url or base_url,
2962                             'fragment_base_url': base_url,
2963                             'fragments': [],
2964                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2965                         })
2966                         if 'initialization_url' in representation_ms_info:
2967                             initialization_url = representation_ms_info['initialization_url']
2968                             if not f.get('url'):
2969                                 f['url'] = initialization_url
2970                             f['fragments'].append({location_key(initialization_url): initialization_url})
2971                         f['fragments'].extend(representation_ms_info['fragments'])
2972                     else:
2973                         # Assuming direct URL to unfragmented media.
2974                         f['url'] = base_url
2975                     if content_type in ('video', 'audio', 'image/jpeg'):
2976                         f['manifest_stream_number'] = stream_numbers[f['url']]
2977                         stream_numbers[f['url']] += 1
2978                         formats.append(f)
2979                     elif content_type == 'text':
2980                         subtitles.setdefault(lang or 'und', []).append(f)
2981
2982         return formats, subtitles
2983
2984     def _extract_ism_formats(self, *args, **kwargs):
2985         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2986         if subs:
2987             self._report_ignoring_subs('ISM')
2988         return fmts
2989
2990     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2991         res = self._download_xml_handle(
2992             ism_url, video_id,
2993             note='Downloading ISM manifest' if note is None else note,
2994             errnote='Failed to download ISM manifest' if errnote is None else errnote,
2995             fatal=fatal, data=data, headers=headers, query=query)
2996         if res is False:
2997             return [], {}
2998         ism_doc, urlh = res
2999         if ism_doc is None:
3000             return [], {}
3001
3002         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
3003
3004     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
3005         """
3006         Parse formats from ISM manifest.
3007         References:
3008          1. [MS-SSTR]: Smooth Streaming Protocol,
3009             https://msdn.microsoft.com/en-us/library/ff469518.aspx
3010         """
3011         if ism_doc.get('IsLive') == 'TRUE':
3012             return [], {}
3013
3014         duration = int(ism_doc.attrib['Duration'])
3015         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
3016
3017         formats = []
3018         subtitles = {}
3019         for stream in ism_doc.findall('StreamIndex'):
3020             stream_type = stream.get('Type')
3021             if stream_type not in ('video', 'audio', 'text'):
3022                 continue
3023             url_pattern = stream.attrib['Url']
3024             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
3025             stream_name = stream.get('Name')
3026             stream_language = stream.get('Language', 'und')
3027             for track in stream.findall('QualityLevel'):
3028                 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
3029                 # TODO: add support for WVC1 and WMAP
3030                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
3031                     self.report_warning('%s is not a supported codec' % fourcc)
3032                     continue
3033                 tbr = int(track.attrib['Bitrate']) // 1000
3034                 # [1] does not mention Width and Height attributes. However,
3035                 # they're often present while MaxWidth and MaxHeight are
3036                 # missing, so should be used as fallbacks
3037                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3038                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3039                 sampling_rate = int_or_none(track.get('SamplingRate'))
3040
3041                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3042                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
3043
3044                 fragments = []
3045                 fragment_ctx = {
3046                     'time': 0,
3047                 }
3048                 stream_fragments = stream.findall('c')
3049                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3050                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3051                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3052                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3053                     if not fragment_ctx['duration']:
3054                         try:
3055                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
3056                         except IndexError:
3057                             next_fragment_time = duration
3058                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3059                     for _ in range(fragment_repeat):
3060                         fragments.append({
3061                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
3062                             'duration': fragment_ctx['duration'] / stream_timescale,
3063                         })
3064                         fragment_ctx['time'] += fragment_ctx['duration']
3065
3066                 if stream_type == 'text':
3067                     subtitles.setdefault(stream_language, []).append({
3068                         'ext': 'ismt',
3069                         'protocol': 'ism',
3070                         'url': ism_url,
3071                         'manifest_url': ism_url,
3072                         'fragments': fragments,
3073                         '_download_params': {
3074                             'stream_type': stream_type,
3075                             'duration': duration,
3076                             'timescale': stream_timescale,
3077                             'fourcc': fourcc,
3078                             'language': stream_language,
3079                             'codec_private_data': track.get('CodecPrivateData'),
3080                         }
3081                     })
3082                 elif stream_type in ('video', 'audio'):
3083                     formats.append({
3084                         'format_id': join_nonempty(ism_id, stream_name, tbr),
3085                         'url': ism_url,
3086                         'manifest_url': ism_url,
3087                         'ext': 'ismv' if stream_type == 'video' else 'isma',
3088                         'width': width,
3089                         'height': height,
3090                         'tbr': tbr,
3091                         'asr': sampling_rate,
3092                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
3093                         'acodec': 'none' if stream_type == 'video' else fourcc,
3094                         'protocol': 'ism',
3095                         'fragments': fragments,
3096                         'has_drm': ism_doc.find('Protection') is not None,
3097                         '_download_params': {
3098                             'stream_type': stream_type,
3099                             'duration': duration,
3100                             'timescale': stream_timescale,
3101                             'width': width or 0,
3102                             'height': height or 0,
3103                             'fourcc': fourcc,
3104                             'language': stream_language,
3105                             'codec_private_data': track.get('CodecPrivateData'),
3106                             'sampling_rate': sampling_rate,
3107                             'channels': int_or_none(track.get('Channels', 2)),
3108                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3109                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3110                         },
3111                     })
3112         return formats, subtitles
3113
3114     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
3115         def absolute_url(item_url):
3116             return urljoin(base_url, item_url)
3117
3118         def parse_content_type(content_type):
3119             if not content_type:
3120                 return {}
3121             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3122             if ctr:
3123                 mimetype, codecs = ctr.groups()
3124                 f = parse_codecs(codecs)
3125                 f['ext'] = mimetype2ext(mimetype)
3126                 return f
3127             return {}
3128
3129         def _media_formats(src, cur_media_type, type_info={}):
3130             full_url = absolute_url(src)
3131             ext = type_info.get('ext') or determine_ext(full_url)
3132             if ext == 'm3u8':
3133                 is_plain_url = False
3134                 formats = self._extract_m3u8_formats(
3135                     full_url, video_id, ext='mp4',
3136                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3137                     preference=preference, quality=quality, fatal=False)
3138             elif ext == 'mpd':
3139                 is_plain_url = False
3140                 formats = self._extract_mpd_formats(
3141                     full_url, video_id, mpd_id=mpd_id, fatal=False)
3142             else:
3143                 is_plain_url = True
3144                 formats = [{
3145                     'url': full_url,
3146                     'vcodec': 'none' if cur_media_type == 'audio' else None,
3147                 }]
3148             return is_plain_url, formats
3149
3150         entries = []
3151         # amp-video and amp-audio are very similar to their HTML5 counterparts
3152         # so we wll include them right here (see
3153         # https://www.ampproject.org/docs/reference/components/amp-video)
3154         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3155         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3156         media_tags = [(media_tag, media_tag_name, media_type, '')
3157                       for media_tag, media_tag_name, media_type
3158                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3159         media_tags.extend(re.findall(
3160             # We only allow video|audio followed by a whitespace or '>'.
3161             # Allowing more characters may end up in significant slow down (see
3162             # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
3163             # http://www.porntrex.com/maps/videositemap.xml).
3164             r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3165         for media_tag, _, media_type, media_content in media_tags:
3166             media_info = {
3167                 'formats': [],
3168                 'subtitles': {},
3169             }
3170             media_attributes = extract_attributes(media_tag)
3171             src = strip_or_none(media_attributes.get('src'))
3172             if src:
3173                 _, formats = _media_formats(src, media_type)
3174                 media_info['formats'].extend(formats)
3175             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3176             if media_content:
3177                 for source_tag in re.findall(r'<source[^>]+>', media_content):
3178                     s_attr = extract_attributes(source_tag)
3179                     # data-video-src and data-src are non standard but seen
3180                     # several times in the wild
3181                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
3182                     if not src:
3183                         continue
3184                     f = parse_content_type(s_attr.get('type'))
3185                     is_plain_url, formats = _media_formats(src, media_type, f)
3186                     if is_plain_url:
3187                         # width, height, res, label and title attributes are
3188                         # all not standard but seen several times in the wild
3189                         labels = [
3190                             s_attr.get(lbl)
3191                             for lbl in ('label', 'title')
3192                             if str_or_none(s_attr.get(lbl))
3193                         ]
3194                         width = int_or_none(s_attr.get('width'))
3195                         height = (int_or_none(s_attr.get('height'))
3196                                   or int_or_none(s_attr.get('res')))
3197                         if not width or not height:
3198                             for lbl in labels:
3199                                 resolution = parse_resolution(lbl)
3200                                 if not resolution:
3201                                     continue
3202                                 width = width or resolution.get('width')
3203                                 height = height or resolution.get('height')
3204                         for lbl in labels:
3205                             tbr = parse_bitrate(lbl)
3206                             if tbr:
3207                                 break
3208                         else:
3209                             tbr = None
3210                         f.update({
3211                             'width': width,
3212                             'height': height,
3213                             'tbr': tbr,
3214                             'format_id': s_attr.get('label') or s_attr.get('title'),
3215                         })
3216                         f.update(formats[0])
3217                         media_info['formats'].append(f)
3218                     else:
3219                         media_info['formats'].extend(formats)
3220                 for track_tag in re.findall(r'<track[^>]+>', media_content):
3221                     track_attributes = extract_attributes(track_tag)
3222                     kind = track_attributes.get('kind')
3223                     if not kind or kind in ('subtitles', 'captions'):
3224                         src = strip_or_none(track_attributes.get('src'))
3225                         if not src:
3226                             continue
3227                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3228                         media_info['subtitles'].setdefault(lang, []).append({
3229                             'url': absolute_url(src),
3230                         })
3231             for f in media_info['formats']:
3232                 f.setdefault('http_headers', {})['Referer'] = base_url
3233             if media_info['formats'] or media_info['subtitles']:
3234                 entries.append(media_info)
3235         return entries
3236
3237     def _extract_akamai_formats(self, *args, **kwargs):
3238         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3239         if subs:
3240             self._report_ignoring_subs('akamai')
3241         return fmts
3242
3243     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3244         signed = 'hdnea=' in manifest_url
3245         if not signed:
3246             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3247             manifest_url = re.sub(
3248                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3249                 '', manifest_url).strip('?')
3250
3251         formats = []
3252         subtitles = {}
3253
3254         hdcore_sign = 'hdcore=3.7.0'
3255         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3256         hds_host = hosts.get('hds')
3257         if hds_host:
3258             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3259         if 'hdcore=' not in f4m_url:
3260             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3261         f4m_formats = self._extract_f4m_formats(
3262             f4m_url, video_id, f4m_id='hds', fatal=False)
3263         for entry in f4m_formats:
3264             entry.update({'extra_param_to_segment_url': hdcore_sign})
3265         formats.extend(f4m_formats)
3266
3267         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3268         hls_host = hosts.get('hls')
3269         if hls_host:
3270             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3271         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3272             m3u8_url, video_id, 'mp4', 'm3u8_native',
3273             m3u8_id='hls', fatal=False)
3274         formats.extend(m3u8_formats)
3275         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3276
3277         http_host = hosts.get('http')
3278         if http_host and m3u8_formats and not signed:
3279             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3280             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3281             qualities_length = len(qualities)
3282             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3283                 i = 0
3284                 for f in m3u8_formats:
3285                     if f['vcodec'] != 'none':
3286                         for protocol in ('http', 'https'):
3287                             http_f = f.copy()
3288                             del http_f['manifest_url']
3289                             http_url = re.sub(
3290                                 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
3291                             http_f.update({
3292                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3293                                 'url': http_url,
3294                                 'protocol': protocol,
3295                             })
3296                             formats.append(http_f)
3297                         i += 1
3298
3299         return formats, subtitles
3300
3301     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3302         query = compat_urlparse.urlparse(url).query
3303         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3304         mobj = re.search(
3305             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3306         url_base = mobj.group('url')
3307         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3308         formats = []
3309
3310         def manifest_url(manifest):
3311             m_url = '%s/%s' % (http_base_url, manifest)
3312             if query:
3313                 m_url += '?%s' % query
3314             return m_url
3315
3316         if 'm3u8' not in skip_protocols:
3317             formats.extend(self._extract_m3u8_formats(
3318                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3319                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3320         if 'f4m' not in skip_protocols:
3321             formats.extend(self._extract_f4m_formats(
3322                 manifest_url('manifest.f4m'),
3323                 video_id, f4m_id='hds', fatal=False))
3324         if 'dash' not in skip_protocols:
3325             formats.extend(self._extract_mpd_formats(
3326                 manifest_url('manifest.mpd'),
3327                 video_id, mpd_id='dash', fatal=False))
3328         if re.search(r'(?:/smil:|\.smil)', url_base):
3329             if 'smil' not in skip_protocols:
3330                 rtmp_formats = self._extract_smil_formats(
3331                     manifest_url('jwplayer.smil'),
3332                     video_id, fatal=False)
3333                 for rtmp_format in rtmp_formats:
3334                     rtsp_format = rtmp_format.copy()
3335                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3336                     del rtsp_format['play_path']
3337                     del rtsp_format['ext']
3338                     rtsp_format.update({
3339                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3340                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3341                         'protocol': 'rtsp',
3342                     })
3343                     formats.extend([rtmp_format, rtsp_format])
3344         else:
3345             for protocol in ('rtmp', 'rtsp'):
3346                 if protocol not in skip_protocols:
3347                     formats.append({
3348                         'url': '%s:%s' % (protocol, url_base),
3349                         'format_id': protocol,
3350                         'protocol': protocol,
3351                     })
3352         return formats
3353
3354     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3355         mobj = re.search(
3356             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3357             webpage)
3358         if mobj:
3359             try:
3360                 jwplayer_data = self._parse_json(mobj.group('options'),
3361                                                  video_id=video_id,
3362                                                  transform_source=transform_source)
3363             except ExtractorError:
3364                 pass
3365             else:
3366                 if isinstance(jwplayer_data, dict):
3367                     return jwplayer_data
3368
3369     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3370         jwplayer_data = self._find_jwplayer_data(
3371             webpage, video_id, transform_source=js_to_json)
3372         return self._parse_jwplayer_data(
3373             jwplayer_data, video_id, *args, **kwargs)
3374
3375     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3376                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3377         # JWPlayer backward compatibility: flattened playlists
3378         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3379         if 'playlist' not in jwplayer_data:
3380             jwplayer_data = {'playlist': [jwplayer_data]}
3381
3382         entries = []
3383
3384         # JWPlayer backward compatibility: single playlist item
3385         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3386         if not isinstance(jwplayer_data['playlist'], list):
3387             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3388
3389         for video_data in jwplayer_data['playlist']:
3390             # JWPlayer backward compatibility: flattened sources
3391             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3392             if 'sources' not in video_data:
3393                 video_data['sources'] = [video_data]
3394
3395             this_video_id = video_id or video_data['mediaid']
3396
3397             formats = self._parse_jwplayer_formats(
3398                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3399                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3400
3401             subtitles = {}
3402             tracks = video_data.get('tracks')
3403             if tracks and isinstance(tracks, list):
3404                 for track in tracks:
3405                     if not isinstance(track, dict):
3406                         continue
3407                     track_kind = track.get('kind')
3408                     if not track_kind or not isinstance(track_kind, compat_str):
3409                         continue
3410                     if track_kind.lower() not in ('captions', 'subtitles'):
3411                         continue
3412                     track_url = urljoin(base_url, track.get('file'))
3413                     if not track_url:
3414                         continue
3415                     subtitles.setdefault(track.get('label') or 'en', []).append({
3416                         'url': self._proto_relative_url(track_url)
3417                     })
3418
3419             entry = {
3420                 'id': this_video_id,
3421                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3422                 'description': clean_html(video_data.get('description')),
3423                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3424                 'timestamp': int_or_none(video_data.get('pubdate')),
3425                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3426                 'subtitles': subtitles,
3427             }
3428             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3429             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3430                 entry.update({
3431                     '_type': 'url_transparent',
3432                     'url': formats[0]['url'],
3433                 })
3434             else:
3435                 self._sort_formats(formats)
3436                 entry['formats'] = formats
3437             entries.append(entry)
3438         if len(entries) == 1:
3439             return entries[0]
3440         else:
3441             return self.playlist_result(entries)
3442
3443     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3444                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3445         urls = []
3446         formats = []
3447         for source in jwplayer_sources_data:
3448             if not isinstance(source, dict):
3449                 continue
3450             source_url = urljoin(
3451                 base_url, self._proto_relative_url(source.get('file')))
3452             if not source_url or source_url in urls:
3453                 continue
3454             urls.append(source_url)
3455             source_type = source.get('type') or ''
3456             ext = mimetype2ext(source_type) or determine_ext(source_url)
3457             if source_type == 'hls' or ext == 'm3u8':
3458                 formats.extend(self._extract_m3u8_formats(
3459                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3460                     m3u8_id=m3u8_id, fatal=False))
3461             elif source_type == 'dash' or ext == 'mpd':
3462                 formats.extend(self._extract_mpd_formats(
3463                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3464             elif ext == 'smil':
3465                 formats.extend(self._extract_smil_formats(
3466                     source_url, video_id, fatal=False))
3467             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3468             elif source_type.startswith('audio') or ext in (
3469                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3470                 formats.append({
3471                     'url': source_url,
3472                     'vcodec': 'none',
3473                     'ext': ext,
3474                 })
3475             else:
3476                 height = int_or_none(source.get('height'))
3477                 if height is None:
3478                     # Often no height is provided but there is a label in
3479                     # format like "1080p", "720p SD", or 1080.
3480                     height = int_or_none(self._search_regex(
3481                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3482                         'height', default=None))
3483                 a_format = {
3484                     'url': source_url,
3485                     'width': int_or_none(source.get('width')),
3486                     'height': height,
3487                     'tbr': int_or_none(source.get('bitrate')),
3488                     'ext': ext,
3489                 }
3490                 if source_url.startswith('rtmp'):
3491                     a_format['ext'] = 'flv'
3492                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3493                     # of jwplayer.flash.swf
3494                     rtmp_url_parts = re.split(
3495                         r'((?:mp4|mp3|flv):)', source_url, 1)
3496                     if len(rtmp_url_parts) == 3:
3497                         rtmp_url, prefix, play_path = rtmp_url_parts
3498                         a_format.update({
3499                             'url': rtmp_url,
3500                             'play_path': prefix + play_path,
3501                         })
3502                     if rtmp_params:
3503                         a_format.update(rtmp_params)
3504                 formats.append(a_format)
3505         return formats
3506
3507     def _live_title(self, name):
3508         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3509         return name
3510
3511     def _int(self, v, name, fatal=False, **kwargs):
3512         res = int_or_none(v, **kwargs)
3513         if res is None:
3514             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3515             if fatal:
3516                 raise ExtractorError(msg)
3517             else:
3518                 self.report_warning(msg)
3519         return res
3520
3521     def _float(self, v, name, fatal=False, **kwargs):
3522         res = float_or_none(v, **kwargs)
3523         if res is None:
3524             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3525             if fatal:
3526                 raise ExtractorError(msg)
3527             else:
3528                 self.report_warning(msg)
3529         return res
3530
3531     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3532                     path='/', secure=False, discard=False, rest={}, **kwargs):
3533         cookie = compat_cookiejar_Cookie(
3534             0, name, value, port, port is not None, domain, True,
3535             domain.startswith('.'), path, True, secure, expire_time,
3536             discard, None, None, rest)
3537         self._downloader.cookiejar.set_cookie(cookie)
3538
3539     def _get_cookies(self, url):
3540         """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3541         req = sanitized_Request(url)
3542         self._downloader.cookiejar.add_cookie_header(req)
3543         return compat_cookies_SimpleCookie(req.get_header('Cookie'))
3544
3545     def _apply_first_set_cookie_header(self, url_handle, cookie):
3546         """
3547         Apply first Set-Cookie header instead of the last. Experimental.
3548
3549         Some sites (e.g. [1-3]) may serve two cookies under the same name
3550         in Set-Cookie header and expect the first (old) one to be set rather
3551         than second (new). However, as of RFC6265 the newer one cookie
3552         should be set into cookie store what actually happens.
3553         We will workaround this issue by resetting the cookie to
3554         the first one manually.
3555         1. https://new.vk.com/
3556         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3557         3. https://learning.oreilly.com/
3558         """
3559         for header, cookies in url_handle.headers.items():
3560             if header.lower() != 'set-cookie':
3561                 continue
3562             if sys.version_info[0] >= 3:
3563                 cookies = cookies.encode('iso-8859-1')
3564             cookies = cookies.decode('utf-8')
3565             cookie_value = re.search(
3566                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3567             if cookie_value:
3568                 value, domain = cookie_value.groups()
3569                 self._set_cookie(domain, cookie, value)
3570                 break
3571
3572     def get_testcases(self, include_onlymatching=False):
3573         t = getattr(self, '_TEST', None)
3574         if t:
3575             assert not hasattr(self, '_TESTS'), \
3576                 '%s has _TEST and _TESTS' % type(self).__name__
3577             tests = [t]
3578         else:
3579             tests = getattr(self, '_TESTS', [])
3580         for t in tests:
3581             if not include_onlymatching and t.get('only_matching', False):
3582                 continue
3583             t['name'] = type(self).__name__[:-len('IE')]
3584             yield t
3585
3586     def is_suitable(self, age_limit):
3587         """ Test whether the extractor is generally suitable for the given
3588         age limit (i.e. pornographic sites are not, all others usually are) """
3589
3590         any_restricted = False
3591         for tc in self.get_testcases(include_onlymatching=False):
3592             if tc.get('playlist', []):
3593                 tc = tc['playlist'][0]
3594             is_restricted = age_restricted(
3595                 tc.get('info_dict', {}).get('age_limit'), age_limit)
3596             if not is_restricted:
3597                 return True
3598             any_restricted = any_restricted or is_restricted
3599         return not any_restricted
3600
3601     def extract_subtitles(self, *args, **kwargs):
3602         if (self.get_param('writesubtitles', False)
3603                 or self.get_param('listsubtitles')):
3604             return self._get_subtitles(*args, **kwargs)
3605         return {}
3606
3607     def _get_subtitles(self, *args, **kwargs):
3608         raise NotImplementedError('This method must be implemented by subclasses')
3609
3610     def extract_comments(self, *args, **kwargs):
3611         if not self.get_param('getcomments'):
3612             return None
3613         generator = self._get_comments(*args, **kwargs)
3614
3615         def extractor():
3616             comments = []
3617             interrupted = True
3618             try:
3619                 while True:
3620                     comments.append(next(generator))
3621             except StopIteration:
3622                 interrupted = False
3623             except KeyboardInterrupt:
3624                 self.to_screen('Interrupted by user')
3625             except Exception as e:
3626                 if self.get_param('ignoreerrors') is not True:
3627                     raise
3628                 self._downloader.report_error(e)
3629             comment_count = len(comments)
3630             self.to_screen(f'Extracted {comment_count} comments')
3631             return {
3632                 'comments': comments,
3633                 'comment_count': None if interrupted else comment_count
3634             }
3635         return extractor
3636
3637     def _get_comments(self, *args, **kwargs):
3638         raise NotImplementedError('This method must be implemented by subclasses')
3639
3640     @staticmethod
3641     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3642         """ Merge subtitle items for one language. Items with duplicated URLs
3643         will be dropped. """
3644         list1_urls = set([item['url'] for item in subtitle_list1])
3645         ret = list(subtitle_list1)
3646         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
3647         return ret
3648
3649     @classmethod
3650     def _merge_subtitles(cls, *dicts, target=None):
3651         """ Merge subtitle dictionaries, language by language. """
3652         if target is None:
3653             target = {}
3654         for d in dicts:
3655             for lang, subs in d.items():
3656                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3657         return target
3658
3659     def extract_automatic_captions(self, *args, **kwargs):
3660         if (self.get_param('writeautomaticsub', False)
3661                 or self.get_param('listsubtitles')):
3662             return self._get_automatic_captions(*args, **kwargs)
3663         return {}
3664
3665     def _get_automatic_captions(self, *args, **kwargs):
3666         raise NotImplementedError('This method must be implemented by subclasses')
3667
3668     def mark_watched(self, *args, **kwargs):
3669         if not self.get_param('mark_watched', False):
3670             return
3671         if (self._get_login_info()[0] is not None
3672                 or self.get_param('cookiefile')
3673                 or self.get_param('cookiesfrombrowser')):
3674             self._mark_watched(*args, **kwargs)
3675
3676     def _mark_watched(self, *args, **kwargs):
3677         raise NotImplementedError('This method must be implemented by subclasses')
3678
3679     def geo_verification_headers(self):
3680         headers = {}
3681         geo_verification_proxy = self.get_param('geo_verification_proxy')
3682         if geo_verification_proxy:
3683             headers['Ytdl-request-proxy'] = geo_verification_proxy
3684         return headers
3685
3686     def _generic_id(self, url):
3687         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3688
3689     def _generic_title(self, url):
3690         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3691
3692     @staticmethod
3693     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3694         all_known = all(map(
3695             lambda x: x is not None,
3696             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3697         return (
3698             'private' if is_private
3699             else 'premium_only' if needs_premium
3700             else 'subscriber_only' if needs_subscription
3701             else 'needs_auth' if needs_auth
3702             else 'unlisted' if is_unlisted
3703             else 'public' if all_known
3704             else None)
3705
3706     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3707         '''
3708         @returns            A list of values for the extractor argument given by "key"
3709                             or "default" if no such key is present
3710         @param default      The default value to return when the key is not present (default: [])
3711         @param casesense    When false, the values are converted to lower case
3712         '''
3713         val = traverse_obj(
3714             self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key))
3715         if val is None:
3716             return [] if default is NO_DEFAULT else default
3717         return list(val) if casesense else [x.lower() for x in val]
3718
3719     def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3720         if not playlist_id or not video_id:
3721             return not video_id
3722
3723         no_playlist = (smuggled_data or {}).get('force_noplaylist')
3724         if no_playlist is not None:
3725             return not no_playlist
3726
3727         video_id = '' if video_id is True else f' {video_id}'
3728         playlist_id = '' if playlist_id is True else f' {playlist_id}'
3729         if self.get_param('noplaylist'):
3730             self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3731             return False
3732         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3733         return True
3734
3735
3736 class SearchInfoExtractor(InfoExtractor):
3737     """
3738     Base class for paged search queries extractors.
3739     They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
3740     Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
3741     """
3742
3743     _MAX_RESULTS = float('inf')
3744
3745     @classmethod
3746     def _make_valid_url(cls):
3747         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3748
3749     def _real_extract(self, query):
3750         prefix, query = self._match_valid_url(query).group('prefix', 'query')
3751         if prefix == '':
3752             return self._get_n_results(query, 1)
3753         elif prefix == 'all':
3754             return self._get_n_results(query, self._MAX_RESULTS)
3755         else:
3756             n = int(prefix)
3757             if n <= 0:
3758                 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
3759             elif n > self._MAX_RESULTS:
3760                 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3761                 n = self._MAX_RESULTS
3762             return self._get_n_results(query, n)
3763
3764     def _get_n_results(self, query, n):
3765         """Get a specified number of results for a query.
3766         Either this function or _search_results must be overridden by subclasses """
3767         return self.playlist_result(
3768             itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
3769             query, query)
3770
3771     def _search_results(self, query):
3772         """Returns an iterator of search results"""
3773         raise NotImplementedError('This method must be implemented by subclasses')
3774
3775     @property
3776     def SEARCH_KEY(self):
3777         return self._SEARCH_KEY