yt_dlp/extractor/common.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import base64
   5 import collections
   6 import hashlib
   7 import itertools
   8 import json
   9 import netrc
  10 import os
  11 import random
  12 import re
  13 import sys
  14 import time
  15 import math
  16
  17 from ..compat import (
  18     compat_cookiejar_Cookie,
  19     compat_cookies_SimpleCookie,
  20     compat_etree_Element,
  21     compat_etree_fromstring,
  22     compat_expanduser,
  23     compat_getpass,
  24     compat_http_client,
  25     compat_os_name,
  26     compat_str,
  27     compat_urllib_error,
  28     compat_urllib_parse_unquote,
  29     compat_urllib_parse_urlencode,
  30     compat_urllib_request,
  31     compat_urlparse,
  32     compat_xml_parse_error,
  33 )
  34 from ..downloader import FileDownloader
  35 from ..downloader.f4m import (
  36     get_base_url,
  37     remove_encrypted_media,
  38 )
  39 from ..utils import (
  40     age_restricted,
  41     base_url,
  42     bug_reports_message,
  43     clean_html,
  44     compiled_regex_type,
  45     determine_ext,
  46     determine_protocol,
  47     dict_get,
  48     error_to_compat_str,
  49     extract_attributes,
  50     ExtractorError,
  51     fix_xml_ampersands,
  52     float_or_none,
  53     format_field,
  54     GeoRestrictedError,
  55     GeoUtils,
  56     int_or_none,
  57     join_nonempty,
  58     js_to_json,
  59     JSON_LD_RE,
  60     mimetype2ext,
  61     network_exceptions,
  62     NO_DEFAULT,
  63     orderedSet,
  64     parse_bitrate,
  65     parse_codecs,
  66     parse_duration,
  67     parse_iso8601,
  68     parse_m3u8_attributes,
  69     parse_resolution,
  70     RegexNotFoundError,
  71     sanitize_filename,
  72     sanitized_Request,
  73     str_or_none,
  74     str_to_int,
  75     strip_or_none,
  76     traverse_obj,
  77     unescapeHTML,
  78     UnsupportedError,
  79     unified_strdate,
  80     unified_timestamp,
  81     update_Request,
  82     update_url_query,
  83     url_basename,
  84     url_or_none,
  85     urljoin,
  86     variadic,
  87     xpath_element,
  88     xpath_text,
  89     xpath_with_ns,
  90 )
  91
  92
  93 class InfoExtractor(object):
  94     """Information Extractor class.
  95
  96     Information extractors are the classes that, given a URL, extract
  97     information about the video (or videos) the URL refers to. This
  98     information includes the real video URL, the video title, author and
  99     others. The information is stored in a dictionary which is then
 100     passed to the YoutubeDL. The YoutubeDL processes this
 101     information possibly downloading the video to the file system, among
 102     other possible outcomes.
 103
 104     The type field determines the type of the result.
 105     By far the most common value (and the default if _type is missing) is
 106     "video", which indicates a single video.
 107
 108     For a video, the dictionaries must include the following fields:
 109
 110     id:             Video identifier.
 111     title:          Video title, unescaped.
 112
 113     Additionally, it must contain either a formats entry or a url one:
 114
 115     formats:        A list of dictionaries for each format available, ordered
 116                     from worst to best quality.
 117
 118                     Potential fields:
 119                     * url        The mandatory URL representing the media:
 120                                    for plain file media - HTTP URL of this file,
 121                                    for RTMP - RTMP URL,
 122                                    for HLS - URL of the M3U8 media playlist,
 123                                    for HDS - URL of the F4M manifest,
 124                                    for DASH
 125                                      - HTTP URL to plain file media (in case of
 126                                        unfragmented media)
 127                                      - URL of the MPD manifest or base URL
 128                                        representing the media if MPD manifest
 129                                        is parsed from a string (in case of
 130                                        fragmented media)
 131                                    for MSS - URL of the ISM manifest.
 132                     * manifest_url
 133                                  The URL of the manifest file in case of
 134                                  fragmented media:
 135                                    for HLS - URL of the M3U8 master playlist,
 136                                    for HDS - URL of the F4M manifest,
 137                                    for DASH - URL of the MPD manifest,
 138                                    for MSS - URL of the ISM manifest.
 139                     * ext        Will be calculated from URL if missing
 140                     * format     A human-readable description of the format
 141                                  ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
 143                                  and format_note fields if missing.
 144                     * format_id  A short description of the format
 145                                  ("mp4_h264_opus" or "19").
 146                                 Technically optional, but strongly recommended.
 147                     * format_note Additional info about the format
 148                                  ("3D" or "DASH video")
 149                     * width      Width of the video, if known
 150                     * height     Height of the video, if known
 151                     * resolution Textual description of width and height
 152                     * dynamic_range The dynamic range of the video. One of:
                                 "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
 154                     * tbr        Average bitrate of audio and video in KBit/s
 155                     * abr        Average audio bitrate in KBit/s
 156                     * acodec     Name of the audio codec in use
 157                     * asr        Audio sampling rate in Hertz
 158                     * vbr        Average video bitrate in KBit/s
 159                     * fps        Frame rate
 160                     * vcodec     Name of the video codec in use
 161                     * container  Name of the container format
 162                     * filesize   The number of bytes, if known in advance
 163                     * filesize_approx  An estimate for the number of bytes
 164                     * player_url SWF Player URL (used for rtmpdump).
 165                     * protocol   The protocol that will be used for the actual
 166                                  download, lower-case. One of "http", "https" or
 167                                  one of the protocols defined in downloader.PROTOCOL_MAP
 168                     * fragment_base_url
 169                                  Base URL for fragments. Each fragment's path
 170                                  value (if present) will be relative to
 171                                  this URL.
 172                     * fragments  A list of fragments of a fragmented media.
 173                                  Each fragment entry must contain either an url
 174                                  or a path. If an url is present it should be
 175                                  considered by a client. Otherwise both path and
 176                                  fragment_base_url must be present. Here is
 177                                  the list of all potential fields:
 178                                  * "url" - fragment's URL
 179                                  * "path" - fragment's path relative to
 180                                             fragment_base_url
 181                                  * "duration" (optional, int or float)
 182                                  * "filesize" (optional, int)
 183                     * is_from_start  Is a live format that can be downloaded
 184                                 from the start. Boolean
 185                     * preference Order number of this format. If this field is
 186                                  present and not None, the formats get sorted
 187                                  by this field, regardless of all other values.
 188                                  -1 for default (order by other properties),
 189                                  -2 or smaller for less than default.
 190                                  < -1000 to hide the format (if there is
 191                                     another one which is strictly better)
 192                     * language   Language code, e.g. "de" or "en-US".
 193                     * language_preference  Is this in the language mentioned in
 194                                  the URL?
 195                                  10 if it's what the URL is about,
 196                                  -1 for default (don't know),
 197                                  -10 otherwise, other values reserved for now.
 198                     * quality    Order number of the video quality of this
 199                                  format, irrespective of the file format.
 200                                  -1 for default (order by other properties),
 201                                  -2 or smaller for less than default.
 202                     * source_preference  Order number for this video source
 203                                   (quality takes higher priority)
 204                                  -1 for default (order by other properties),
 205                                  -2 or smaller for less than default.
 206                     * http_headers  A dictionary of additional HTTP headers
 207                                  to add to the request.
 208                     * stretched_ratio  If given and not 1, indicates that the
 209                                  video's pixels are not square.
 210                                  width : height ratio as float.
 211                     * no_resume  The server does not support resuming the
 212                                  (HTTP or RTMP) download. Boolean.
 213                     * has_drm    The format has DRM and cannot be downloaded. Boolean
 214                     * downloader_options  A dictionary of downloader options as
 215                                  described in FileDownloader
 216                     RTMP formats can also have the additional fields: page_url,
 217                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 218                     rtmp_protocol, rtmp_real_time
 219
 220     url:            Final video URL.
 221     ext:            Video filename extension.
 222     format:         The video format, defaults to ext (used for --get-format)
 223     player_url:     SWF Player URL (used for rtmpdump).
 224
 225     The following fields are optional:
 226
 227     alt_title:      A secondary title of the video.
 228     display_id      An alternative identifier for the video, not necessarily
 229                     unique, but available before title. Typically, id is
 230                     something like "4234987", title "Dancing naked mole rats",
 231                     and display_id "dancing-naked-mole-rats"
 232     thumbnails:     A list of dictionaries, with the following entries:
 233                         * "id" (optional, string) - Thumbnail format ID
 234                         * "url"
 235                         * "preference" (optional, int) - quality of the image
 236                         * "width" (optional, int)
 237                         * "height" (optional, int)
 238                         * "resolution" (optional, string "{width}x{height}",
 239                                         deprecated)
 240                         * "filesize" (optional, int)
 241     thumbnail:      Full URL to a video thumbnail image.
 242     description:    Full video description.
 243     uploader:       Full name of the video uploader.
 244     license:        License name the video is licensed under.
 245     creator:        The creator of the video.
 246     release_timestamp: UNIX timestamp of the moment the video was released.
 247     release_date:   The date (YYYYMMDD) when the video was released.
 248     timestamp:      UNIX timestamp of the moment the video was uploaded
 249     upload_date:    Video upload date (YYYYMMDD).
 250                     If not explicitly set, calculated from timestamp.
 251     uploader_id:    Nickname or id of the video uploader.
 252     uploader_url:   Full URL to a personal webpage of the video uploader.
 253     channel:        Full name of the channel the video is uploaded on.
 254                     Note that channel fields may or may not repeat uploader
 255                     fields. This depends on a particular extractor.
 256     channel_id:     Id of the channel.
 257     channel_url:    Full URL to a channel webpage.
 258     location:       Physical location where the video was filmed.
 259     subtitles:      The available subtitles as a dictionary in the format
 260                     {tag: subformats}. "tag" is usually a language code, and
 261                     "subformats" is a list sorted from lower to higher
 262                     preference, each element is a dictionary with the "ext"
 263                     entry and one of:
 264                         * "data": The subtitles file contents
 265                         * "url": A URL pointing to the subtitles file
 266                     It can optionally also have:
 267                         * "name": Name or description of the subtitles
 268                     "ext" will be calculated from URL if missing
 269     automatic_captions: Like 'subtitles'; contains automatically generated
 270                     captions instead of normal subtitles
 271     duration:       Length of the video in seconds, as an integer or float.
 272     view_count:     How many users have watched the video on the platform.
 273     like_count:     Number of positive ratings of the video
 274     dislike_count:  Number of negative ratings of the video
 275     repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
 277     comment_count:  Number of comments on the video
 278     comments:       A list of comments, each with one or more of the following
 279                     properties (all but one of text or html optional):
 280                         * "author" - human-readable name of the comment author
 281                         * "author_id" - user ID of the comment author
 282                         * "author_thumbnail" - The thumbnail of the comment author
 283                         * "id" - Comment ID
 284                         * "html" - Comment as HTML
 285                         * "text" - Plain text of the comment
 286                         * "timestamp" - UNIX timestamp of comment
 287                         * "parent" - ID of the comment this one is replying to.
 288                                      Set to "root" to indicate that this is a
 289                                      comment to the original video.
 290                         * "like_count" - Number of positive ratings of the comment
 291                         * "dislike_count" - Number of negative ratings of the comment
 292                         * "is_favorited" - Whether the comment is marked as
 293                                            favorite by the video uploader
 294                         * "author_is_uploader" - Whether the comment is made by
 295                                                  the video uploader
 296     age_limit:      Age restriction for the video, as an integer (years)
 297     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 298                     should allow to get the same result again. (It will be set
 299                     by YoutubeDL if it's missing)
 300     categories:     A list of categories that the video falls in, for example
 301                     ["Sports", "Berlin"]
 302     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 303     cast:           A list of the video cast
 304     is_live:        True, False, or None (=unknown). Whether this video is a
 305                     live stream that goes on instead of a fixed-length video.
 306     was_live:       True, False, or None (=unknown). Whether this video was
 307                     originally a live stream.
 308     live_status:    'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
 309                     If absent, automatically set from is_live, was_live
 310     start_time:     Time in seconds where the reproduction should start, as
 311                     specified in the URL.
 312     end_time:       Time in seconds where the reproduction should end, as
 313                     specified in the URL.
 314     chapters:       A list of dictionaries, with the following entries:
 315                         * "start_time" - The start time of the chapter in seconds
 316                         * "end_time" - The end time of the chapter in seconds
 317                         * "title" (optional, string)
 318     playable_in_embed: Whether this video is allowed to play in embedded
 319                     players on other sites. Can be True (=always allowed),
 320                     False (=never allowed), None (=unknown), or a string
                    specifying the criteria for embeddability (e.g. 'whitelist')
 322     availability:   Under what condition the video is available. One of
 323                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 324                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 325                     to set it
 326     __post_extractor: A function to be called just before the metadata is
 327                     written to either disk, logger or console. The function
 328                     must return a dict which will be added to the info_dict.
                    This is useful for additional information that is
 330                     time-consuming to extract. Note that the fields thus
 331                     extracted will not be available to output template and
 332                     match_filter. So, only "comments" and "comment_count" are
 333                     currently allowed to be extracted via this method.
 334
 335     The following fields should only be used when the video belongs to some logical
 336     chapter or section:
 337
 338     chapter:        Name or title of the chapter the video belongs to.
 339     chapter_number: Number of the chapter the video belongs to, as an integer.
 340     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 341
 342     The following fields should only be used when the video is an episode of some
 343     series, programme or podcast:
 344
 345     series:         Title of the series or programme the video episode belongs to.
 346     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 347     season:         Title of the season the video episode belongs to.
 348     season_number:  Number of the season the video episode belongs to, as an integer.
 349     season_id:      Id of the season the video episode belongs to, as a unicode string.
 350     episode:        Title of the video episode. Unlike mandatory video title field,
 351                     this field should denote the exact title of the video episode
 352                     without any kind of decoration.
 353     episode_number: Number of the video episode within a season, as an integer.
 354     episode_id:     Id of the video episode, as a unicode string.
 355
 356     The following fields should only be used when the media is a track or a part of
 357     a music album:
 358
 359     track:          Title of the track.
 360     track_number:   Number of the track within an album or a disc, as an integer.
 361     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 362                     as a unicode string.
 363     artist:         Artist(s) of the track.
 364     genre:          Genre(s) of the track.
 365     album:          Title of the album the track belongs to.
 366     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 367     album_artist:   List of all artists appeared on the album (e.g.
 368                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 369                     and compilations).
 370     disc_number:    Number of the disc or other physical medium the track belongs to,
 371                     as an integer.
 372     release_year:   Year (YYYY) when the album was released.
 373
 374     Unless mentioned otherwise, the fields should be Unicode strings.
 375
 376     Unless mentioned otherwise, None is equivalent to absence of information.
 377
 378
 379     _type "playlist" indicates multiple videos.
 380     There must be a key "entries", which is a list, an iterable, or a PagedList
 381     object, each element of which is a valid dictionary by this specification.
 382
    Additionally, playlists can have "id", "title", and any other relevant
 384     attributes with the same semantics as videos (see above).
 385
 386
 387     _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
 389     It must have an entries key like a playlist and contain all the keys
 390     required for a video at the same time.
 391
 392
 393     _type "url" indicates that the video must be extracted from another
 394     location, possibly by a different extractor. Its only required key is:
 395     "url" - the next URL to extract.
 396     The key "ie_key" can be set to the class name (minus the trailing "IE",
 397     e.g. "Youtube") if the extractor class is known in advance.
 398     Additionally, the dictionary may have any properties of the resolved entity
 399     known in advance, for example "title" if the title of the referred video is
 400     known ahead of time.
 401
 402
 403     _type "url_transparent" entities have the same specification as "url", but
 404     indicate that the given additional information is more precise than the one
 405     associated with the resolved URL.
 406     This is useful when a site employs a video service that hosts the video and
 407     its technical metadata, but that video service does not embed a useful
 408     title, description etc.
 409
 410
 411     Subclasses of this one should re-define the _real_initialize() and
 412     _real_extract() methods and define a _VALID_URL regexp.
 413     Probably, they should also be added to the list of extractors.
 414
 415     Subclasses may also override suitable() if necessary, but ensure the function
 416     signature is preserved and that this function imports everything it needs
 417     (except other extractors), so that lazy_extractors works correctly
 418
 419     _GEO_BYPASS attribute may be set to False in order to disable
 420     geo restriction bypass mechanisms for a particular extractor.
 421     Though it won't disable explicit geo restriction bypass based on
 422     country code provided with geo_bypass_country.
 423
 424     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 425     countries for this extractor. One of these countries will be used by
 426     geo restriction bypass mechanism right away in order to bypass
 427     geo restriction, of course, if the mechanism is not disabled.
 428
 429     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 430     IP blocks in CIDR notation for this extractor. One of these IP blocks
 431     will be used by geo restriction bypass mechanism similarly
 432     to _GEO_COUNTRIES.
 433
 434     The _WORKING attribute should be set to False for broken IEs
 435     in order to warn the users and skip the tests.
 436     """
 437
    # Becomes True once initialize() has run _real_initialize() for the instance
    _ready = False
    # The downloader (a YoutubeDL instance) this extractor is attached to
    _downloader = None
    # Fake IP used for the X-Forwarded-For geo bypass; None while no IP is chosen
    _x_forwarded_for_ip = None
    # Geo restriction bypass configuration; each is described in the class docstring
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    # Set to False for broken IEs; read back through working()
    _WORKING = True

    # Hints on how to provide account credentials, keyed by login method
    # NOTE(review): the code that displays these hints is not in this part of
    # the file - confirm how the keys are selected
    _LOGIN_HINTS = {
        'any': 'Use --cookies, --username and --password, or --netrc to provide account credentials',
        'cookies': (
            'Use --cookies-from-browser or --cookies for the authentication. '
            'See  https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl  for how to manually pass cookies'),
        'password': 'Use --username and --password, or --netrc to provide account credentials',
    }
 453
 454     def __init__(self, downloader=None):
 455         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 456         If a downloader is not passed during initialization,
 457         it must be set using "set_downloader()" before "extract()" is called"""
 458         self._ready = False
 459         self._x_forwarded_for_ip = None
 460         self._printed_messages = set()
 461         self.set_downloader(downloader)
 462
 463     @classmethod
 464     def _match_valid_url(cls, url):
 465         # This does not use has/getattr intentionally - we want to know whether
 466         # we have cached the regexp for *this* class, whereas getattr would also
 467         # match the superclass
 468         if '_VALID_URL_RE' not in cls.__dict__:
 469             if '_VALID_URL' not in cls.__dict__:
 470                 cls._VALID_URL = cls._make_valid_url()
 471             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 472         return cls._VALID_URL_RE.match(url)
 473
 474     @classmethod
 475     def suitable(cls, url):
 476         """Receives a URL and returns True if suitable for this IE."""
 477         # This function must import everything it needs (except other extractors),
 478         # so that lazy_extractors works correctly
 479         return cls._match_valid_url(url) is not None
 480
 481     @classmethod
 482     def _match_id(cls, url):
 483         return cls._match_valid_url(url).group('id')
 484
 485     @classmethod
 486     def get_temp_id(cls, url):
 487         try:
 488             return cls._match_id(url)
 489         except (IndexError, AttributeError):
 490             return None
 491
 492     @classmethod
 493     def working(cls):
 494         """Getter method for _WORKING."""
 495         return cls._WORKING
 496
 497     def initialize(self):
 498         """Initializes an instance (authentication, etc)."""
 499         self._printed_messages = set()
 500         self._initialize_geo_bypass({
 501             'countries': self._GEO_COUNTRIES,
 502             'ip_blocks': self._GEO_IP_BLOCKS,
 503         })
 504         if not self._ready:
 505             self._real_initialize()
 506             self._ready = True
 507
 508     def _initialize_geo_bypass(self, geo_bypass_context):
 509         """
 510         Initialize geo restriction bypass mechanism.
 511
 512         This method is used to initialize geo bypass mechanism based on faking
 513         X-Forwarded-For HTTP header. A random country from provided country list
 514         is selected and a random IP belonging to this country is generated. This
 515         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 516         HTTP requests.
 517
 518         This method will be used for initial geo bypass mechanism initialization
 519         during the instance initialization with _GEO_COUNTRIES and
 520         _GEO_IP_BLOCKS.
 521
 522         You may also manually call it from extractor's code if geo bypass
 523         information is not available beforehand (e.g. obtained during
 524         extraction) or due to some other reason. In this case you should pass
 525         this information in geo bypass context passed as first argument. It may
 526         contain following fields:
 527
 528         countries:  List of geo unrestricted countries (similar
 529                     to _GEO_COUNTRIES)
 530         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 531                     (similar to _GEO_IP_BLOCKS)
 532
 533         """
 534         if not self._x_forwarded_for_ip:
 535
 536             # Geo bypass mechanism is explicitly disabled by user
 537             if not self.get_param('geo_bypass', True):
 538                 return
 539
 540             if not geo_bypass_context:
 541                 geo_bypass_context = {}
 542
 543             # Backward compatibility: previously _initialize_geo_bypass
 544             # expected a list of countries, some 3rd party code may still use
 545             # it this way
 546             if isinstance(geo_bypass_context, (list, tuple)):
 547                 geo_bypass_context = {
 548                     'countries': geo_bypass_context,
 549                 }
 550
 551             # The whole point of geo bypass mechanism is to fake IP
 552             # as X-Forwarded-For HTTP header based on some IP block or
 553             # country code.
 554
 555             # Path 1: bypassing based on IP block in CIDR notation
 556
 557             # Explicit IP block specified by user, use it right away
 558             # regardless of whether extractor is geo bypassable or not
 559             ip_block = self.get_param('geo_bypass_ip_block', None)
 560
 561             # Otherwise use random IP block from geo bypass context but only
 562             # if extractor is known as geo bypassable
 563             if not ip_block:
 564                 ip_blocks = geo_bypass_context.get('ip_blocks')
 565                 if self._GEO_BYPASS and ip_blocks:
 566                     ip_block = random.choice(ip_blocks)
 567
 568             if ip_block:
 569                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 570                 self._downloader.write_debug(
 571                     '[debug] Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
 572                 return
 573
 574             # Path 2: bypassing based on country code
 575
 576             # Explicit country code specified by user, use it right away
 577             # regardless of whether extractor is geo bypassable or not
 578             country = self.get_param('geo_bypass_country', None)
 579
 580             # Otherwise use random country code from geo bypass context but
 581             # only if extractor is known as geo bypassable
 582             if not country:
 583                 countries = geo_bypass_context.get('countries')
 584                 if self._GEO_BYPASS and countries:
 585                     country = random.choice(countries)
 586
 587             if country:
 588                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 589                 self._downloader.write_debug(
 590                     'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))
 591
    def extract(self, url):
        """Extract the information for url and return the result of _real_extract().

        Returns the value produced by _real_extract() (a dict-like result as
        described in the class docstring), or None when _real_extract()
        returned None. When a fake X-Forwarded-For IP is in use, it is stored
        in the result under "__x_forwarded_for_ip". "live_chat" subtitles are
        dropped when "no-live-chat" is in the "compat_opts" param.

        Raises ExtractorError (or the subclass that was originally raised)
        carrying the video id, IE name and traceback. IncompleteRead, KeyError
        and StopIteration are wrapped into ExtractorError as well, while
        UnsupportedError is passed through unchanged.
        """
        try:
            # At most two attempts: the second one happens only if the first
            # hit a geo restriction and a fake IP could be set up in response
            for _ in range(2):
                try:
                    self.initialize()
                    self.write_debug('Extracting URL: %s' % url)
                    ie_result = self._real_extract(url)
                    if ie_result is None:
                        return None
                    if self._x_forwarded_for_ip:
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    subtitles = ie_result.get('subtitles')
                    if (subtitles and 'live_chat' in subtitles
                            and 'no-live-chat' in self.get_param('compat_opts', [])):
                        del subtitles['live_chat']
                    return ie_result
                except GeoRestrictedError as e:
                    # A truthy result means a retry is worthwhile; otherwise
                    # the geo restriction error is propagated
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        # Listed before ExtractorError so that it is re-raised as is instead of
        # being rebuilt by the handler below
        except UnsupportedError:
            raise
        except ExtractorError as e:
            # Re-create the error with the same type and message, filling in
            # the details that are known at this level
            kwargs = {
                'video_id': e.video_id or self.get_temp_id(url),
                'ie': self.IE_NAME,
                'tb': e.traceback or sys.exc_info()[2],
                'expected': e.expected,
                'cause': e.cause
            }
            # Only some errors carry a country list (GeoRestrictedError is
            # read that way above)
            if hasattr(e, 'countries'):
                kwargs['countries'] = e.countries
            raise type(e)(e.msg, **kwargs)
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
        except (KeyError, StopIteration) as e:
            # Unlike the network error above, this is not flagged as expected
            raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 630
 631     def __maybe_fake_ip_and_retry(self, countries):
 632         if (not self.get_param('geo_bypass_country', None)
 633                 and self._GEO_BYPASS
 634                 and self.get_param('geo_bypass', True)
 635                 and not self._x_forwarded_for_ip
 636                 and countries):
 637             country_code = random.choice(countries)
 638             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 639             if self._x_forwarded_for_ip:
 640                 self.report_warning(
 641                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 642                     % (self._x_forwarded_for_ip, country_code.upper()))
 643                 return True
 644         return False
 645
 646     def set_downloader(self, downloader):
 647         """Sets the downloader for this IE."""
 648         self._downloader = downloader
 649
 650     def _real_initialize(self):
 651         """Real initialization process. Redefine in subclasses."""
 652         pass
 653
 654     def _real_extract(self, url):
 655         """Real extraction process. Redefine in subclasses."""
 656         pass
 657
 658     @classmethod
 659     def ie_key(cls):
 660         """A string for getting the InfoExtractor with get_info_extractor"""
 661         return cls.__name__[:-2]
 662
 663     @property
 664     def IE_NAME(self):
 665         return compat_str(type(self).__name__[:-2])
 666
 667     @staticmethod
 668     def __can_accept_status_code(err, expected_status):
 669         assert isinstance(err, compat_urllib_error.HTTPError)
 670         if expected_status is None:
 671             return False
 672         elif callable(expected_status):
 673             return expected_status(err.code) is True
 674         else:
 675             return err.code in variadic(expected_status)
 676
 677     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
 678         """
 679         Return the response handle.
 680
 681         See _download_webpage docstring for arguments specification.
 682         """
 683         if not self._downloader._first_webpage_request:
 684             sleep_interval = self.get_param('sleep_interval_requests') or 0
 685             if sleep_interval > 0:
 686                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 687                 time.sleep(sleep_interval)
 688         else:
 689             self._downloader._first_webpage_request = False
 690
 691         if note is None:
 692             self.report_download_webpage(video_id)
 693         elif note is not False:
 694             if video_id is None:
 695                 self.to_screen('%s' % (note,))
 696             else:
 697                 self.to_screen('%s: %s' % (video_id, note))
 698
 699         # Some sites check X-Forwarded-For HTTP header in order to figure out
 700         # the origin of the client behind proxy. This allows bypassing geo
 701         # restriction by faking this header's value to IP that belongs to some
 702         # geo unrestricted country. We will do so once we encounter any
 703         # geo restriction error.
 704         if self._x_forwarded_for_ip:
 705             if 'X-Forwarded-For' not in headers:
 706                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 707
 708         if isinstance(url_or_request, compat_urllib_request.Request):
 709             url_or_request = update_Request(
 710                 url_or_request, data=data, headers=headers, query=query)
 711         else:
 712             if query:
 713                 url_or_request = update_url_query(url_or_request, query)
 714             if data is not None or headers:
 715                 url_or_request = sanitized_Request(url_or_request, data, headers)
 716         try:
 717             return self._downloader.urlopen(url_or_request)
 718         except network_exceptions as err:
 719             if isinstance(err, compat_urllib_error.HTTPError):
 720                 if self.__can_accept_status_code(err, expected_status):
 721                     # Retain reference to error to prevent file object from
 722                     # being closed before it can be read. Works around the
 723                     # effects of <https://bugs.python.org/issue15002>
 724                     # introduced in Python 3.4.1.
 725                     err.fp._error = err
 726                     return err.fp
 727
 728             if errnote is False:
 729                 return False
 730             if errnote is None:
 731                 errnote = 'Unable to download webpage'
 732
 733             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 734             if fatal:
 735                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 736             else:
 737                 self.report_warning(errmsg)
 738                 return False
 739
 740     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
 741         """
 742         Return a tuple (page content as string, URL handle).
 743
 744         See _download_webpage docstring for arguments specification.
 745         """
 746         # Strip hashes from the URL (#1038)
 747         if isinstance(url_or_request, (compat_str, str)):
 748             url_or_request = url_or_request.partition('#')[0]
 749
 750         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 751         if urlh is False:
 752             assert not fatal
 753             return False
 754         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 755         return (content, urlh)
 756
 757     @staticmethod
 758     def _guess_encoding_from_content(content_type, webpage_bytes):
 759         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 760         if m:
 761             encoding = m.group(1)
 762         else:
 763             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 764                           webpage_bytes[:1024])
 765             if m:
 766                 encoding = m.group(1).decode('ascii')
 767             elif webpage_bytes.startswith(b'\xff\xfe'):
 768                 encoding = 'utf-16'
 769             else:
 770                 encoding = 'utf-8'
 771
 772         return encoding
 773
 774     def __check_blocked(self, content):
 775         first_block = content[:512]
 776         if ('<title>Access to this site is blocked</title>' in content
 777                 and 'Websense' in first_block):
 778             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 779             blocked_iframe = self._html_search_regex(
 780                 r'<iframe src="([^"]+)"', content,
 781                 'Websense information URL', default=None)
 782             if blocked_iframe:
 783                 msg += ' Visit %s for more details' % blocked_iframe
 784             raise ExtractorError(msg, expected=True)
 785         if '<title>The URL you requested has been blocked</title>' in first_block:
 786             msg = (
 787                 'Access to this webpage has been blocked by Indian censorship. '
 788                 'Use a VPN or proxy server (with --proxy) to route around it.')
 789             block_msg = self._html_search_regex(
 790                 r'</h1><p>(.*?)</p>',
 791                 content, 'block message', default=None)
 792             if block_msg:
 793                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 794             raise ExtractorError(msg, expected=True)
 795         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 796                 and 'blocklist.rkn.gov.ru' in content):
 797             raise ExtractorError(
 798                 'Access to this webpage has been blocked by decision of the Russian government. '
 799                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 800                 expected=True)
 801
 802     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 803         content_type = urlh.headers.get('Content-Type', '')
 804         webpage_bytes = urlh.read()
 805         if prefix is not None:
 806             webpage_bytes = prefix + webpage_bytes
 807         if not encoding:
 808             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 809         if self.get_param('dump_intermediate_pages', False):
 810             self.to_screen('Dumping request to ' + urlh.geturl())
 811             dump = base64.b64encode(webpage_bytes).decode('ascii')
 812             self._downloader.to_screen(dump)
 813         if self.get_param('write_pages', False):
 814             basen = '%s_%s' % (video_id, urlh.geturl())
 815             trim_length = self.get_param('trim_file_name') or 240
 816             if len(basen) > trim_length:
 817                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 818                 basen = basen[:trim_length - len(h)] + h
 819             raw_filename = basen + '.dump'
 820             filename = sanitize_filename(raw_filename, restricted=True)
 821             self.to_screen('Saving request to ' + filename)
 822             # Working around MAX_PATH limitation on Windows (see
 823             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 824             if compat_os_name == 'nt':
 825                 absfilepath = os.path.abspath(filename)
 826                 if len(absfilepath) > 259:
 827                     filename = '\\\\?\\' + absfilepath
 828             with open(filename, 'wb') as outf:
 829                 outf.write(webpage_bytes)
 830
 831         try:
 832             content = webpage_bytes.decode(encoding, 'replace')
 833         except LookupError:
 834             content = webpage_bytes.decode('utf-8', 'replace')
 835
 836         self.__check_blocked(content)
 837
 838         return content
 839
 840     def _download_webpage(
 841             self, url_or_request, video_id, note=None, errnote=None,
 842             fatal=True, tries=1, timeout=5, encoding=None, data=None,
 843             headers={}, query={}, expected_status=None):
 844         """
 845         Return the data of the page as a string.
 846
 847         Arguments:
 848         url_or_request -- plain text URL as a string or
 849             a compat_urllib_request.Requestobject
 850         video_id -- Video/playlist/item identifier (string)
 851
 852         Keyword arguments:
 853         note -- note printed before downloading (string)
 854         errnote -- note printed in case of an error (string)
 855         fatal -- flag denoting whether error should be considered fatal,
 856             i.e. whether it should cause ExtractionError to be raised,
 857             otherwise a warning will be reported and extraction continued
 858         tries -- number of tries
 859         timeout -- sleep interval between tries
 860         encoding -- encoding for a page content decoding, guessed automatically
 861             when not explicitly specified
 862         data -- POST data (bytes)
 863         headers -- HTTP headers (dict)
 864         query -- URL query (dict)
 865         expected_status -- allows to accept failed HTTP requests (non 2xx
 866             status code) by explicitly specifying a set of accepted status
 867             codes. Can be any of the following entities:
 868                 - an integer type specifying an exact failed status code to
 869                   accept
 870                 - a list or a tuple of integer types specifying a list of
 871                   failed status codes to accept
 872                 - a callable accepting an actual failed status code and
 873                   returning True if it should be accepted
 874             Note that this argument does not affect success status codes (2xx)
 875             which are always accepted.
 876         """
 877
 878         success = False
 879         try_count = 0
 880         while success is False:
 881             try:
 882                 res = self._download_webpage_handle(
 883                     url_or_request, video_id, note, errnote, fatal,
 884                     encoding=encoding, data=data, headers=headers, query=query,
 885                     expected_status=expected_status)
 886                 success = True
 887             except compat_http_client.IncompleteRead as e:
 888                 try_count += 1
 889                 if try_count >= tries:
 890                     raise e
 891                 self._sleep(timeout, video_id)
 892         if res is False:
 893             return res
 894         else:
 895             content, _ = res
 896             return content
 897
 898     def _download_xml_handle(
 899             self, url_or_request, video_id, note='Downloading XML',
 900             errnote='Unable to download XML', transform_source=None,
 901             fatal=True, encoding=None, data=None, headers={}, query={},
 902             expected_status=None):
 903         """
 904         Return a tuple (xml as an compat_etree_Element, URL handle).
 905
 906         See _download_webpage docstring for arguments specification.
 907         """
 908         res = self._download_webpage_handle(
 909             url_or_request, video_id, note, errnote, fatal=fatal,
 910             encoding=encoding, data=data, headers=headers, query=query,
 911             expected_status=expected_status)
 912         if res is False:
 913             return res
 914         xml_string, urlh = res
 915         return self._parse_xml(
 916             xml_string, video_id, transform_source=transform_source,
 917             fatal=fatal), urlh
 918
 919     def _download_xml(
 920             self, url_or_request, video_id,
 921             note='Downloading XML', errnote='Unable to download XML',
 922             transform_source=None, fatal=True, encoding=None,
 923             data=None, headers={}, query={}, expected_status=None):
 924         """
 925         Return the xml as an compat_etree_Element.
 926
 927         See _download_webpage docstring for arguments specification.
 928         """
 929         res = self._download_xml_handle(
 930             url_or_request, video_id, note=note, errnote=errnote,
 931             transform_source=transform_source, fatal=fatal, encoding=encoding,
 932             data=data, headers=headers, query=query,
 933             expected_status=expected_status)
 934         return res if res is False else res[0]
 935
 936     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 937         if transform_source:
 938             xml_string = transform_source(xml_string)
 939         try:
 940             return compat_etree_fromstring(xml_string.encode('utf-8'))
 941         except compat_xml_parse_error as ve:
 942             errmsg = '%s: Failed to parse XML ' % video_id
 943             if fatal:
 944                 raise ExtractorError(errmsg, cause=ve)
 945             else:
 946                 self.report_warning(errmsg + str(ve))
 947
 948     def _download_json_handle(
 949             self, url_or_request, video_id, note='Downloading JSON metadata',
 950             errnote='Unable to download JSON metadata', transform_source=None,
 951             fatal=True, encoding=None, data=None, headers={}, query={},
 952             expected_status=None):
 953         """
 954         Return a tuple (JSON object, URL handle).
 955
 956         See _download_webpage docstring for arguments specification.
 957         """
 958         res = self._download_webpage_handle(
 959             url_or_request, video_id, note, errnote, fatal=fatal,
 960             encoding=encoding, data=data, headers=headers, query=query,
 961             expected_status=expected_status)
 962         if res is False:
 963             return res
 964         json_string, urlh = res
 965         return self._parse_json(
 966             json_string, video_id, transform_source=transform_source,
 967             fatal=fatal), urlh
 968
 969     def _download_json(
 970             self, url_or_request, video_id, note='Downloading JSON metadata',
 971             errnote='Unable to download JSON metadata', transform_source=None,
 972             fatal=True, encoding=None, data=None, headers={}, query={},
 973             expected_status=None):
 974         """
 975         Return the JSON object as a dict.
 976
 977         See _download_webpage docstring for arguments specification.
 978         """
 979         res = self._download_json_handle(
 980             url_or_request, video_id, note=note, errnote=errnote,
 981             transform_source=transform_source, fatal=fatal, encoding=encoding,
 982             data=data, headers=headers, query=query,
 983             expected_status=expected_status)
 984         return res if res is False else res[0]
 985
 986     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 987         if transform_source:
 988             json_string = transform_source(json_string)
 989         try:
 990             return json.loads(json_string)
 991         except ValueError as ve:
 992             errmsg = '%s: Failed to parse JSON ' % video_id
 993             if fatal:
 994                 raise ExtractorError(errmsg, cause=ve)
 995             else:
 996                 self.report_warning(errmsg + str(ve))
 997
 998     def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
 999         return self._parse_json(
1000             data[data.find('{'):data.rfind('}') + 1],
1001             video_id, transform_source, fatal)
1002
1003     def _download_socket_json_handle(
1004             self, url_or_request, video_id, note='Polling socket',
1005             errnote='Unable to poll socket', transform_source=None,
1006             fatal=True, encoding=None, data=None, headers={}, query={},
1007             expected_status=None):
1008         """
1009         Return a tuple (JSON object, URL handle).
1010
1011         See _download_webpage docstring for arguments specification.
1012         """
1013         res = self._download_webpage_handle(
1014             url_or_request, video_id, note, errnote, fatal=fatal,
1015             encoding=encoding, data=data, headers=headers, query=query,
1016             expected_status=expected_status)
1017         if res is False:
1018             return res
1019         webpage, urlh = res
1020         return self._parse_socket_response_as_json(
1021             webpage, video_id, transform_source=transform_source,
1022             fatal=fatal), urlh
1023
1024     def _download_socket_json(
1025             self, url_or_request, video_id, note='Polling socket',
1026             errnote='Unable to poll socket', transform_source=None,
1027             fatal=True, encoding=None, data=None, headers={}, query={},
1028             expected_status=None):
1029         """
1030         Return the JSON object as a dict.
1031
1032         See _download_webpage docstring for arguments specification.
1033         """
1034         res = self._download_socket_json_handle(
1035             url_or_request, video_id, note=note, errnote=errnote,
1036             transform_source=transform_source, fatal=fatal, encoding=encoding,
1037             data=data, headers=headers, query=query,
1038             expected_status=expected_status)
1039         return res if res is False else res[0]
1040
1041     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1042         idstr = format_field(video_id, template='%s: ')
1043         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1044         if only_once:
1045             if f'WARNING: {msg}' in self._printed_messages:
1046                 return
1047             self._printed_messages.add(f'WARNING: {msg}')
1048         self._downloader.report_warning(msg, *args, **kwargs)
1049
1050     def to_screen(self, msg, *args, **kwargs):
1051         """Print msg to screen, prefixing it with '[ie_name]'"""
1052         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1053
1054     def write_debug(self, msg, *args, **kwargs):
1055         self._downloader.write_debug('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1056
1057     def get_param(self, name, default=None, *args, **kwargs):
1058         if self._downloader:
1059             return self._downloader.params.get(name, default, *args, **kwargs)
1060         return default
1061
1062     def report_drm(self, video_id, partial=False):
1063         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1064
1065     def report_extraction(self, id_or_name):
1066         """Report information extraction."""
1067         self.to_screen('%s: Extracting information' % id_or_name)
1068
1069     def report_download_webpage(self, video_id):
1070         """Report webpage download."""
1071         self.to_screen('%s: Downloading webpage' % video_id)
1072
1073     def report_age_confirmation(self):
1074         """Report attempt to confirm age."""
1075         self.to_screen('Confirming age')
1076
1077     def report_login(self):
1078         """Report attempt to log in."""
1079         self.to_screen('Logging in')
1080
1081     def raise_login_required(
1082             self, msg='This video is only available for registered users',
1083             metadata_available=False, method='any'):
1084         if metadata_available and (
1085                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1086             self.report_warning(msg)
1087         if method is not None:
1088             msg = '%s. %s' % (msg, self._LOGIN_HINTS[method])
1089         raise ExtractorError(msg, expected=True)
1090
1091     def raise_geo_restricted(
1092             self, msg='This video is not available from your location due to geo restriction',
1093             countries=None, metadata_available=False):
1094         if metadata_available and (
1095                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1096             self.report_warning(msg)
1097         else:
1098             raise GeoRestrictedError(msg, countries=countries)
1099
1100     def raise_no_formats(self, msg, expected=False, video_id=None):
1101         if expected and (
1102                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1103             self.report_warning(msg, video_id)
1104         elif isinstance(msg, ExtractorError):
1105             raise msg
1106         else:
1107             raise ExtractorError(msg, expected=expected, video_id=video_id)
1108
1109     # Methods for following #608
1110     @staticmethod
1111     def url_result(url, ie=None, video_id=None, video_title=None, **kwargs):
1112         """Returns a URL that points to a page that should be processed"""
1113         # TODO: ie should be the class used for getting the info
1114         video_info = {'_type': 'url',
1115                       'url': url,
1116                       'ie_key': ie}
1117         video_info.update(kwargs)
1118         if video_id is not None:
1119             video_info['id'] = video_id
1120         if video_title is not None:
1121             video_info['title'] = video_title
1122         return video_info
1123
1124     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
1125         urls = orderedSet(
1126             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
1127             for m in matches)
1128         return self.playlist_result(
1129             urls, playlist_id=playlist_id, playlist_title=playlist_title)
1130
1131     @staticmethod
1132     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
1133         """Returns a playlist"""
1134         video_info = {'_type': 'playlist',
1135                       'entries': entries}
1136         video_info.update(kwargs)
1137         if playlist_id:
1138             video_info['id'] = playlist_id
1139         if playlist_title:
1140             video_info['title'] = playlist_title
1141         if playlist_description is not None:
1142             video_info['description'] = playlist_description
1143         return video_info
1144
1145     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1146         """
1147         Perform a regex search on the given string, using a single or a list of
1148         patterns returning the first matching group.
1149         In case of failure return a default value or raise a WARNING or a
1150         RegexNotFoundError, depending on fatal, specifying the field name.
1151         """
1152         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
1153             mobj = re.search(pattern, string, flags)
1154         else:
1155             for p in pattern:
1156                 mobj = re.search(p, string, flags)
1157                 if mobj:
1158                     break
1159
1160         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1161
1162         if mobj:
1163             if group is None:
1164                 # return the first matching group
1165                 return next(g for g in mobj.groups() if g is not None)
1166             elif isinstance(group, (list, tuple)):
1167                 return tuple(mobj.group(g) for g in group)
1168             else:
1169                 return mobj.group(group)
1170         elif default is not NO_DEFAULT:
1171             return default
1172         elif fatal:
1173             raise RegexNotFoundError('Unable to extract %s' % _name)
1174         else:
1175             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1176             return None
1177
1178     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1179         """
1180         Like _search_regex, but strips HTML tags and unescapes entities.
1181         """
1182         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1183         if res:
1184             return clean_html(res).strip()
1185         else:
1186             return res
1187
1188     def _get_netrc_login_info(self, netrc_machine=None):
1189         username = None
1190         password = None
1191         netrc_machine = netrc_machine or self._NETRC_MACHINE
1192
1193         if self.get_param('usenetrc', False):
1194             try:
1195                 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1196                 if os.path.isdir(netrc_file):
1197                     netrc_file = os.path.join(netrc_file, '.netrc')
1198                 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1199                 if info is not None:
1200                     username = info[0]
1201                     password = info[2]
1202                 else:
1203                     raise netrc.NetrcParseError(
1204                         'No authenticators for %s' % netrc_machine)
1205             except (IOError, netrc.NetrcParseError) as err:
1206                 self.report_warning(
1207                     'parsing .netrc: %s' % error_to_compat_str(err))
1208
1209         return username, password
1210
1211     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1212         """
1213         Get the login info as (username, password)
1214         First look for the manually specified credentials using username_option
1215         and password_option as keys in params dictionary. If no such credentials
1216         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1217         value.
1218         If there's no info available, return (None, None)
1219         """
1220
1221         # Attempt to use provided username and password or .netrc data
1222         username = self.get_param(username_option)
1223         if username is not None:
1224             password = self.get_param(password_option)
1225         else:
1226             username, password = self._get_netrc_login_info(netrc_machine)
1227
1228         return username, password
1229
1230     def _get_tfa_info(self, note='two-factor verification code'):
1231         """
1232         Get the two-factor authentication info
1233         TODO - asking the user will be required for sms/phone verify
1234         currently just uses the command line option
1235         If there's no info available, return None
1236         """
1237
1238         tfa = self.get_param('twofactor')
1239         if tfa is not None:
1240             return tfa
1241
1242         return compat_getpass('Type %s and press [Return]: ' % note)
1243
1244     # Helper functions for extracting OpenGraph info
1245     @staticmethod
1246     def _og_regexes(prop):
1247         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1248         property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
1249                        % {'prop': re.escape(prop)})
1250         template = r'<meta[^>]+?%s[^>]+?%s'
1251         return [
1252             template % (property_re, content_re),
1253             template % (content_re, property_re),
1254         ]
1255
1256     @staticmethod
1257     def _meta_regex(prop):
1258         return r'''(?isx)<meta
1259                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1260                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1261
1262     def _og_search_property(self, prop, html, name=None, **kargs):
1263         prop = variadic(prop)
1264         if name is None:
1265             name = 'OpenGraph %s' % prop[0]
1266         og_regexes = []
1267         for p in prop:
1268             og_regexes.extend(self._og_regexes(p))
1269         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1270         if escaped is None:
1271             return None
1272         return unescapeHTML(escaped)
1273
1274     def _og_search_thumbnail(self, html, **kargs):
1275         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1276
1277     def _og_search_description(self, html, **kargs):
1278         return self._og_search_property('description', html, fatal=False, **kargs)
1279
1280     def _og_search_title(self, html, **kargs):
1281         return self._og_search_property('title', html, **kargs)
1282
1283     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1284         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1285         if secure:
1286             regexes = self._og_regexes('video:secure_url') + regexes
1287         return self._html_search_regex(regexes, html, name, **kargs)
1288
1289     def _og_search_url(self, html, **kargs):
1290         return self._og_search_property('url', html, **kargs)
1291
1292     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1293         name = variadic(name)
1294         if display_name is None:
1295             display_name = name[0]
1296         return self._html_search_regex(
1297             [self._meta_regex(n) for n in name],
1298             html, display_name, fatal=fatal, group='content', **kwargs)
1299
1300     def _dc_search_uploader(self, html):
1301         return self._html_search_meta('dc.creator', html, 'uploader')
1302
1303     def _rta_search(self, html):
1304         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1305         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1306                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1307                      html):
1308             return 18
1309         return 0
1310
1311     def _media_rating_search(self, html):
1312         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1313         rating = self._html_search_meta('rating', html)
1314
1315         if not rating:
1316             return None
1317
1318         RATING_TABLE = {
1319             'safe for kids': 0,
1320             'general': 8,
1321             '14 years': 14,
1322             'mature': 17,
1323             'restricted': 19,
1324         }
1325         return RATING_TABLE.get(rating.lower())
1326
1327     def _family_friendly_search(self, html):
1328         # See http://schema.org/VideoObject
1329         family_friendly = self._html_search_meta(
1330             'isFamilyFriendly', html, default=None)
1331
1332         if not family_friendly:
1333             return None
1334
1335         RATING_TABLE = {
1336             '1': 0,
1337             'true': 0,
1338             '0': 18,
1339             'false': 18,
1340         }
1341         return RATING_TABLE.get(family_friendly.lower())
1342
1343     def _twitter_search_player(self, html):
1344         return self._html_search_meta('twitter:player', html,
1345                                       'twitter card player')
1346
1347     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1348         json_ld_list = list(re.finditer(JSON_LD_RE, html))
1349         default = kwargs.get('default', NO_DEFAULT)
1350         # JSON-LD may be malformed and thus `fatal` should be respected.
1351         # At the same time `default` may be passed that assumes `fatal=False`
1352         # for _search_regex. Let's simulate the same behavior here as well.
1353         fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
1354         json_ld = []
1355         for mobj in json_ld_list:
1356             json_ld_item = self._parse_json(
1357                 mobj.group('json_ld'), video_id, fatal=fatal)
1358             if not json_ld_item:
1359                 continue
1360             if isinstance(json_ld_item, dict):
1361                 json_ld.append(json_ld_item)
1362             elif isinstance(json_ld_item, (list, tuple)):
1363                 json_ld.extend(json_ld_item)
1364         if json_ld:
1365             json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1366         if json_ld:
1367             return json_ld
1368         if default is not NO_DEFAULT:
1369             return default
1370         elif fatal:
1371             raise RegexNotFoundError('Unable to extract JSON-LD')
1372         else:
1373             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1374             return {}
1375
1376     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1377         if isinstance(json_ld, compat_str):
1378             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1379         if not json_ld:
1380             return {}
1381         info = {}
1382         if not isinstance(json_ld, (list, tuple, dict)):
1383             return info
1384         if isinstance(json_ld, dict):
1385             json_ld = [json_ld]
1386
1387         INTERACTION_TYPE_MAP = {
1388             'CommentAction': 'comment',
1389             'AgreeAction': 'like',
1390             'DisagreeAction': 'dislike',
1391             'LikeAction': 'like',
1392             'DislikeAction': 'dislike',
1393             'ListenAction': 'view',
1394             'WatchAction': 'view',
1395             'ViewAction': 'view',
1396         }
1397
1398         def extract_interaction_type(e):
1399             interaction_type = e.get('interactionType')
1400             if isinstance(interaction_type, dict):
1401                 interaction_type = interaction_type.get('@type')
1402             return str_or_none(interaction_type)
1403
1404         def extract_interaction_statistic(e):
1405             interaction_statistic = e.get('interactionStatistic')
1406             if isinstance(interaction_statistic, dict):
1407                 interaction_statistic = [interaction_statistic]
1408             if not isinstance(interaction_statistic, list):
1409                 return
1410             for is_e in interaction_statistic:
1411                 if not isinstance(is_e, dict):
1412                     continue
1413                 if is_e.get('@type') != 'InteractionCounter':
1414                     continue
1415                 interaction_type = extract_interaction_type(is_e)
1416                 if not interaction_type:
1417                     continue
1418                 # For interaction count some sites provide string instead of
1419                 # an integer (as per spec) with non digit characters (e.g. ",")
1420                 # so extracting count with more relaxed str_to_int
1421                 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1422                 if interaction_count is None:
1423                     continue
1424                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1425                 if not count_kind:
1426                     continue
1427                 count_key = '%s_count' % count_kind
1428                 if info.get(count_key) is not None:
1429                     continue
1430                 info[count_key] = interaction_count
1431
1432         def extract_video_object(e):
1433             assert e['@type'] == 'VideoObject'
1434             author = e.get('author')
1435             info.update({
1436                 'url': url_or_none(e.get('contentUrl')),
1437                 'title': unescapeHTML(e.get('name')),
1438                 'description': unescapeHTML(e.get('description')),
1439                 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
1440                 'duration': parse_duration(e.get('duration')),
1441                 'timestamp': unified_timestamp(e.get('uploadDate')),
1442                 # author can be an instance of 'Organization' or 'Person' types.
1443                 # both types can have 'name' property(inherited from 'Thing' type). [1]
1444                 # however some websites are using 'Text' type instead.
1445                 # 1. https://schema.org/VideoObject
1446                 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
1447                 'filesize': float_or_none(e.get('contentSize')),
1448                 'tbr': int_or_none(e.get('bitrate')),
1449                 'width': int_or_none(e.get('width')),
1450                 'height': int_or_none(e.get('height')),
1451                 'view_count': int_or_none(e.get('interactionCount')),
1452             })
1453             extract_interaction_statistic(e)
1454
1455         def traverse_json_ld(json_ld, at_top_level=True):
1456             for e in json_ld:
1457                 if at_top_level and '@context' not in e:
1458                     continue
1459                 if at_top_level and set(e.keys()) == {'@context', '@graph'}:
1460                     traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
1461                     break
1462                 item_type = e.get('@type')
1463                 if expected_type is not None and expected_type != item_type:
1464                     continue
1465                 rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
1466                 if rating is not None:
1467                     info['average_rating'] = rating
1468                 if item_type in ('TVEpisode', 'Episode'):
1469                     episode_name = unescapeHTML(e.get('name'))
1470                     info.update({
1471                         'episode': episode_name,
1472                         'episode_number': int_or_none(e.get('episodeNumber')),
1473                         'description': unescapeHTML(e.get('description')),
1474                     })
1475                     if not info.get('title') and episode_name:
1476                         info['title'] = episode_name
1477                     part_of_season = e.get('partOfSeason')
1478                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1479                         info.update({
1480                             'season': unescapeHTML(part_of_season.get('name')),
1481                             'season_number': int_or_none(part_of_season.get('seasonNumber')),
1482                         })
1483                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1484                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1485                         info['series'] = unescapeHTML(part_of_series.get('name'))
1486                 elif item_type == 'Movie':
1487                     info.update({
1488                         'title': unescapeHTML(e.get('name')),
1489                         'description': unescapeHTML(e.get('description')),
1490                         'duration': parse_duration(e.get('duration')),
1491                         'timestamp': unified_timestamp(e.get('dateCreated')),
1492                     })
1493                 elif item_type in ('Article', 'NewsArticle'):
1494                     info.update({
1495                         'timestamp': parse_iso8601(e.get('datePublished')),
1496                         'title': unescapeHTML(e.get('headline')),
1497                         'description': unescapeHTML(e.get('articleBody') or e.get('description')),
1498                     })
1499                 elif item_type == 'VideoObject':
1500                     extract_video_object(e)
1501                     if expected_type is None:
1502                         continue
1503                     else:
1504                         break
1505                 video = e.get('video')
1506                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1507                     extract_video_object(video)
1508                 if expected_type is None:
1509                     continue
1510                 else:
1511                     break
1512         traverse_json_ld(json_ld)
1513
1514         return dict((k, v) for k, v in info.items() if v is not None)
1515
1516     def _search_nextjs_data(self, webpage, video_id, **kw):
1517         return self._parse_json(
1518             self._search_regex(
1519                 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1520                 webpage, 'next.js data', **kw),
1521             video_id, **kw)
1522
1523     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__'):
1524         ''' Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function. '''
1525         # not all website do this, but it can be changed
1526         # https://stackoverflow.com/questions/67463109/how-to-change-or-hide-nuxt-and-nuxt-keyword-in-page-source
1527         rectx = re.escape(context_name)
1528         js, arg_keys, arg_vals = self._search_regex(
1529             (r'<script>window\.%s=\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.+?)\)\);?</script>' % rectx,
1530              r'%s\(.*?\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)' % rectx),
1531             webpage, context_name, group=['js', 'arg_keys', 'arg_vals'])
1532
1533         args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
1534
1535         for key, val in args.items():
1536             if val in ('undefined', 'void 0'):
1537                 args[key] = 'null'
1538
1539         return self._parse_json(js_to_json(js, args), video_id)['data'][0]
1540
1541     @staticmethod
1542     def _hidden_inputs(html):
1543         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1544         hidden_inputs = {}
1545         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1546             attrs = extract_attributes(input)
1547             if not input:
1548                 continue
1549             if attrs.get('type') not in ('hidden', 'submit'):
1550                 continue
1551             name = attrs.get('name') or attrs.get('id')
1552             value = attrs.get('value')
1553             if name and value is not None:
1554                 hidden_inputs[name] = value
1555         return hidden_inputs
1556
1557     def _form_hidden_inputs(self, form_id, html):
1558         form = self._search_regex(
1559             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1560             html, '%s form' % form_id, group='form')
1561         return self._hidden_inputs(form)
1562
1563     class FormatSort:
1564         regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
1565
1566         default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
1567                    'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
1568                    'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
1569         ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
1570                         'height', 'width', 'proto', 'vext', 'abr', 'aext',
1571                         'fps', 'fs_approx', 'source', 'id')
1572
1573         settings = {
1574             'vcodec': {'type': 'ordered', 'regex': True,
1575                        'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1576             'acodec': {'type': 'ordered', 'regex': True,
1577                        'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
1578             'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
1579                     'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
1580             'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1581                       'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
1582             'vext': {'type': 'ordered', 'field': 'video_ext',
1583                      'order': ('mp4', 'webm', 'flv', '', 'none'),
1584                      'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1585             'aext': {'type': 'ordered', 'field': 'audio_ext',
1586                      'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1587                      'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1588             'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1589             'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
1590                            'field': ('vcodec', 'acodec'),
1591                            'function': lambda it: int(any(v != 'none' for v in it))},
1592             'ie_pref': {'priority': True, 'type': 'extractor'},
1593             'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1594             'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1595             'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
1596             'quality': {'convert': 'float', 'default': -1},
1597             'filesize': {'convert': 'bytes'},
1598             'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1599             'id': {'convert': 'string', 'field': 'format_id'},
1600             'height': {'convert': 'float_none'},
1601             'width': {'convert': 'float_none'},
1602             'fps': {'convert': 'float_none'},
1603             'tbr': {'convert': 'float_none'},
1604             'vbr': {'convert': 'float_none'},
1605             'abr': {'convert': 'float_none'},
1606             'asr': {'convert': 'float_none'},
1607             'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
1608
1609             'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1610             'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1611             'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1612             'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1613             'res': {'type': 'multiple', 'field': ('height', 'width'),
1614                     'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
1615
1616             # For compatibility with youtube-dl
1617             'format_id': {'type': 'alias', 'field': 'id'},
1618             'preference': {'type': 'alias', 'field': 'ie_pref'},
1619             'language_preference': {'type': 'alias', 'field': 'lang'},
1620
1621             # Deprecated
1622             'dimension': {'type': 'alias', 'field': 'res'},
1623             'resolution': {'type': 'alias', 'field': 'res'},
1624             'extension': {'type': 'alias', 'field': 'ext'},
1625             'bitrate': {'type': 'alias', 'field': 'br'},
1626             'total_bitrate': {'type': 'alias', 'field': 'tbr'},
1627             'video_bitrate': {'type': 'alias', 'field': 'vbr'},
1628             'audio_bitrate': {'type': 'alias', 'field': 'abr'},
1629             'framerate': {'type': 'alias', 'field': 'fps'},
1630             'protocol': {'type': 'alias', 'field': 'proto'},
1631             'source_preference': {'type': 'alias', 'field': 'source'},
1632             'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1633             'filesize_estimate': {'type': 'alias', 'field': 'size'},
1634             'samplerate': {'type': 'alias', 'field': 'asr'},
1635             'video_ext': {'type': 'alias', 'field': 'vext'},
1636             'audio_ext': {'type': 'alias', 'field': 'aext'},
1637             'video_codec': {'type': 'alias', 'field': 'vcodec'},
1638             'audio_codec': {'type': 'alias', 'field': 'acodec'},
1639             'video': {'type': 'alias', 'field': 'hasvid'},
1640             'has_video': {'type': 'alias', 'field': 'hasvid'},
1641             'audio': {'type': 'alias', 'field': 'hasaud'},
1642             'has_audio': {'type': 'alias', 'field': 'hasaud'},
1643             'extractor': {'type': 'alias', 'field': 'ie_pref'},
1644             'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
1645         }
1646
1647         def __init__(self, ie, field_preference):
1648             self._order = []
1649             self.ydl = ie._downloader
1650             self.evaluate_params(self.ydl.params, field_preference)
1651             if ie.get_param('verbose'):
1652                 self.print_verbose_info(self.ydl.write_debug)
1653
1654         def _get_field_setting(self, field, key):
1655             if field not in self.settings:
1656                 if key in ('forced', 'priority'):
1657                     return False
1658                 self.ydl.deprecation_warning(
1659                     f'Using arbitrary fields ({field}) for format sorting is deprecated '
1660                     'and may be removed in a future version')
1661                 self.settings[field] = {}
1662             propObj = self.settings[field]
1663             if key not in propObj:
1664                 type = propObj.get('type')
1665                 if key == 'field':
1666                     default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1667                 elif key == 'convert':
1668                     default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1669                 else:
1670                     default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1671                 propObj[key] = default
1672             return propObj[key]
1673
1674         def _resolve_field_value(self, field, value, convertNone=False):
1675             if value is None:
1676                 if not convertNone:
1677                     return None
1678             else:
1679                 value = value.lower()
1680             conversion = self._get_field_setting(field, 'convert')
1681             if conversion == 'ignore':
1682                 return None
1683             if conversion == 'string':
1684                 return value
1685             elif conversion == 'float_none':
1686                 return float_or_none(value)
1687             elif conversion == 'bytes':
1688                 return FileDownloader.parse_bytes(value)
1689             elif conversion == 'order':
1690                 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1691                 use_regex = self._get_field_setting(field, 'regex')
1692                 list_length = len(order_list)
1693                 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1694                 if use_regex and value is not None:
1695                     for i, regex in enumerate(order_list):
1696                         if regex and re.match(regex, value):
1697                             return list_length - i
1698                     return list_length - empty_pos  # not in list
1699                 else:  # not regex or  value = None
1700                     return list_length - (order_list.index(value) if value in order_list else empty_pos)
1701             else:
1702                 if value.isnumeric():
1703                     return float(value)
1704                 else:
1705                     self.settings[field]['convert'] = 'string'
1706                     return value
1707
1708         def evaluate_params(self, params, sort_extractor):
1709             self._use_free_order = params.get('prefer_free_formats', False)
1710             self._sort_user = params.get('format_sort', [])
1711             self._sort_extractor = sort_extractor
1712
1713             def add_item(field, reverse, closest, limit_text):
1714                 field = field.lower()
1715                 if field in self._order:
1716                     return
1717                 self._order.append(field)
1718                 limit = self._resolve_field_value(field, limit_text)
1719                 data = {
1720                     'reverse': reverse,
1721                     'closest': False if limit is None else closest,
1722                     'limit_text': limit_text,
1723                     'limit': limit}
1724                 if field in self.settings:
1725                     self.settings[field].update(data)
1726                 else:
1727                     self.settings[field] = data
1728
1729             sort_list = (
1730                 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1731                 + (tuple() if params.get('format_sort_force', False)
1732                    else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1733                 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1734
1735             for item in sort_list:
1736                 match = re.match(self.regex, item)
1737                 if match is None:
1738                     raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1739                 field = match.group('field')
1740                 if field is None:
1741                     continue
1742                 if self._get_field_setting(field, 'type') == 'alias':
1743                     alias, field = field, self._get_field_setting(field, 'field')
1744                     if alias not in ('format_id', 'preference', 'language_preference'):
1745                         self.ydl.deprecation_warning(
1746                             f'Format sorting alias {alias} is deprecated '
1747                             f'and may be removed in a future version. Please use {field} instead')
1748                 reverse = match.group('reverse') is not None
1749                 closest = match.group('separator') == '~'
1750                 limit_text = match.group('limit')
1751
1752                 has_limit = limit_text is not None
1753                 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1754                 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1755
1756                 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1757                 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
1758                 limit_count = len(limits)
1759                 for (i, f) in enumerate(fields):
1760                     add_item(f, reverse, closest,
1761                              limits[i] if i < limit_count
1762                              else limits[0] if has_limit and not has_multiple_limits
1763                              else None)
1764
1765         def print_verbose_info(self, write_debug):
1766             if self._sort_user:
1767                 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1768             if self._sort_extractor:
1769                 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1770             write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1771                 '+' if self._get_field_setting(field, 'reverse') else '', field,
1772                 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1773                               self._get_field_setting(field, 'limit_text'),
1774                               self._get_field_setting(field, 'limit'))
1775                 if self._get_field_setting(field, 'limit_text') is not None else '')
1776                 for field in self._order if self._get_field_setting(field, 'visible')]))
1777
1778         def _calculate_field_preference_from_value(self, format, field, type, value):
1779             reverse = self._get_field_setting(field, 'reverse')
1780             closest = self._get_field_setting(field, 'closest')
1781             limit = self._get_field_setting(field, 'limit')
1782
1783             if type == 'extractor':
1784                 maximum = self._get_field_setting(field, 'max')
1785                 if value is None or (maximum is not None and value >= maximum):
1786                     value = -1
1787             elif type == 'boolean':
1788                 in_list = self._get_field_setting(field, 'in_list')
1789                 not_in_list = self._get_field_setting(field, 'not_in_list')
1790                 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1791             elif type == 'ordered':
1792                 value = self._resolve_field_value(field, value, True)
1793
1794             # try to convert to number
1795             val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1796             is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1797             if is_num:
1798                 value = val_num
1799
1800             return ((-10, 0) if value is None
1801                     else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
1802                     else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1803                     else (0, value, 0) if not reverse and (limit is None or value <= limit)
1804                     else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1805                     else (-1, value, 0))
1806
1807         def _calculate_field_preference(self, format, field):
1808             type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
1809             get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1810             if type == 'multiple':
1811                 type = 'field'  # Only 'field' is allowed in multiple for now
1812                 actual_fields = self._get_field_setting(field, 'field')
1813
1814                 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1815             else:
1816                 value = get_value(field)
1817             return self._calculate_field_preference_from_value(format, field, type, value)
1818
1819         def calculate_preference(self, format):
1820             # Determine missing protocol
1821             if not format.get('protocol'):
1822                 format['protocol'] = determine_protocol(format)
1823
1824             # Determine missing ext
1825             if not format.get('ext') and 'url' in format:
1826                 format['ext'] = determine_ext(format['url'])
1827             if format.get('vcodec') == 'none':
1828                 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1829                 format['video_ext'] = 'none'
1830             else:
1831                 format['video_ext'] = format['ext']
1832                 format['audio_ext'] = 'none'
1833             # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
1834             #    format['preference'] = -1000
1835
1836             # Determine missing bitrates
1837             if format.get('tbr') is None:
1838                 if format.get('vbr') is not None and format.get('abr') is not None:
1839                     format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1840             else:
1841                 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1842                     format['vbr'] = format.get('tbr') - format.get('abr', 0)
1843                 if format.get('acodec') != 'none' and format.get('abr') is None:
1844                     format['abr'] = format.get('tbr') - format.get('vbr', 0)
1845
1846             return tuple(self._calculate_field_preference(format, field) for field in self._order)
1847
1848     def _sort_formats(self, formats, field_preference=[]):
1849         if not formats:
1850             return
1851         format_sort = self.FormatSort(self, field_preference)
1852         formats.sort(key=lambda f: format_sort.calculate_preference(f))
1853
1854     def _check_formats(self, formats, video_id):
1855         if formats:
1856             formats[:] = filter(
1857                 lambda f: self._is_valid_url(
1858                     f['url'], video_id,
1859                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1860                 formats)
1861
1862     @staticmethod
1863     def _remove_duplicate_formats(formats):
1864         format_urls = set()
1865         unique_formats = []
1866         for f in formats:
1867             if f['url'] not in format_urls:
1868                 format_urls.add(f['url'])
1869                 unique_formats.append(f)
1870         formats[:] = unique_formats
1871
1872     def _is_valid_url(self, url, video_id, item='video', headers={}):
1873         url = self._proto_relative_url(url, scheme='http:')
1874         # For now assume non HTTP(S) URLs always valid
1875         if not (url.startswith('http://') or url.startswith('https://')):
1876             return True
1877         try:
1878             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1879             return True
1880         except ExtractorError as e:
1881             self.to_screen(
1882                 '%s: %s URL is invalid, skipping: %s'
1883                 % (video_id, item, error_to_compat_str(e.cause)))
1884             return False
1885
1886     def http_scheme(self):
1887         """ Either "http:" or "https:", depending on the user's preferences """
1888         return (
1889             'http:'
1890             if self.get_param('prefer_insecure', False)
1891             else 'https:')
1892
1893     def _proto_relative_url(self, url, scheme=None):
1894         if url is None:
1895             return url
1896         if url.startswith('//'):
1897             if scheme is None:
1898                 scheme = self.http_scheme()
1899             return scheme + url
1900         else:
1901             return url
1902
1903     def _sleep(self, timeout, video_id, msg_template=None):
1904         if msg_template is None:
1905             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1906         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1907         self.to_screen(msg)
1908         time.sleep(timeout)
1909
1910     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1911                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1912                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1913         manifest = self._download_xml(
1914             manifest_url, video_id, 'Downloading f4m manifest',
1915             'Unable to download f4m manifest',
1916             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1917             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1918             transform_source=transform_source,
1919             fatal=fatal, data=data, headers=headers, query=query)
1920
1921         if manifest is False:
1922             return []
1923
1924         return self._parse_f4m_formats(
1925             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1926             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1927
1928     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1929                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1930                            fatal=True, m3u8_id=None):
1931         if not isinstance(manifest, compat_etree_Element) and not fatal:
1932             return []
1933
1934         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1935         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1936         if akamai_pv is not None and ';' in akamai_pv.text:
1937             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1938             if playerVerificationChallenge.strip() != '':
1939                 return []
1940
1941         formats = []
1942         manifest_version = '1.0'
1943         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1944         if not media_nodes:
1945             manifest_version = '2.0'
1946             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1947         # Remove unsupported DRM protected media from final formats
1948         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1949         media_nodes = remove_encrypted_media(media_nodes)
1950         if not media_nodes:
1951             return formats
1952
1953         manifest_base_url = get_base_url(manifest)
1954
1955         bootstrap_info = xpath_element(
1956             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1957             'bootstrap info', default=None)
1958
1959         vcodec = None
1960         mime_type = xpath_text(
1961             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1962             'base URL', default=None)
1963         if mime_type and mime_type.startswith('audio/'):
1964             vcodec = 'none'
1965
1966         for i, media_el in enumerate(media_nodes):
1967             tbr = int_or_none(media_el.attrib.get('bitrate'))
1968             width = int_or_none(media_el.attrib.get('width'))
1969             height = int_or_none(media_el.attrib.get('height'))
1970             format_id = join_nonempty(f4m_id, tbr or i)
1971             # If <bootstrapInfo> is present, the specified f4m is a
1972             # stream-level manifest, and only set-level manifests may refer to
1973             # external resources.  See section 11.4 and section 4 of F4M spec
1974             if bootstrap_info is None:
1975                 media_url = None
1976                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1977                 if manifest_version == '2.0':
1978                     media_url = media_el.attrib.get('href')
1979                 if media_url is None:
1980                     media_url = media_el.attrib.get('url')
1981                 if not media_url:
1982                     continue
1983                 manifest_url = (
1984                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1985                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1986                 # If media_url is itself a f4m manifest do the recursive extraction
1987                 # since bitrates in parent manifest (this one) and media_url manifest
1988                 # may differ leading to inability to resolve the format by requested
1989                 # bitrate in f4m downloader
1990                 ext = determine_ext(manifest_url)
1991                 if ext == 'f4m':
1992                     f4m_formats = self._extract_f4m_formats(
1993                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1994                         transform_source=transform_source, fatal=fatal)
1995                     # Sometimes stream-level manifest contains single media entry that
1996                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1997                     # At the same time parent's media entry in set-level manifest may
1998                     # contain it. We will copy it from parent in such cases.
1999                     if len(f4m_formats) == 1:
2000                         f = f4m_formats[0]
2001                         f.update({
2002                             'tbr': f.get('tbr') or tbr,
2003                             'width': f.get('width') or width,
2004                             'height': f.get('height') or height,
2005                             'format_id': f.get('format_id') if not tbr else format_id,
2006                             'vcodec': vcodec,
2007                         })
2008                     formats.extend(f4m_formats)
2009                     continue
2010                 elif ext == 'm3u8':
2011                     formats.extend(self._extract_m3u8_formats(
2012                         manifest_url, video_id, 'mp4', preference=preference,
2013                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
2014                     continue
2015             formats.append({
2016                 'format_id': format_id,
2017                 'url': manifest_url,
2018                 'manifest_url': manifest_url,
2019                 'ext': 'flv' if bootstrap_info is not None else None,
2020                 'protocol': 'f4m',
2021                 'tbr': tbr,
2022                 'width': width,
2023                 'height': height,
2024                 'vcodec': vcodec,
2025                 'preference': preference,
2026                 'quality': quality,
2027             })
2028         return formats
2029
2030     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
2031         return {
2032             'format_id': join_nonempty(m3u8_id, 'meta'),
2033             'url': m3u8_url,
2034             'ext': ext,
2035             'protocol': 'm3u8',
2036             'preference': preference - 100 if preference else -100,
2037             'quality': quality,
2038             'resolution': 'multiple',
2039             'format_note': 'Quality selection URL',
2040         }
2041
2042     def _report_ignoring_subs(self, name):
2043         self.report_warning(bug_reports_message(
2044             f'Ignoring subtitle tracks found in the {name} manifest; '
2045             'if any subtitle tracks are missing,'
2046         ), only_once=True)
2047
2048     def _extract_m3u8_formats(self, *args, **kwargs):
2049         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2050         if subs:
2051             self._report_ignoring_subs('HLS')
2052         return fmts
2053
2054     def _extract_m3u8_formats_and_subtitles(
2055             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2056             preference=None, quality=None, m3u8_id=None, note=None,
2057             errnote=None, fatal=True, live=False, data=None, headers={},
2058             query={}):
2059
2060         res = self._download_webpage_handle(
2061             m3u8_url, video_id,
2062             note='Downloading m3u8 information' if note is None else note,
2063             errnote='Failed to download m3u8 information' if errnote is None else errnote,
2064             fatal=fatal, data=data, headers=headers, query=query)
2065
2066         if res is False:
2067             return [], {}
2068
2069         m3u8_doc, urlh = res
2070         m3u8_url = urlh.geturl()
2071
2072         return self._parse_m3u8_formats_and_subtitles(
2073             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2074             preference=preference, quality=quality, m3u8_id=m3u8_id,
2075             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2076             headers=headers, query=query, video_id=video_id)
2077
2078     def _parse_m3u8_formats_and_subtitles(
2079             self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8_native',
2080             preference=None, quality=None, m3u8_id=None, live=False, note=None,
2081             errnote=None, fatal=True, data=None, headers={}, query={},
2082             video_id=None):
2083         formats, subtitles = [], {}
2084
2085         has_drm = re.search('|'.join([
2086             r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
2087             r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
2088         ]), m3u8_doc)
2089
2090         def format_url(url):
2091             return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
2092
2093         if self.get_param('hls_split_discontinuity', False):
2094             def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2095                 if not m3u8_doc:
2096                     if not manifest_url:
2097                         return []
2098                     m3u8_doc = self._download_webpage(
2099                         manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2100                         note=False, errnote='Failed to download m3u8 playlist information')
2101                     if m3u8_doc is False:
2102                         return []
2103                 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2104
2105         else:
2106             def _extract_m3u8_playlist_indices(*args, **kwargs):
2107                 return [None]
2108
2109         # References:
2110         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2111         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2112         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2113
2114         # We should try extracting formats only from master playlists [1, 4.3.4],
2115         # i.e. playlists that describe available qualities. On the other hand
2116         # media playlists [1, 4.3.3] should be returned as is since they contain
2117         # just the media without qualities renditions.
2118         # Fortunately, master playlist can be easily distinguished from media
2119         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
2120         # master playlist tags MUST NOT appear in a media playlist and vice versa.
2121         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2122         # media playlist and MUST NOT appear in master playlist thus we can
2123         # clearly detect media playlist with this criterion.
2124
2125         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
2126             formats = [{
2127                 'format_id': join_nonempty(m3u8_id, idx),
2128                 'format_index': idx,
2129                 'url': m3u8_url,
2130                 'ext': ext,
2131                 'protocol': entry_protocol,
2132                 'preference': preference,
2133                 'quality': quality,
2134                 'has_drm': has_drm,
2135             } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2136
2137             return formats, subtitles
2138
2139         groups = {}
2140         last_stream_inf = {}
2141
2142         def extract_media(x_media_line):
2143             media = parse_m3u8_attributes(x_media_line)
2144             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2145             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2146             if not (media_type and group_id and name):
2147                 return
2148             groups.setdefault(group_id, []).append(media)
2149             # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2150             if media_type == 'SUBTITLES':
2151                 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2152                 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2153                 # However, lack of URI has been spotted in the wild.
2154                 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2155                 if not media.get('URI'):
2156                     return
2157                 url = format_url(media['URI'])
2158                 sub_info = {
2159                     'url': url,
2160                     'ext': determine_ext(url),
2161                 }
2162                 if sub_info['ext'] == 'm3u8':
2163                     # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2164                     # files may contain is WebVTT:
2165                     # <https://tools.ietf.org/html/rfc8216#section-3.1>
2166                     sub_info['ext'] = 'vtt'
2167                     sub_info['protocol'] = 'm3u8_native'
2168                 lang = media.get('LANGUAGE') or 'und'
2169                 subtitles.setdefault(lang, []).append(sub_info)
2170             if media_type not in ('VIDEO', 'AUDIO'):
2171                 return
2172             media_url = media.get('URI')
2173             if media_url:
2174                 manifest_url = format_url(media_url)
2175                 formats.extend({
2176                     'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2177                     'format_note': name,
2178                     'format_index': idx,
2179                     'url': manifest_url,
2180                     'manifest_url': m3u8_url,
2181                     'language': media.get('LANGUAGE'),
2182                     'ext': ext,
2183                     'protocol': entry_protocol,
2184                     'preference': preference,
2185                     'quality': quality,
2186                     'vcodec': 'none' if media_type == 'AUDIO' else None,
2187                 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2188
2189         def build_stream_name():
2190             # Despite specification does not mention NAME attribute for
2191             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2192             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2193             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2194             stream_name = last_stream_inf.get('NAME')
2195             if stream_name:
2196                 return stream_name
2197             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2198             # from corresponding rendition group
2199             stream_group_id = last_stream_inf.get('VIDEO')
2200             if not stream_group_id:
2201                 return
2202             stream_group = groups.get(stream_group_id)
2203             if not stream_group:
2204                 return stream_group_id
2205             rendition = stream_group[0]
2206             return rendition.get('NAME') or stream_group_id
2207
2208         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2209         # chance to detect video only formats when EXT-X-STREAM-INF tags
2210         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2211         for line in m3u8_doc.splitlines():
2212             if line.startswith('#EXT-X-MEDIA:'):
2213                 extract_media(line)
2214
2215         for line in m3u8_doc.splitlines():
2216             if line.startswith('#EXT-X-STREAM-INF:'):
2217                 last_stream_inf = parse_m3u8_attributes(line)
2218             elif line.startswith('#') or not line.strip():
2219                 continue
2220             else:
2221                 tbr = float_or_none(
2222                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2223                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2224                 manifest_url = format_url(line.strip())
2225
2226                 for idx in _extract_m3u8_playlist_indices(manifest_url):
2227                     format_id = [m3u8_id, None, idx]
2228                     # Bandwidth of live streams may differ over time thus making
2229                     # format_id unpredictable. So it's better to keep provided
2230                     # format_id intact.
2231                     if not live:
2232                         stream_name = build_stream_name()
2233                         format_id[1] = stream_name or '%d' % (tbr or len(formats))
2234                     f = {
2235                         'format_id': join_nonempty(*format_id),
2236                         'format_index': idx,
2237                         'url': manifest_url,
2238                         'manifest_url': m3u8_url,
2239                         'tbr': tbr,
2240                         'ext': ext,
2241                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2242                         'protocol': entry_protocol,
2243                         'preference': preference,
2244                         'quality': quality,
2245                     }
2246                     resolution = last_stream_inf.get('RESOLUTION')
2247                     if resolution:
2248                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2249                         if mobj:
2250                             f['width'] = int(mobj.group('width'))
2251                             f['height'] = int(mobj.group('height'))
2252                     # Unified Streaming Platform
2253                     mobj = re.search(
2254                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2255                     if mobj:
2256                         abr, vbr = mobj.groups()
2257                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2258                         f.update({
2259                             'vbr': vbr,
2260                             'abr': abr,
2261                         })
2262                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2263                     f.update(codecs)
2264                     audio_group_id = last_stream_inf.get('AUDIO')
2265                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2266                     # references a rendition group MUST have a CODECS attribute.
2267                     # However, this is not always respected, for example, [2]
2268                     # contains EXT-X-STREAM-INF tag which references AUDIO
2269                     # rendition group but does not have CODECS and despite
2270                     # referencing an audio group it represents a complete
2271                     # (with audio and video) format. So, for such cases we will
2272                     # ignore references to rendition groups and treat them
2273                     # as complete formats.
2274                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2275                         audio_group = groups.get(audio_group_id)
2276                         if audio_group and audio_group[0].get('URI'):
2277                             # TODO: update acodec for audio only formats with
2278                             # the same GROUP-ID
2279                             f['acodec'] = 'none'
2280                     if not f.get('ext'):
2281                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2282                     formats.append(f)
2283
2284                     # for DailyMotion
2285                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2286                     if progressive_uri:
2287                         http_f = f.copy()
2288                         del http_f['manifest_url']
2289                         http_f.update({
2290                             'format_id': f['format_id'].replace('hls-', 'http-'),
2291                             'protocol': 'http',
2292                             'url': progressive_uri,
2293                         })
2294                         formats.append(http_f)
2295
2296                 last_stream_inf = {}
2297         return formats, subtitles
2298
2299     def _extract_m3u8_vod_duration(
2300             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2301
2302         m3u8_vod = self._download_webpage(
2303             m3u8_vod_url, video_id,
2304             note='Downloading m3u8 VOD manifest' if note is None else note,
2305             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2306             fatal=False, data=data, headers=headers, query=query)
2307
2308         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2309
2310     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2311         if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2312             return None
2313
2314         return int(sum(
2315             float(line[len('#EXTINF:'):].split(',')[0])
2316             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2317
2318     @staticmethod
2319     def _xpath_ns(path, namespace=None):
2320         if not namespace:
2321             return path
2322         out = []
2323         for c in path.split('/'):
2324             if not c or c == '.':
2325                 out.append(c)
2326             else:
2327                 out.append('{%s}%s' % (namespace, c))
2328         return '/'.join(out)
2329
2330     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2331         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2332
2333         if smil is False:
2334             assert not fatal
2335             return [], {}
2336
2337         namespace = self._parse_smil_namespace(smil)
2338
2339         fmts = self._parse_smil_formats(
2340             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2341         subs = self._parse_smil_subtitles(
2342             smil, namespace=namespace)
2343
2344         return fmts, subs
2345
2346     def _extract_smil_formats(self, *args, **kwargs):
2347         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2348         if subs:
2349             self._report_ignoring_subs('SMIL')
2350         return fmts
2351
2352     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2353         smil = self._download_smil(smil_url, video_id, fatal=fatal)
2354         if smil is False:
2355             return {}
2356         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2357
2358     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2359         return self._download_xml(
2360             smil_url, video_id, 'Downloading SMIL file',
2361             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2362
2363     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2364         namespace = self._parse_smil_namespace(smil)
2365
2366         formats = self._parse_smil_formats(
2367             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2368         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2369
2370         video_id = os.path.splitext(url_basename(smil_url))[0]
2371         title = None
2372         description = None
2373         upload_date = None
2374         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2375             name = meta.attrib.get('name')
2376             content = meta.attrib.get('content')
2377             if not name or not content:
2378                 continue
2379             if not title and name == 'title':
2380                 title = content
2381             elif not description and name in ('description', 'abstract'):
2382                 description = content
2383             elif not upload_date and name == 'date':
2384                 upload_date = unified_strdate(content)
2385
2386         thumbnails = [{
2387             'id': image.get('type'),
2388             'url': image.get('src'),
2389             'width': int_or_none(image.get('width')),
2390             'height': int_or_none(image.get('height')),
2391         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2392
2393         return {
2394             'id': video_id,
2395             'title': title or video_id,
2396             'description': description,
2397             'upload_date': upload_date,
2398             'thumbnails': thumbnails,
2399             'formats': formats,
2400             'subtitles': subtitles,
2401         }
2402
2403     def _parse_smil_namespace(self, smil):
2404         return self._search_regex(
2405             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2406
2407     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2408         base = smil_url
2409         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2410             b = meta.get('base') or meta.get('httpBase')
2411             if b:
2412                 base = b
2413                 break
2414
2415         formats = []
2416         rtmp_count = 0
2417         http_count = 0
2418         m3u8_count = 0
2419         imgs_count = 0
2420
2421         srcs = set()
2422         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2423         for medium in media:
2424             src = medium.get('src')
2425             if not src or src in srcs:
2426                 continue
2427             srcs.add(src)
2428
2429             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2430             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2431             width = int_or_none(medium.get('width'))
2432             height = int_or_none(medium.get('height'))
2433             proto = medium.get('proto')
2434             ext = medium.get('ext')
2435             src_ext = determine_ext(src)
2436             streamer = medium.get('streamer') or base
2437
2438             if proto == 'rtmp' or streamer.startswith('rtmp'):
2439                 rtmp_count += 1
2440                 formats.append({
2441                     'url': streamer,
2442                     'play_path': src,
2443                     'ext': 'flv',
2444                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2445                     'tbr': bitrate,
2446                     'filesize': filesize,
2447                     'width': width,
2448                     'height': height,
2449                 })
2450                 if transform_rtmp_url:
2451                     streamer, src = transform_rtmp_url(streamer, src)
2452                     formats[-1].update({
2453                         'url': streamer,
2454                         'play_path': src,
2455                     })
2456                 continue
2457
2458             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2459             src_url = src_url.strip()
2460
2461             if proto == 'm3u8' or src_ext == 'm3u8':
2462                 m3u8_formats = self._extract_m3u8_formats(
2463                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2464                 if len(m3u8_formats) == 1:
2465                     m3u8_count += 1
2466                     m3u8_formats[0].update({
2467                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2468                         'tbr': bitrate,
2469                         'width': width,
2470                         'height': height,
2471                     })
2472                 formats.extend(m3u8_formats)
2473             elif src_ext == 'f4m':
2474                 f4m_url = src_url
2475                 if not f4m_params:
2476                     f4m_params = {
2477                         'hdcore': '3.2.0',
2478                         'plugin': 'flowplayer-3.2.0.1',
2479                     }
2480                 f4m_url += '&' if '?' in f4m_url else '?'
2481                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2482                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2483             elif src_ext == 'mpd':
2484                 formats.extend(self._extract_mpd_formats(
2485                     src_url, video_id, mpd_id='dash', fatal=False))
2486             elif re.search(r'\.ism/[Mm]anifest', src_url):
2487                 formats.extend(self._extract_ism_formats(
2488                     src_url, video_id, ism_id='mss', fatal=False))
2489             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2490                 http_count += 1
2491                 formats.append({
2492                     'url': src_url,
2493                     'ext': ext or src_ext or 'flv',
2494                     'format_id': 'http-%d' % (bitrate or http_count),
2495                     'tbr': bitrate,
2496                     'filesize': filesize,
2497                     'width': width,
2498                     'height': height,
2499                 })
2500
2501         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2502             src = medium.get('src')
2503             if not src or src in srcs:
2504                 continue
2505             srcs.add(src)
2506
2507             imgs_count += 1
2508             formats.append({
2509                 'format_id': 'imagestream-%d' % (imgs_count),
2510                 'url': src,
2511                 'ext': mimetype2ext(medium.get('type')),
2512                 'acodec': 'none',
2513                 'vcodec': 'none',
2514                 'width': int_or_none(medium.get('width')),
2515                 'height': int_or_none(medium.get('height')),
2516                 'format_note': 'SMIL storyboards',
2517             })
2518
2519         return formats
2520
2521     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2522         urls = []
2523         subtitles = {}
2524         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2525             src = textstream.get('src')
2526             if not src or src in urls:
2527                 continue
2528             urls.append(src)
2529             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2530             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2531             subtitles.setdefault(lang, []).append({
2532                 'url': src,
2533                 'ext': ext,
2534             })
2535         return subtitles
2536
2537     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2538         xspf = self._download_xml(
2539             xspf_url, playlist_id, 'Downloading xpsf playlist',
2540             'Unable to download xspf manifest', fatal=fatal)
2541         if xspf is False:
2542             return []
2543         return self._parse_xspf(
2544             xspf, playlist_id, xspf_url=xspf_url,
2545             xspf_base_url=base_url(xspf_url))
2546
2547     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2548         NS_MAP = {
2549             'xspf': 'http://xspf.org/ns/0/',
2550             's1': 'http://static.streamone.nl/player/ns/0',
2551         }
2552
2553         entries = []
2554         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2555             title = xpath_text(
2556                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2557             description = xpath_text(
2558                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2559             thumbnail = xpath_text(
2560                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2561             duration = float_or_none(
2562                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2563
2564             formats = []
2565             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2566                 format_url = urljoin(xspf_base_url, location.text)
2567                 if not format_url:
2568                     continue
2569                 formats.append({
2570                     'url': format_url,
2571                     'manifest_url': xspf_url,
2572                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2573                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2574                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2575                 })
2576             self._sort_formats(formats)
2577
2578             entries.append({
2579                 'id': playlist_id,
2580                 'title': title,
2581                 'description': description,
2582                 'thumbnail': thumbnail,
2583                 'duration': duration,
2584                 'formats': formats,
2585             })
2586         return entries
2587
2588     def _extract_mpd_formats(self, *args, **kwargs):
2589         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2590         if subs:
2591             self._report_ignoring_subs('DASH')
2592         return fmts
2593
2594     def _extract_mpd_formats_and_subtitles(
2595             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2596             fatal=True, data=None, headers={}, query={}):
2597         res = self._download_xml_handle(
2598             mpd_url, video_id,
2599             note='Downloading MPD manifest' if note is None else note,
2600             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2601             fatal=fatal, data=data, headers=headers, query=query)
2602         if res is False:
2603             return [], {}
2604         mpd_doc, urlh = res
2605         if mpd_doc is None:
2606             return [], {}
2607         mpd_base_url = base_url(urlh.geturl())
2608
2609         return self._parse_mpd_formats_and_subtitles(
2610             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2611
2612     def _parse_mpd_formats(self, *args, **kwargs):
2613         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2614         if subs:
2615             self._report_ignoring_subs('DASH')
2616         return fmts
2617
2618     def _parse_mpd_formats_and_subtitles(
2619             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2620         """
2621         Parse formats from MPD manifest.
2622         References:
2623          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2624             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2625          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2626         """
2627         if not self.get_param('dynamic_mpd', True):
2628             if mpd_doc.get('type') == 'dynamic':
2629                 return [], {}
2630
2631         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2632
2633         def _add_ns(path):
2634             return self._xpath_ns(path, namespace)
2635
2636         def is_drm_protected(element):
2637             return element.find(_add_ns('ContentProtection')) is not None
2638
2639         def extract_multisegment_info(element, ms_parent_info):
2640             ms_info = ms_parent_info.copy()
2641
2642             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2643             # common attributes and elements.  We will only extract relevant
2644             # for us.
2645             def extract_common(source):
2646                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2647                 if segment_timeline is not None:
2648                     s_e = segment_timeline.findall(_add_ns('S'))
2649                     if s_e:
2650                         ms_info['total_number'] = 0
2651                         ms_info['s'] = []
2652                         for s in s_e:
2653                             r = int(s.get('r', 0))
2654                             ms_info['total_number'] += 1 + r
2655                             ms_info['s'].append({
2656                                 't': int(s.get('t', 0)),
2657                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2658                                 'd': int(s.attrib['d']),
2659                                 'r': r,
2660                             })
2661                 start_number = source.get('startNumber')
2662                 if start_number:
2663                     ms_info['start_number'] = int(start_number)
2664                 timescale = source.get('timescale')
2665                 if timescale:
2666                     ms_info['timescale'] = int(timescale)
2667                 segment_duration = source.get('duration')
2668                 if segment_duration:
2669                     ms_info['segment_duration'] = float(segment_duration)
2670
2671             def extract_Initialization(source):
2672                 initialization = source.find(_add_ns('Initialization'))
2673                 if initialization is not None:
2674                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2675
2676             segment_list = element.find(_add_ns('SegmentList'))
2677             if segment_list is not None:
2678                 extract_common(segment_list)
2679                 extract_Initialization(segment_list)
2680                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2681                 if segment_urls_e:
2682                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2683             else:
2684                 segment_template = element.find(_add_ns('SegmentTemplate'))
2685                 if segment_template is not None:
2686                     extract_common(segment_template)
2687                     media = segment_template.get('media')
2688                     if media:
2689                         ms_info['media'] = media
2690                     initialization = segment_template.get('initialization')
2691                     if initialization:
2692                         ms_info['initialization'] = initialization
2693                     else:
2694                         extract_Initialization(segment_template)
2695             return ms_info
2696
2697         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2698         formats, subtitles = [], {}
2699         stream_numbers = collections.defaultdict(int)
2700         for period in mpd_doc.findall(_add_ns('Period')):
2701             period_duration = parse_duration(period.get('duration')) or mpd_duration
2702             period_ms_info = extract_multisegment_info(period, {
2703                 'start_number': 1,
2704                 'timescale': 1,
2705             })
2706             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2707                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2708                 for representation in adaptation_set.findall(_add_ns('Representation')):
2709                     representation_attrib = adaptation_set.attrib.copy()
2710                     representation_attrib.update(representation.attrib)
2711                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2712                     mime_type = representation_attrib['mimeType']
2713                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2714
2715                     codecs = parse_codecs(representation_attrib.get('codecs', ''))
2716                     if content_type not in ('video', 'audio', 'text'):
2717                         if mime_type == 'image/jpeg':
2718                             content_type = mime_type
2719                         elif codecs['vcodec'] != 'none':
2720                             content_type = 'video'
2721                         elif codecs['acodec'] != 'none':
2722                             content_type = 'audio'
2723                         elif codecs.get('tcodec', 'none') != 'none':
2724                             content_type = 'text'
2725                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2726                             content_type = 'text'
2727                         else:
2728                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2729                             continue
2730
2731                     base_url = ''
2732                     for element in (representation, adaptation_set, period, mpd_doc):
2733                         base_url_e = element.find(_add_ns('BaseURL'))
2734                         if base_url_e is not None:
2735                             base_url = base_url_e.text + base_url
2736                             if re.match(r'^https?://', base_url):
2737                                 break
2738                     if mpd_base_url and base_url.startswith('/'):
2739                         base_url = compat_urlparse.urljoin(mpd_base_url, base_url)
2740                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2741                         if not mpd_base_url.endswith('/'):
2742                             mpd_base_url += '/'
2743                         base_url = mpd_base_url + base_url
2744                     representation_id = representation_attrib.get('id')
2745                     lang = representation_attrib.get('lang')
2746                     url_el = representation.find(_add_ns('BaseURL'))
2747                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2748                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2749                     if representation_id is not None:
2750                         format_id = representation_id
2751                     else:
2752                         format_id = content_type
2753                     if mpd_id:
2754                         format_id = mpd_id + '-' + format_id
2755                     if content_type in ('video', 'audio'):
2756                         f = {
2757                             'format_id': format_id,
2758                             'manifest_url': mpd_url,
2759                             'ext': mimetype2ext(mime_type),
2760                             'width': int_or_none(representation_attrib.get('width')),
2761                             'height': int_or_none(representation_attrib.get('height')),
2762                             'tbr': float_or_none(bandwidth, 1000),
2763                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2764                             'fps': int_or_none(representation_attrib.get('frameRate')),
2765                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2766                             'format_note': 'DASH %s' % content_type,
2767                             'filesize': filesize,
2768                             'container': mimetype2ext(mime_type) + '_dash',
2769                             **codecs
2770                         }
2771                     elif content_type == 'text':
2772                         f = {
2773                             'ext': mimetype2ext(mime_type),
2774                             'manifest_url': mpd_url,
2775                             'filesize': filesize,
2776                         }
2777                     elif content_type == 'image/jpeg':
2778                         # See test case in VikiIE
2779                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2780                         f = {
2781                             'format_id': format_id,
2782                             'ext': 'mhtml',
2783                             'manifest_url': mpd_url,
2784                             'format_note': 'DASH storyboards (jpeg)',
2785                             'acodec': 'none',
2786                             'vcodec': 'none',
2787                         }
2788                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2789                         f['has_drm'] = True
2790                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2791
2792                     def prepare_template(template_name, identifiers):
2793                         tmpl = representation_ms_info[template_name]
2794                         # First of, % characters outside $...$ templates
2795                         # must be escaped by doubling for proper processing
2796                         # by % operator string formatting used further (see
2797                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2798                         t = ''
2799                         in_template = False
2800                         for c in tmpl:
2801                             t += c
2802                             if c == '$':
2803                                 in_template = not in_template
2804                             elif c == '%' and not in_template:
2805                                 t += c
2806                         # Next, $...$ templates are translated to their
2807                         # %(...) counterparts to be used with % operator
2808                         if representation_id is not None:
2809                             t = t.replace('$RepresentationID$', representation_id)
2810                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2811                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2812                         t.replace('$$', '$')
2813                         return t
2814
2815                     # @initialization is a regular template like @media one
2816                     # so it should be handled just the same way (see
2817                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2818                     if 'initialization' in representation_ms_info:
2819                         initialization_template = prepare_template(
2820                             'initialization',
2821                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2822                             # $Time$ shall not be included for @initialization thus
2823                             # only $Bandwidth$ remains
2824                             ('Bandwidth', ))
2825                         representation_ms_info['initialization_url'] = initialization_template % {
2826                             'Bandwidth': bandwidth,
2827                         }
2828
2829                     def location_key(location):
2830                         return 'url' if re.match(r'^https?://', location) else 'path'
2831
2832                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2833
2834                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2835                         media_location_key = location_key(media_template)
2836
2837                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2838                         # can't be used at the same time
2839                         if '%(Number' in media_template and 's' not in representation_ms_info:
2840                             segment_duration = None
2841                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2842                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2843                                 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2844                             representation_ms_info['fragments'] = [{
2845                                 media_location_key: media_template % {
2846                                     'Number': segment_number,
2847                                     'Bandwidth': bandwidth,
2848                                 },
2849                                 'duration': segment_duration,
2850                             } for segment_number in range(
2851                                 representation_ms_info['start_number'],
2852                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2853                         else:
2854                             # $Number*$ or $Time$ in media template with S list available
2855                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2856                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2857                             representation_ms_info['fragments'] = []
2858                             segment_time = 0
2859                             segment_d = None
2860                             segment_number = representation_ms_info['start_number']
2861
2862                             def add_segment_url():
2863                                 segment_url = media_template % {
2864                                     'Time': segment_time,
2865                                     'Bandwidth': bandwidth,
2866                                     'Number': segment_number,
2867                                 }
2868                                 representation_ms_info['fragments'].append({
2869                                     media_location_key: segment_url,
2870                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2871                                 })
2872
2873                             for num, s in enumerate(representation_ms_info['s']):
2874                                 segment_time = s.get('t') or segment_time
2875                                 segment_d = s['d']
2876                                 add_segment_url()
2877                                 segment_number += 1
2878                                 for r in range(s.get('r', 0)):
2879                                     segment_time += segment_d
2880                                     add_segment_url()
2881                                     segment_number += 1
2882                                 segment_time += segment_d
2883                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2884                         # No media template
2885                         # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2886                         # or any YouTube dashsegments video
2887                         fragments = []
2888                         segment_index = 0
2889                         timescale = representation_ms_info['timescale']
2890                         for s in representation_ms_info['s']:
2891                             duration = float_or_none(s['d'], timescale)
2892                             for r in range(s.get('r', 0) + 1):
2893                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
2894                                 fragments.append({
2895                                     location_key(segment_uri): segment_uri,
2896                                     'duration': duration,
2897                                 })
2898                                 segment_index += 1
2899                         representation_ms_info['fragments'] = fragments
2900                     elif 'segment_urls' in representation_ms_info:
2901                         # Segment URLs with no SegmentTimeline
2902                         # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2903                         # https://github.com/ytdl-org/youtube-dl/pull/14844
2904                         fragments = []
2905                         segment_duration = float_or_none(
2906                             representation_ms_info['segment_duration'],
2907                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2908                         for segment_url in representation_ms_info['segment_urls']:
2909                             fragment = {
2910                                 location_key(segment_url): segment_url,
2911                             }
2912                             if segment_duration:
2913                                 fragment['duration'] = segment_duration
2914                             fragments.append(fragment)
2915                         representation_ms_info['fragments'] = fragments
2916                     # If there is a fragments key available then we correctly recognized fragmented media.
2917                     # Otherwise we will assume unfragmented media with direct access. Technically, such
2918                     # assumption is not necessarily correct since we may simply have no support for
2919                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2920                     if 'fragments' in representation_ms_info:
2921                         f.update({
2922                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
2923                             'url': mpd_url or base_url,
2924                             'fragment_base_url': base_url,
2925                             'fragments': [],
2926                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2927                         })
2928                         if 'initialization_url' in representation_ms_info:
2929                             initialization_url = representation_ms_info['initialization_url']
2930                             if not f.get('url'):
2931                                 f['url'] = initialization_url
2932                             f['fragments'].append({location_key(initialization_url): initialization_url})
2933                         f['fragments'].extend(representation_ms_info['fragments'])
2934                     else:
2935                         # Assuming direct URL to unfragmented media.
2936                         f['url'] = base_url
2937                     if content_type in ('video', 'audio', 'image/jpeg'):
2938                         f['manifest_stream_number'] = stream_numbers[f['url']]
2939                         stream_numbers[f['url']] += 1
2940                         formats.append(f)
2941                     elif content_type == 'text':
2942                         subtitles.setdefault(lang or 'und', []).append(f)
2943
2944         return formats, subtitles
2945
2946     def _extract_ism_formats(self, *args, **kwargs):
2947         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2948         if subs:
2949             self._report_ignoring_subs('ISM')
2950         return fmts
2951
2952     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2953         res = self._download_xml_handle(
2954             ism_url, video_id,
2955             note='Downloading ISM manifest' if note is None else note,
2956             errnote='Failed to download ISM manifest' if errnote is None else errnote,
2957             fatal=fatal, data=data, headers=headers, query=query)
2958         if res is False:
2959             return [], {}
2960         ism_doc, urlh = res
2961         if ism_doc is None:
2962             return [], {}
2963
2964         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
2965
2966     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
2967         """
2968         Parse formats from ISM manifest.
2969         References:
2970          1. [MS-SSTR]: Smooth Streaming Protocol,
2971             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2972         """
2973         if ism_doc.get('IsLive') == 'TRUE':
2974             return [], {}
2975
2976         duration = int(ism_doc.attrib['Duration'])
2977         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2978
2979         formats = []
2980         subtitles = {}
2981         for stream in ism_doc.findall('StreamIndex'):
2982             stream_type = stream.get('Type')
2983             if stream_type not in ('video', 'audio', 'text'):
2984                 continue
2985             url_pattern = stream.attrib['Url']
2986             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2987             stream_name = stream.get('Name')
2988             stream_language = stream.get('Language', 'und')
2989             for track in stream.findall('QualityLevel'):
2990                 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
2991                 # TODO: add support for WVC1 and WMAP
2992                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
2993                     self.report_warning('%s is not a supported codec' % fourcc)
2994                     continue
2995                 tbr = int(track.attrib['Bitrate']) // 1000
2996                 # [1] does not mention Width and Height attributes. However,
2997                 # they're often present while MaxWidth and MaxHeight are
2998                 # missing, so should be used as fallbacks
2999                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3000                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3001                 sampling_rate = int_or_none(track.get('SamplingRate'))
3002
3003                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3004                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
3005
3006                 fragments = []
3007                 fragment_ctx = {
3008                     'time': 0,
3009                 }
3010                 stream_fragments = stream.findall('c')
3011                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3012                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3013                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3014                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3015                     if not fragment_ctx['duration']:
3016                         try:
3017                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
3018                         except IndexError:
3019                             next_fragment_time = duration
3020                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3021                     for _ in range(fragment_repeat):
3022                         fragments.append({
3023                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
3024                             'duration': fragment_ctx['duration'] / stream_timescale,
3025                         })
3026                         fragment_ctx['time'] += fragment_ctx['duration']
3027
3028                 if stream_type == 'text':
3029                     subtitles.setdefault(stream_language, []).append({
3030                         'ext': 'ismt',
3031                         'protocol': 'ism',
3032                         'url': ism_url,
3033                         'manifest_url': ism_url,
3034                         'fragments': fragments,
3035                         '_download_params': {
3036                             'stream_type': stream_type,
3037                             'duration': duration,
3038                             'timescale': stream_timescale,
3039                             'fourcc': fourcc,
3040                             'language': stream_language,
3041                             'codec_private_data': track.get('CodecPrivateData'),
3042                         }
3043                     })
3044                 elif stream_type in ('video', 'audio'):
3045                     formats.append({
3046                         'format_id': join_nonempty(ism_id, stream_name, tbr),
3047                         'url': ism_url,
3048                         'manifest_url': ism_url,
3049                         'ext': 'ismv' if stream_type == 'video' else 'isma',
3050                         'width': width,
3051                         'height': height,
3052                         'tbr': tbr,
3053                         'asr': sampling_rate,
3054                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
3055                         'acodec': 'none' if stream_type == 'video' else fourcc,
3056                         'protocol': 'ism',
3057                         'fragments': fragments,
3058                         'has_drm': ism_doc.find('Protection') is not None,
3059                         '_download_params': {
3060                             'stream_type': stream_type,
3061                             'duration': duration,
3062                             'timescale': stream_timescale,
3063                             'width': width or 0,
3064                             'height': height or 0,
3065                             'fourcc': fourcc,
3066                             'language': stream_language,
3067                             'codec_private_data': track.get('CodecPrivateData'),
3068                             'sampling_rate': sampling_rate,
3069                             'channels': int_or_none(track.get('Channels', 2)),
3070                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3071                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3072                         },
3073                     })
3074         return formats, subtitles
3075
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
        """
        Collect media entries from the <video>/<audio> style tags of a webpage.

        Every matched tag (plain, amp-* or dl8-* variant) is turned into a dict
        with 'formats', 'subtitles' and 'thumbnail' keys. Formats come from
        the tag's own "src" attribute and from nested <source> tags, subtitles
        from nested <track> tags. Tags yielding neither formats nor subtitles
        are dropped.

        @param base_url             URL used to resolve relative URLs; it is
                                    also set as the Referer header of every
                                    format found
        @param webpage              HTML to search
        @param video_id             Passed to the manifest extractors
        @param m3u8_id              Passed to _extract_m3u8_formats
        @param m3u8_entry_protocol  Passed to _extract_m3u8_formats as
                                    entry_protocol
        @param mpd_id               Passed to _extract_mpd_formats
        @param preference           Passed to _extract_m3u8_formats
        @param quality              Passed to _extract_m3u8_formats

        Returns a list of entry dicts (possibly empty).
        """
        def absolute_url(item_url):
            # Resolve item_url against the page's base_url
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            # Split a "type" attribute such as 'video/mp4; codecs="..."' into
            # a format dict: codec fields from parse_codecs plus 'ext' derived
            # from the mimetype. Returns {} when the attribute is missing or
            # does not look like a mimetype
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type, type_info={}):
            # Returns (is_plain_url, formats). HLS and DASH manifests are
            # expanded through the respective extractors (non-fatally);
            # anything else becomes a single direct-URL format.
            # type_info is only read, never mutated
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, quality=quality, fatal=False)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                }]
            return is_plain_url, formats

        entries = []
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we will include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
        # Self-closing tags first; they have no content, hence the trailing ''
        # so that both kinds of matches are 4-tuples
        media_tags = [(media_tag, media_tag_name, media_type, '')
                      for media_tag, media_tag_name, media_type
                      in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
        for media_tag, _, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            # Media referenced directly by the tag itself
            src = strip_or_none(media_attributes.get('src'))
            if src:
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            if media_content:
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    s_attr = extract_attributes(source_tag)
                    # data-video-src and data-src are non standard but seen
                    # several times in the wild
                    src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
                    if not src:
                        continue
                    f = parse_content_type(s_attr.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                    if is_plain_url:
                        # width, height, res, label and title attributes are
                        # all not standard but seen several times in the wild
                        labels = [
                            s_attr.get(lbl)
                            for lbl in ('label', 'title')
                            if str_or_none(s_attr.get(lbl))
                        ]
                        width = int_or_none(s_attr.get('width'))
                        height = (int_or_none(s_attr.get('height'))
                                  or int_or_none(s_attr.get('res')))
                        # Fill in whatever dimension is still missing from
                        # the first label that provides it
                        if not width or not height:
                            for lbl in labels:
                                resolution = parse_resolution(lbl)
                                if not resolution:
                                    continue
                                width = width or resolution.get('width')
                                height = height or resolution.get('height')
                        # The first label carrying a bitrate wins; the
                        # for-else leaves tbr as None when no label has one
                        for lbl in labels:
                            tbr = parse_bitrate(lbl)
                            if tbr:
                                break
                        else:
                            tbr = None
                        f.update({
                            'width': width,
                            'height': height,
                            'tbr': tbr,
                            'format_id': s_attr.get('label') or s_attr.get('title'),
                        })
                        # Keys of the plain-URL format ('url', 'vcodec') take
                        # precedence over those derived from the type attribute
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    # Tracks without a kind are treated like subtitles
                    if not kind or kind in ('subtitles', 'captions'):
                        src = strip_or_none(track_attributes.get('src'))
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            # Every format is requested with the page as Referer
            for f in media_info['formats']:
                f.setdefault('http_headers', {})['Referer'] = base_url
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
3198
3199     def _extract_akamai_formats(self, *args, **kwargs):
3200         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3201         if subs:
3202             self._report_ignoring_subs('akamai')
3203         return fmts
3204
3205     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3206         signed = 'hdnea=' in manifest_url
3207         if not signed:
3208             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3209             manifest_url = re.sub(
3210                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3211                 '', manifest_url).strip('?')
3212
3213         formats = []
3214         subtitles = {}
3215
3216         hdcore_sign = 'hdcore=3.7.0'
3217         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3218         hds_host = hosts.get('hds')
3219         if hds_host:
3220             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3221         if 'hdcore=' not in f4m_url:
3222             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3223         f4m_formats = self._extract_f4m_formats(
3224             f4m_url, video_id, f4m_id='hds', fatal=False)
3225         for entry in f4m_formats:
3226             entry.update({'extra_param_to_segment_url': hdcore_sign})
3227         formats.extend(f4m_formats)
3228
3229         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3230         hls_host = hosts.get('hls')
3231         if hls_host:
3232             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3233         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3234             m3u8_url, video_id, 'mp4', 'm3u8_native',
3235             m3u8_id='hls', fatal=False)
3236         formats.extend(m3u8_formats)
3237         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3238
3239         http_host = hosts.get('http')
3240         if http_host and m3u8_formats and not signed:
3241             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3242             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3243             qualities_length = len(qualities)
3244             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3245                 i = 0
3246                 for f in m3u8_formats:
3247                     if f['vcodec'] != 'none':
3248                         for protocol in ('http', 'https'):
3249                             http_f = f.copy()
3250                             del http_f['manifest_url']
3251                             http_url = re.sub(
3252                                 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
3253                             http_f.update({
3254                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3255                                 'url': http_url,
3256                                 'protocol': protocol,
3257                             })
3258                             formats.append(http_f)
3259                         i += 1
3260
3261         return formats, subtitles
3262
3263     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3264         query = compat_urlparse.urlparse(url).query
3265         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3266         mobj = re.search(
3267             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3268         url_base = mobj.group('url')
3269         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3270         formats = []
3271
3272         def manifest_url(manifest):
3273             m_url = '%s/%s' % (http_base_url, manifest)
3274             if query:
3275                 m_url += '?%s' % query
3276             return m_url
3277
3278         if 'm3u8' not in skip_protocols:
3279             formats.extend(self._extract_m3u8_formats(
3280                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3281                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3282         if 'f4m' not in skip_protocols:
3283             formats.extend(self._extract_f4m_formats(
3284                 manifest_url('manifest.f4m'),
3285                 video_id, f4m_id='hds', fatal=False))
3286         if 'dash' not in skip_protocols:
3287             formats.extend(self._extract_mpd_formats(
3288                 manifest_url('manifest.mpd'),
3289                 video_id, mpd_id='dash', fatal=False))
3290         if re.search(r'(?:/smil:|\.smil)', url_base):
3291             if 'smil' not in skip_protocols:
3292                 rtmp_formats = self._extract_smil_formats(
3293                     manifest_url('jwplayer.smil'),
3294                     video_id, fatal=False)
3295                 for rtmp_format in rtmp_formats:
3296                     rtsp_format = rtmp_format.copy()
3297                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3298                     del rtsp_format['play_path']
3299                     del rtsp_format['ext']
3300                     rtsp_format.update({
3301                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3302                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3303                         'protocol': 'rtsp',
3304                     })
3305                     formats.extend([rtmp_format, rtsp_format])
3306         else:
3307             for protocol in ('rtmp', 'rtsp'):
3308                 if protocol not in skip_protocols:
3309                     formats.append({
3310                         'url': '%s:%s' % (protocol, url_base),
3311                         'format_id': protocol,
3312                         'protocol': protocol,
3313                     })
3314         return formats
3315
3316     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3317         mobj = re.search(
3318             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3319             webpage)
3320         if mobj:
3321             try:
3322                 jwplayer_data = self._parse_json(mobj.group('options'),
3323                                                  video_id=video_id,
3324                                                  transform_source=transform_source)
3325             except ExtractorError:
3326                 pass
3327             else:
3328                 if isinstance(jwplayer_data, dict):
3329                     return jwplayer_data
3330
3331     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3332         jwplayer_data = self._find_jwplayer_data(
3333             webpage, video_id, transform_source=js_to_json)
3334         return self._parse_jwplayer_data(
3335             jwplayer_data, video_id, *args, **kwargs)
3336
3337     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3338                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3339         # JWPlayer backward compatibility: flattened playlists
3340         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3341         if 'playlist' not in jwplayer_data:
3342             jwplayer_data = {'playlist': [jwplayer_data]}
3343
3344         entries = []
3345
3346         # JWPlayer backward compatibility: single playlist item
3347         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3348         if not isinstance(jwplayer_data['playlist'], list):
3349             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3350
3351         for video_data in jwplayer_data['playlist']:
3352             # JWPlayer backward compatibility: flattened sources
3353             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3354             if 'sources' not in video_data:
3355                 video_data['sources'] = [video_data]
3356
3357             this_video_id = video_id or video_data['mediaid']
3358
3359             formats = self._parse_jwplayer_formats(
3360                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3361                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3362
3363             subtitles = {}
3364             tracks = video_data.get('tracks')
3365             if tracks and isinstance(tracks, list):
3366                 for track in tracks:
3367                     if not isinstance(track, dict):
3368                         continue
3369                     track_kind = track.get('kind')
3370                     if not track_kind or not isinstance(track_kind, compat_str):
3371                         continue
3372                     if track_kind.lower() not in ('captions', 'subtitles'):
3373                         continue
3374                     track_url = urljoin(base_url, track.get('file'))
3375                     if not track_url:
3376                         continue
3377                     subtitles.setdefault(track.get('label') or 'en', []).append({
3378                         'url': self._proto_relative_url(track_url)
3379                     })
3380
3381             entry = {
3382                 'id': this_video_id,
3383                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3384                 'description': clean_html(video_data.get('description')),
3385                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3386                 'timestamp': int_or_none(video_data.get('pubdate')),
3387                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3388                 'subtitles': subtitles,
3389             }
3390             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3391             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3392                 entry.update({
3393                     '_type': 'url_transparent',
3394                     'url': formats[0]['url'],
3395                 })
3396             else:
3397                 self._sort_formats(formats)
3398                 entry['formats'] = formats
3399             entries.append(entry)
3400         if len(entries) == 1:
3401             return entries[0]
3402         else:
3403             return self.playlist_result(entries)
3404
3405     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3406                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3407         urls = []
3408         formats = []
3409         for source in jwplayer_sources_data:
3410             if not isinstance(source, dict):
3411                 continue
3412             source_url = urljoin(
3413                 base_url, self._proto_relative_url(source.get('file')))
3414             if not source_url or source_url in urls:
3415                 continue
3416             urls.append(source_url)
3417             source_type = source.get('type') or ''
3418             ext = mimetype2ext(source_type) or determine_ext(source_url)
3419             if source_type == 'hls' or ext == 'm3u8':
3420                 formats.extend(self._extract_m3u8_formats(
3421                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3422                     m3u8_id=m3u8_id, fatal=False))
3423             elif source_type == 'dash' or ext == 'mpd':
3424                 formats.extend(self._extract_mpd_formats(
3425                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3426             elif ext == 'smil':
3427                 formats.extend(self._extract_smil_formats(
3428                     source_url, video_id, fatal=False))
3429             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3430             elif source_type.startswith('audio') or ext in (
3431                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3432                 formats.append({
3433                     'url': source_url,
3434                     'vcodec': 'none',
3435                     'ext': ext,
3436                 })
3437             else:
3438                 height = int_or_none(source.get('height'))
3439                 if height is None:
3440                     # Often no height is provided but there is a label in
3441                     # format like "1080p", "720p SD", or 1080.
3442                     height = int_or_none(self._search_regex(
3443                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3444                         'height', default=None))
3445                 a_format = {
3446                     'url': source_url,
3447                     'width': int_or_none(source.get('width')),
3448                     'height': height,
3449                     'tbr': int_or_none(source.get('bitrate')),
3450                     'ext': ext,
3451                 }
3452                 if source_url.startswith('rtmp'):
3453                     a_format['ext'] = 'flv'
3454                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3455                     # of jwplayer.flash.swf
3456                     rtmp_url_parts = re.split(
3457                         r'((?:mp4|mp3|flv):)', source_url, 1)
3458                     if len(rtmp_url_parts) == 3:
3459                         rtmp_url, prefix, play_path = rtmp_url_parts
3460                         a_format.update({
3461                             'url': rtmp_url,
3462                             'play_path': prefix + play_path,
3463                         })
3464                     if rtmp_params:
3465                         a_format.update(rtmp_params)
3466                 formats.append(a_format)
3467         return formats
3468
3469     def _live_title(self, name):
3470         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3471         return name
3472
3473     def _int(self, v, name, fatal=False, **kwargs):
3474         res = int_or_none(v, **kwargs)
3475         if 'get_attr' in kwargs:
3476             print(getattr(v, kwargs['get_attr']))
3477         if res is None:
3478             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3479             if fatal:
3480                 raise ExtractorError(msg)
3481             else:
3482                 self.report_warning(msg)
3483         return res
3484
3485     def _float(self, v, name, fatal=False, **kwargs):
3486         res = float_or_none(v, **kwargs)
3487         if res is None:
3488             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3489             if fatal:
3490                 raise ExtractorError(msg)
3491             else:
3492                 self.report_warning(msg)
3493         return res
3494
3495     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3496                     path='/', secure=False, discard=False, rest={}, **kwargs):
3497         cookie = compat_cookiejar_Cookie(
3498             0, name, value, port, port is not None, domain, True,
3499             domain.startswith('.'), path, True, secure, expire_time,
3500             discard, None, None, rest)
3501         self._downloader.cookiejar.set_cookie(cookie)
3502
3503     def _get_cookies(self, url):
3504         """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3505         req = sanitized_Request(url)
3506         self._downloader.cookiejar.add_cookie_header(req)
3507         return compat_cookies_SimpleCookie(req.get_header('Cookie'))
3508
3509     def _apply_first_set_cookie_header(self, url_handle, cookie):
3510         """
3511         Apply first Set-Cookie header instead of the last. Experimental.
3512
3513         Some sites (e.g. [1-3]) may serve two cookies under the same name
3514         in Set-Cookie header and expect the first (old) one to be set rather
3515         than second (new). However, as of RFC6265 the newer one cookie
3516         should be set into cookie store what actually happens.
3517         We will workaround this issue by resetting the cookie to
3518         the first one manually.
3519         1. https://new.vk.com/
3520         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3521         3. https://learning.oreilly.com/
3522         """
3523         for header, cookies in url_handle.headers.items():
3524             if header.lower() != 'set-cookie':
3525                 continue
3526             if sys.version_info[0] >= 3:
3527                 cookies = cookies.encode('iso-8859-1')
3528             cookies = cookies.decode('utf-8')
3529             cookie_value = re.search(
3530                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3531             if cookie_value:
3532                 value, domain = cookie_value.groups()
3533                 self._set_cookie(domain, cookie, value)
3534                 break
3535
3536     def get_testcases(self, include_onlymatching=False):
3537         t = getattr(self, '_TEST', None)
3538         if t:
3539             assert not hasattr(self, '_TESTS'), \
3540                 '%s has _TEST and _TESTS' % type(self).__name__
3541             tests = [t]
3542         else:
3543             tests = getattr(self, '_TESTS', [])
3544         for t in tests:
3545             if not include_onlymatching and t.get('only_matching', False):
3546                 continue
3547             t['name'] = type(self).__name__[:-len('IE')]
3548             yield t
3549
3550     def is_suitable(self, age_limit):
3551         """ Test whether the extractor is generally suitable for the given
3552         age limit (i.e. pornographic sites are not, all others usually are) """
3553
3554         any_restricted = False
3555         for tc in self.get_testcases(include_onlymatching=False):
3556             if tc.get('playlist', []):
3557                 tc = tc['playlist'][0]
3558             is_restricted = age_restricted(
3559                 tc.get('info_dict', {}).get('age_limit'), age_limit)
3560             if not is_restricted:
3561                 return True
3562             any_restricted = any_restricted or is_restricted
3563         return not any_restricted
3564
3565     def extract_subtitles(self, *args, **kwargs):
3566         if (self.get_param('writesubtitles', False)
3567                 or self.get_param('listsubtitles')):
3568             return self._get_subtitles(*args, **kwargs)
3569         return {}
3570
3571     def _get_subtitles(self, *args, **kwargs):
3572         raise NotImplementedError('This method must be implemented by subclasses')
3573
3574     def extract_comments(self, *args, **kwargs):
3575         if not self.get_param('getcomments'):
3576             return None
3577         generator = self._get_comments(*args, **kwargs)
3578
3579         def extractor():
3580             comments = []
3581             interrupted = True
3582             try:
3583                 while True:
3584                     comments.append(next(generator))
3585             except StopIteration:
3586                 interrupted = False
3587             except KeyboardInterrupt:
3588                 self.to_screen('Interrupted by user')
3589             except Exception as e:
3590                 if self.get_param('ignoreerrors') is not True:
3591                     raise
3592                 self._downloader.report_error(e)
3593             comment_count = len(comments)
3594             self.to_screen(f'Extracted {comment_count} comments')
3595             return {
3596                 'comments': comments,
3597                 'comment_count': None if interrupted else comment_count
3598             }
3599         return extractor
3600
3601     def _get_comments(self, *args, **kwargs):
3602         raise NotImplementedError('This method must be implemented by subclasses')
3603
3604     @staticmethod
3605     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3606         """ Merge subtitle items for one language. Items with duplicated URLs
3607         will be dropped. """
3608         list1_urls = set([item['url'] for item in subtitle_list1])
3609         ret = list(subtitle_list1)
3610         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
3611         return ret
3612
3613     @classmethod
3614     def _merge_subtitles(cls, *dicts, target=None):
3615         """ Merge subtitle dictionaries, language by language. """
3616         if target is None:
3617             target = {}
3618         for d in dicts:
3619             for lang, subs in d.items():
3620                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3621         return target
3622
3623     def extract_automatic_captions(self, *args, **kwargs):
3624         if (self.get_param('writeautomaticsub', False)
3625                 or self.get_param('listsubtitles')):
3626             return self._get_automatic_captions(*args, **kwargs)
3627         return {}
3628
3629     def _get_automatic_captions(self, *args, **kwargs):
3630         raise NotImplementedError('This method must be implemented by subclasses')
3631
3632     def mark_watched(self, *args, **kwargs):
3633         if not self.get_param('mark_watched', False):
3634             return
3635         if (self._get_login_info()[0] is not None
3636                 or self.get_param('cookiefile')
3637                 or self.get_param('cookiesfrombrowser')):
3638             self._mark_watched(*args, **kwargs)
3639
3640     def _mark_watched(self, *args, **kwargs):
3641         raise NotImplementedError('This method must be implemented by subclasses')
3642
3643     def geo_verification_headers(self):
3644         headers = {}
3645         geo_verification_proxy = self.get_param('geo_verification_proxy')
3646         if geo_verification_proxy:
3647             headers['Ytdl-request-proxy'] = geo_verification_proxy
3648         return headers
3649
3650     def _generic_id(self, url):
3651         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3652
3653     def _generic_title(self, url):
3654         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3655
3656     @staticmethod
3657     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3658         all_known = all(map(
3659             lambda x: x is not None,
3660             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3661         return (
3662             'private' if is_private
3663             else 'premium_only' if needs_premium
3664             else 'subscriber_only' if needs_subscription
3665             else 'needs_auth' if needs_auth
3666             else 'unlisted' if is_unlisted
3667             else 'public' if all_known
3668             else None)
3669
3670     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3671         '''
3672         @returns            A list of values for the extractor argument given by "key"
3673                             or "default" if no such key is present
3674         @param default      The default value to return when the key is not present (default: [])
3675         @param casesense    When false, the values are converted to lower case
3676         '''
3677         val = traverse_obj(
3678             self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key))
3679         if val is None:
3680             return [] if default is NO_DEFAULT else default
3681         return list(val) if casesense else [x.lower() for x in val]
3682
3683
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for extractors that run paged search queries.

    Queries have the form _SEARCH_KEY<prefix>:<query>, where <prefix> is
    empty (one result), a positive number (that many results) or "all"
    (up to _MAX_RESULTS results).
    Subclasses should define _SEARCH_KEY and may define _MAX_RESULTS.
    """

    _MAX_RESULTS = float('inf')

    @classmethod
    def _make_valid_url(cls):
        """Build the URL pattern from this class's _SEARCH_KEY"""
        pattern = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return pattern % cls._SEARCH_KEY

    def _real_extract(self, query):
        """Split the search URL into prefix and query and fetch the results.

        An empty prefix asks for one result, "all" for _MAX_RESULTS results
        and a number for that many, capped at _MAX_RESULTS with a warning.
        Raises ExtractorError for a number that is not positive.
        """
        matched = self._match_valid_url(query)
        prefix, query = matched.group('prefix', 'query')
        if prefix == '':
            limit = 1
        elif prefix == 'all':
            limit = self._MAX_RESULTS
        else:
            limit = int(prefix)
            if limit <= 0:
                raise ExtractorError('invalid download number %s for query "%s"' % (limit, query))
            if limit > self._MAX_RESULTS:
                self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, limit))
                limit = self._MAX_RESULTS
        return self._get_n_results(query, limit)

    def _get_n_results(self, query, n):
        """Return a playlist holding at most n results for the query.

        An n of infinity means no limit. Subclasses must override either
        this method or _search_results.
        """
        results = self._search_results(query)
        stop = None if n == float('inf') else n
        limited = itertools.islice(results, 0, stop)
        return self.playlist_result(limited, query, query)

    def _search_results(self, query):
        """Return an iterator of search results; the base version only raises"""
        raise NotImplementedError(
            'This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        """Read-only access to _SEARCH_KEY"""
        return self._SEARCH_KEY