# coding: utf-8
from __future__ import unicode_literals

import base64
import collections
import hashlib
import itertools
import json
import math
import netrc
import os
import random
import re
import sys
import time

from ..compat import (
    compat_cookiejar_Cookie,
    compat_cookies_SimpleCookie,
    compat_etree_Element,
    compat_etree_fromstring,
    compat_expanduser,
    compat_getpass,
    compat_http_client,
    compat_os_name,
    compat_str,
    compat_urllib_error,
    compat_urllib_parse_unquote,
    compat_urllib_parse_urlencode,
    compat_urllib_request,
    compat_urlparse,
    compat_xml_parse_error,
)
from ..downloader import FileDownloader
from ..downloader.f4m import (
    get_base_url,
    remove_encrypted_media,
)
from ..utils import (
    age_restricted,
    base_url,
    bug_reports_message,
    clean_html,
    compiled_regex_type,
    determine_ext,
    determine_protocol,
    dict_get,
    error_to_compat_str,
    extract_attributes,
    ExtractorError,
    fix_xml_ampersands,
    float_or_none,
    format_field,
    GeoRestrictedError,
    GeoUtils,
    int_or_none,
    join_nonempty,
    js_to_json,
    JSON_LD_RE,
    mimetype2ext,
    network_exceptions,
    NO_DEFAULT,
    orderedSet,
    parse_bitrate,
    parse_codecs,
    parse_duration,
    parse_iso8601,
    parse_m3u8_attributes,
    parse_resolution,
    RegexNotFoundError,
    sanitize_filename,
    sanitized_Request,
    str_or_none,
    str_to_int,
    strip_or_none,
    traverse_obj,
    unescapeHTML,
    UnsupportedError,
    unified_strdate,
    unified_timestamp,
    update_Request,
    update_url_query,
    url_basename,
    url_or_none,
    urljoin,
    variadic,
    xpath_element,
    xpath_text,
    xpath_with_ns,
)


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the YoutubeDL. The YoutubeDL processes this information,
    possibly downloading the video to the file system, among other
    possible outcomes.

    The type field determines the type of the result.
    By far the most common value (and the default if _type is missing) is
    "video", which indicates a single video.

    For a video, the dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        The mandatory URL representing the media:
                                     for plain file media - HTTP URL of this file,
                                     for RTMP - RTMP URL,
                                     for HLS - URL of the M3U8 media playlist,
                                     for HDS - URL of the F4M manifest,
                                     for DASH
                                       - HTTP URL to plain file media (in case of
                                         unfragmented media)
                                       - URL of the MPD manifest or base URL
                                         representing the media if MPD manifest
                                         is parsed from a string (in case of
                                         fragmented media)
                                     for MSS - URL of the ISM manifest.
                    * manifest_url
                                 The URL of the manifest file in case of
                                 fragmented media:
                                     for HLS - URL of the M3U8 master playlist,
                                     for HDS - URL of the F4M manifest,
                                     for DASH - URL of the MPD manifest,
                                     for MSS - URL of the ISM manifest.
                    * ext        Will be calculated from URL if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note  Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * dynamic_range  The dynamic range of the video. One of:
                                 "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * fps        Frame rate
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp", "rtmp_ffmpeg", "rtmpe",
                                 "m3u8", "m3u8_native" or "http_dash_segments".
                    * fragment_base_url
                                 Base URL for fragments. Each fragment's path
                                 value (if present) will be relative to
                                 this URL.
                    * fragments  A list of fragments of a fragmented media.
                                 Each fragment entry must contain either a url
                                 or a path. If a url is present, it should be
                                 used by the client. Otherwise, both path and
                                 fragment_base_url must be present. Here is
                                 the list of all potential fields:
                                 * "url" - fragment's URL
                                 * "path" - fragment's path relative to
                                            fragment_base_url
                                 * "duration" (optional, int or float)
                                 * "filesize" (optional, int)
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                                 < -1000 to hide the format (if there is
                                 another one which is strictly better)
                    * language   Language code, e.g. "de" or "en-US".
                    * language_preference  Is this in the language mentioned in
                                 the URL?
                                 10 if it's what the URL is about,
                                 -1 for default (don't know),
                                 -10 otherwise, other values reserved for now.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * source_preference  Order number for this video source
                                 (quality takes higher priority)
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * http_headers  A dictionary of additional HTTP headers
                                 to add to the request.
                    * stretched_ratio  If given and not 1, indicates that the
                                 video's pixels are not square.
                                 width : height ratio as float.
                    * no_resume  The server does not support resuming the
                                 (HTTP or RTMP) download. Boolean.
                    * has_drm    The format has DRM and cannot be downloaded. Boolean
                    * downloader_options  A dictionary of downloader options as
                                 described in FileDownloader
                    RTMP formats can also have the additional fields: page_url,
                    app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
                    rtmp_protocol, rtmp_real_time

    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    alt_title:      A secondary title of the video.
    display_id:     An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "id" (optional, string) - Thumbnail format ID
                        * "url"
                        * "preference" (optional, int) - quality of the image
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
                        * "filesize" (optional, int)
    thumbnail:      Full URL to a video thumbnail image.
    description:    Full video description.
    uploader:       Full name of the video uploader.
    license:        License name the video is licensed under.
    creator:        The creator of the video.
    release_timestamp: UNIX timestamp of the moment the video was released.
    release_date:   The date (YYYYMMDD) when the video was released.
    timestamp:      UNIX timestamp of the moment the video was uploaded
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp.
    uploader_id:    Nickname or id of the video uploader.
    uploader_url:   Full URL to a personal webpage of the video uploader.
    channel:        Full name of the channel the video is uploaded on.
                    Note that channel fields may or may not repeat uploader
                    fields. This depends on a particular extractor.
    channel_id:     Id of the channel.
    channel_url:    Full URL to a channel webpage.
    location:       Physical location where the video was filmed.
    subtitles:      The available subtitles as a dictionary in the format
                    {tag: subformats}. "tag" is usually a language code, and
                    "subformats" is a list sorted from lower to higher
                    preference, each element is a dictionary with the "ext"
                    entry and one of:
                        * "data": The subtitles file contents
                        * "url": A URL pointing to the subtitles file
                    It can optionally also have:
                        * "name": Name or description of the subtitles
                    "ext" will be calculated from URL if missing
    automatic_captions: Like 'subtitles'; contains automatically generated
                    captions instead of normal subtitles
    duration:       Length of the video in seconds, as an integer or float.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
    comment_count:  Number of comments on the video
    comments:       A list of comments, each with one or more of the following
                    properties (all optional, except that at least one of
                    text or html must be present):
                        * "author" - human-readable name of the comment author
                        * "author_id" - user ID of the comment author
                        * "author_thumbnail" - The thumbnail of the comment author
                        * "id" - Comment ID
                        * "html" - Comment as HTML
                        * "text" - Plain text of the comment
                        * "timestamp" - UNIX timestamp of comment
                        * "parent" - ID of the comment this one is replying to.
                                     Set to "root" to indicate that this is a
                                     comment to the original video.
                        * "like_count" - Number of positive ratings of the comment
                        * "dislike_count" - Number of negative ratings of the comment
                        * "is_favorited" - Whether the comment is marked as
                                           favorite by the video uploader
                        * "author_is_uploader" - Whether the comment is made by
                                                 the video uploader
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage, if given to yt-dlp it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]
    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
    cast:           A list of the video cast
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.
    was_live:       True, False, or None (=unknown). Whether this video was
                    originally a live stream.
    live_status:    'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
                    If absent, automatically set from is_live, was_live
    start_time:     Time in seconds where the reproduction should start, as
                    specified in the URL.
    end_time:       Time in seconds where the reproduction should end, as
                    specified in the URL.
    chapters:       A list of dictionaries, with the following entries:
                        * "start_time" - The start time of the chapter in seconds
                        * "end_time" - The end time of the chapter in seconds
                        * "title" (optional, string)
    playable_in_embed: Whether this video is allowed to play in embedded
                    players on other sites. Can be True (=always allowed),
                    False (=never allowed), None (=unknown), or a string
                    specifying the criteria for embedability (e.g. 'whitelist')
    availability:   Under what condition the video is available. One of
                    'private', 'premium_only', 'subscriber_only', 'needs_auth',
                    'unlisted' or 'public'. Use 'InfoExtractor._availability'
                    to set it
    __post_extractor: A function to be called just before the metadata is
                    written to either disk, logger or console. The function
                    must return a dict which will be added to the info_dict.
                    This is useful for additional information that is
                    time-consuming to extract. Note that the fields thus
                    extracted will not be available to output template and
                    match_filter. So, only "comments" and "comment_count" are
                    currently allowed to be extracted via this method.

    The following fields should only be used when the video belongs to some logical
    chapter or section:

    chapter:        Name or title of the chapter the video belongs to.
    chapter_number: Number of the chapter the video belongs to, as an integer.
    chapter_id:     Id of the chapter the video belongs to, as a unicode string.

    The following fields should only be used when the video is an episode of some
    series, programme or podcast:

    series:         Title of the series or programme the video episode belongs to.
    series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
    season:         Title of the season the video episode belongs to.
    season_number:  Number of the season the video episode belongs to, as an integer.
    season_id:      Id of the season the video episode belongs to, as a unicode string.
    episode:        Title of the video episode. Unlike the mandatory video title field,
                    this field should denote the exact title of the video episode
                    without any kind of decoration.
    episode_number: Number of the video episode within a season, as an integer.
    episode_id:     Id of the video episode, as a unicode string.

    The following fields should only be used when the media is a track or a part of
    a music album:

    track:          Title of the track.
    track_number:   Number of the track within an album or a disc, as an integer.
    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
                    as a unicode string.
    artist:         Artist(s) of the track.
    genre:          Genre(s) of the track.
    album:          Title of the album the track belongs to.
    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
    album_artist:   List of all artists that appeared on the album (e.g.
                    "Ash Borer / Fell Voices" or "Various Artists", useful for splits
                    and compilations).
    disc_number:    Number of the disc or other physical medium the track belongs to,
                    as an integer.
    release_year:   Year (YYYY) when the album was released.

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.
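
    For illustration, a minimal single-video result could thus look like
    (the values here are hypothetical):

        {
            'id': '4234987',
            'title': 'Dancing naked mole rats',
            'url': 'https://example.com/media/4234987.mp4',
            'ext': 'mp4',
        }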


    _type "playlist" indicates multiple videos.
    There must be a key "entries", which is a list, an iterable, or a PagedList
    object, each element of which is a valid dictionary by this specification.

    Additionally, playlists can have "id", "title", and any other relevant
    attributes with the same semantics as videos (see above).


    _type "multi_video" indicates that there are multiple videos that
    form a single show, for example, multiple acts of an opera or TV episode.
    It must have an entries key like a playlist and contain all the keys
    required for a video at the same time.


    _type "url" indicates that the video must be extracted from another
    location, possibly by a different extractor. Its only required key is:
    "url" - the next URL to extract.
    The key "ie_key" can be set to the class name (minus the trailing "IE",
    e.g. "Youtube") if the extractor class is known in advance.
    Additionally, the dictionary may have any properties of the resolved entity
    known in advance, for example "title" if the title of the referred video is
    known ahead of time.


    _type "url_transparent" entities have the same specification as "url", but
    indicate that the given additional information is more precise than the one
    associated with the resolved URL.
    This is useful when a site employs a video service that hosts the video and
    its technical metadata, but that video service does not embed a useful
    title, description etc.


    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    Subclasses may also override suitable() if necessary, but ensure the function
    signature is preserved and that this function imports everything it needs
    (except other extractors), so that lazy_extractors works correctly.

    The _GEO_BYPASS attribute may be set to False in order to disable
    geo restriction bypass mechanisms for a particular extractor.
    Though it won't disable explicit geo restriction bypass based on
    country code provided with geo_bypass_country.

    The _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
    countries for this extractor. One of these countries will be used by
    the geo restriction bypass mechanism right away in order to bypass
    geo restriction, of course, if the mechanism is not disabled.

    The _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
    IP blocks in CIDR notation for this extractor. One of these IP blocks
    will be used by the geo restriction bypass mechanism similarly
    to _GEO_COUNTRIES.

    The _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _x_forwarded_for_ip = None
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    _WORKING = True

    _LOGIN_HINTS = {
        'any': 'Use --cookies, --username and --password, or --netrc to provide account credentials',
        'cookies': (
            'Use --cookies-from-browser or --cookies for the authentication. '
            'See https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl for how to manually pass cookies'),
        'password': 'Use --username and --password, or --netrc to provide account credentials',
    }

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader (a YoutubeDL instance).
        If a downloader is not passed during initialization,
        it must be set using "set_downloader()" before "extract()" is called"""
        self._ready = False
        self._x_forwarded_for_ip = None
        self._printed_messages = set()
        self.set_downloader(downloader)

    @classmethod
    def _match_valid_url(cls, url):
        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            if '_VALID_URL' not in cls.__dict__:
                cls._VALID_URL = cls._make_valid_url()
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # This function must import everything it needs (except other extractors),
        # so that lazy_extractors works correctly
        return cls._match_valid_url(url) is not None

    @classmethod
    def _match_id(cls, url):
        return cls._match_valid_url(url).group('id')

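    # Illustrative sketch (not part of this class): a typical subclass defines
    # _VALID_URL with a named "id" group, which suitable() and _match_id() rely on:
    #
    #     class ExampleIE(InfoExtractor):
    #         _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>[0-9]+)'
    #
    #     ExampleIE.suitable('https://example.com/watch/42')   # True
    #     ExampleIE._match_id('https://example.com/watch/42')  # '42'
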
    @classmethod
    def get_temp_id(cls, url):
        try:
            return cls._match_id(url)
        except (IndexError, AttributeError):
            return None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._printed_messages = set()
        self._initialize_geo_bypass({
            'countries': self._GEO_COUNTRIES,
            'ip_blocks': self._GEO_IP_BLOCKS,
        })
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def _initialize_geo_bypass(self, geo_bypass_context):
        """
        Initialize the geo restriction bypass mechanism.

        This method is used to initialize the geo bypass mechanism based on
        faking the X-Forwarded-For HTTP header. A random country from the
        provided country list is selected and a random IP belonging to this
        country is generated. This IP will be passed as the X-Forwarded-For
        HTTP header in all subsequent HTTP requests.

        This method is used for the initial geo bypass mechanism initialization
        during instance initialization with _GEO_COUNTRIES and
        _GEO_IP_BLOCKS.

        You may also manually call it from an extractor's code if geo bypass
        information is not available beforehand (e.g. obtained during
        extraction) or due to some other reason. In this case you should pass
        this information in the geo bypass context passed as the first
        argument. It may contain the following fields:

        countries:  List of geo unrestricted countries (similar
                    to _GEO_COUNTRIES)
        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                    (similar to _GEO_IP_BLOCKS)

        """
        if not self._x_forwarded_for_ip:

            # Geo bypass mechanism is explicitly disabled by user
            if not self.get_param('geo_bypass', True):
                return

            if not geo_bypass_context:
                geo_bypass_context = {}

            # Backward compatibility: previously _initialize_geo_bypass
            # expected a list of countries, some 3rd party code may still use
            # it this way
            if isinstance(geo_bypass_context, (list, tuple)):
                geo_bypass_context = {
                    'countries': geo_bypass_context,
                }

            # The whole point of geo bypass mechanism is to fake IP
            # as X-Forwarded-For HTTP header based on some IP block or
            # country code.

            # Path 1: bypassing based on IP block in CIDR notation

            # Explicit IP block specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            ip_block = self.get_param('geo_bypass_ip_block', None)

            # Otherwise use random IP block from geo bypass context but only
            # if extractor is known as geo bypassable
            if not ip_block:
                ip_blocks = geo_bypass_context.get('ip_blocks')
                if self._GEO_BYPASS and ip_blocks:
                    ip_block = random.choice(ip_blocks)

            if ip_block:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
                # write_debug already adds the '[debug] ' prefix
                self._downloader.write_debug(
                    'Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
                return

            # Path 2: bypassing based on country code

            # Explicit country code specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            country = self.get_param('geo_bypass_country', None)

            # Otherwise use random country code from geo bypass context but
            # only if extractor is known as geo bypassable
            if not country:
                countries = geo_bypass_context.get('countries')
                if self._GEO_BYPASS and countries:
                    country = random.choice(countries)

            if country:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
                self._downloader.write_debug(
                    'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))

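    # Illustrative sketch (not part of this class): an extractor that only learns
    # the unrestricted countries during extraction could call, for example:
    #
    #     self._initialize_geo_bypass({
    #         'countries': ['US', 'DE'],
    #         'ip_blocks': ['192.0.2.0/24'],  # hypothetical CIDR block
    #     })
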
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        try:
            for _ in range(2):
                try:
                    self.initialize()
                    self.write_debug('Extracting URL: %s' % url)
                    ie_result = self._real_extract(url)
                    if ie_result is None:
                        return None
                    if self._x_forwarded_for_ip:
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    subtitles = ie_result.get('subtitles')
                    if (subtitles and 'live_chat' in subtitles
                            and 'no-live-chat' in self.get_param('compat_opts', [])):
                        del subtitles['live_chat']
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except UnsupportedError:
            raise
        except ExtractorError as e:
            kwargs = {
                'video_id': e.video_id or self.get_temp_id(url),
                'ie': self.IE_NAME,
                'tb': e.traceback,
                'expected': e.expected,
                'cause': e.cause
            }
            if hasattr(e, 'countries'):
                kwargs['countries'] = e.countries
            raise type(e)(e.msg, **kwargs)
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))

    def __maybe_fake_ip_and_retry(self, countries):
        if (not self.get_param('geo_bypass_country', None)
                and self._GEO_BYPASS
                and self.get_param('geo_bypass', True)
                and not self._x_forwarded_for_ip
                and countries):
            country_code = random.choice(countries)
            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
            if self._x_forwarded_for_ip:
                self.report_warning(
                    'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
                    % (self._x_forwarded_for_ip, country_code.upper()))
                return True
        return False

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        return compat_str(type(self).__name__[:-2])

    @staticmethod
    def __can_accept_status_code(err, expected_status):
        assert isinstance(err, compat_urllib_error.HTTPError)
        if expected_status is None:
            return False
        elif callable(expected_status):
            return expected_status(err.code) is True
        else:
            return err.code in variadic(expected_status)

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
        """
        Return the response handle.

        See _download_webpage docstring for arguments specification.
        """
        if not self._downloader._first_webpage_request:
            sleep_interval = self.get_param('sleep_interval_requests') or 0
            if sleep_interval > 0:
                self.to_screen('Sleeping %s seconds ...' % sleep_interval)
                time.sleep(sleep_interval)
        else:
            self._downloader._first_webpage_request = False

        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen('%s' % (note,))
            else:
                self.to_screen('%s: %s' % (video_id, note))

        # Some sites check X-Forwarded-For HTTP header in order to figure out
        # the origin of the client behind proxy. This allows bypassing geo
        # restriction by faking this header's value to IP that belongs to some
        # geo unrestricted country. We will do so once we encounter any
        # geo restriction error.
        if self._x_forwarded_for_ip:
            if 'X-Forwarded-For' not in headers:
                headers['X-Forwarded-For'] = self._x_forwarded_for_ip

        if isinstance(url_or_request, compat_urllib_request.Request):
            url_or_request = update_Request(
                url_or_request, data=data, headers=headers, query=query)
        else:
            if query:
                url_or_request = update_url_query(url_or_request, query)
            if data is not None or headers:
                url_or_request = sanitized_Request(url_or_request, data, headers)
        try:
            return self._downloader.urlopen(url_or_request)
        except network_exceptions as err:
            if isinstance(err, compat_urllib_error.HTTPError):
                if self.__can_accept_status_code(err, expected_status):
                    # Retain reference to error to prevent file object from
                    # being closed before it can be read. Works around the
                    # effects of <https://bugs.python.org/issue15002>
                    # introduced in Python 3.4.1.
                    err.fp._error = err
                    return err.fp

            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
        """
        Return a tuple (page content as string, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
        if urlh is False:
            assert not fatal
            return False
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
        return (content, urlh)

    @staticmethod
    def _guess_encoding_from_content(content_type, webpage_bytes):
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'

        return encoding

    def __check_blocked(self, content):
        first_block = content[:512]
        if ('<title>Access to this site is blocked</title>' in content
                and 'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
                and 'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)

    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self.get_param('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self.get_param('write_pages', False):
            basen = '%s_%s' % (video_id, urlh.geturl())
            trim_length = self.get_param('trim_file_name') or 240
            if len(basen) > trim_length:
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:trim_length - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content

    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=5, encoding=None, data=None,
            headers={}, query={}, expected_status=None):
        """
        Return the data of the page as a string.

        Arguments:
        url_or_request -- plain text URL as a string or
            a compat_urllib_request.Request object
        video_id -- Video/playlist/item identifier (string)

        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractorError to be raised,
            otherwise a warning will be reported and extraction continued
        tries -- number of tries
        timeout -- sleep interval between tries
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows accepting failed HTTP requests (non 2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                  accept
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx)
            which are always accepted.
        """

        success = False
        try_count = 0
        while success is False:
            try:
                res = self._download_webpage_handle(
                    url_or_request, video_id, note, errnote, fatal,
                    encoding=encoding, data=data, headers=headers, query=query,
                    expected_status=expected_status)
                success = True
            except compat_http_client.IncompleteRead as e:
                try_count += 1
                if try_count >= tries:
                    raise e
                self._sleep(timeout, video_id)
        if res is False:
            return res
        else:
            content, _ = res
            return content

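    # Illustrative sketch (not part of this class): accepting a non-2xx response,
    # e.g. a page that returns its error details with HTTP 404:
    #
    #     webpage = self._download_webpage(url, video_id, expected_status=404)
    #     # or, with a callable:
    #     webpage = self._download_webpage(
    #         url, video_id, expected_status=lambda status: 400 <= status < 500)
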
    def _download_xml_handle(
            self, url_or_request, video_id, note='Downloading XML',
            errnote='Unable to download XML', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (xml as a compat_etree_Element, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        xml_string, urlh = res
        return self._parse_xml(
            xml_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_xml(
            self, url_or_request, video_id,
            note='Downloading XML', errnote='Unable to download XML',
            transform_source=None, fatal=True, encoding=None,
            data=None, headers={}, query={}, expected_status=None):
        """
        Return the XML as a compat_etree_Element.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_xml_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            xml_string = transform_source(xml_string)
        try:
            return compat_etree_fromstring(xml_string.encode('utf-8'))
        except compat_xml_parse_error as ve:
            errmsg = '%s: Failed to parse XML ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

    def _download_json_handle(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (JSON object, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        json_string, urlh = res
        return self._parse_json(
            json_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_json(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return the JSON object as a dict.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_json_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

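    # Illustrative sketch (not part of this class): a typical API call; the
    # endpoint and parameters here are hypothetical:
    #
    #     data = self._download_json(
    #         'https://example.com/api/video', video_id,
    #         query={'id': video_id}, headers={'Accept': 'application/json'},
    #         note='Downloading video metadata')
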
    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            json_string = transform_source(json_string)
        try:
            return json.loads(json_string)
        except ValueError as ve:
            errmsg = '%s: Failed to parse JSON ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

    def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
        return self._parse_json(
            data[data.find('{'):data.rfind('}') + 1],
            video_id, transform_source, fatal)

    def _download_socket_json_handle(
            self, url_or_request, video_id, note='Polling socket',
            errnote='Unable to poll socket', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (JSON object, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        webpage, urlh = res
        return self._parse_socket_response_as_json(
            webpage, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_socket_json(
            self, url_or_request, video_id, note='Polling socket',
            errnote='Unable to poll socket', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return the JSON object as a dict.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_socket_json_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
        idstr = format_field(video_id, template='%s: ')
        msg = f'[{self.IE_NAME}] {idstr}{msg}'
        if only_once:
            if f'WARNING: {msg}' in self._printed_messages:
                return
            self._printed_messages.add(f'WARNING: {msg}')
        self._downloader.report_warning(msg, *args, **kwargs)

    def to_screen(self, msg, *args, **kwargs):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)

    def write_debug(self, msg, *args, **kwargs):
        self._downloader.write_debug('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)

    def get_param(self, name, default=None, *args, **kwargs):
        if self._downloader:
            return self._downloader.params.get(name, default, *args, **kwargs)
        return default

    def report_drm(self, video_id, partial=False):
        self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen('%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen('%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')

    def raise_login_required(
            self, msg='This video is only available for registered users',
            metadata_available=False, method='any'):
        if metadata_available and (
                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
            self.report_warning(msg)
            return
        if method is not None:
            msg = '%s. %s' % (msg, self._LOGIN_HINTS[method])
        raise ExtractorError(msg, expected=True)

    def raise_geo_restricted(
            self, msg='This video is not available from your location due to geo restriction',
            countries=None, metadata_available=False):
        if metadata_available and (
                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
            self.report_warning(msg)
        else:
            raise GeoRestrictedError(msg, countries=countries)

    def raise_no_formats(self, msg, expected=False, video_id=None):
        if expected and (
                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
            self.report_warning(msg, video_id)
        elif isinstance(msg, ExtractorError):
            raise msg
        else:
            raise ExtractorError(msg, expected=expected, video_id=video_id)

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None, video_title=None, **kwargs):
        """Returns a URL that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        video_info.update(kwargs)
        if video_id is not None:
            video_info['id'] = video_id
        if video_title is not None:
            video_info['title'] = video_title
        return video_info

    def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
        urls = orderedSet(
            self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
            for m in matches)
        return self.playlist_result(
            urls, playlist_id=playlist_id, playlist_title=playlist_title)

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        video_info.update(kwargs)
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        if playlist_description is not None:
            video_info['description'] = playlist_description
        return video_info

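    # Illustrative sketch (not part of this class): wrapping matched URLs into a
    # playlist result; the identifiers and URL pattern here are hypothetical:
    #
    #     entries = [self.url_result('https://example.com/watch/%s' % vid, ie='Example')
    #                for vid in video_ids]
    #     return self.playlist_result(entries, playlist_id, playlist_title)
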
    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single pattern or
        a list of patterns, returning the first matching group.
        In case of failure return a default value, report a warning, or raise
        a RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)

        if mobj:
            if group is None:
                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
            elif isinstance(group, (list, tuple)):
                return tuple(mobj.group(g) for g in group)
            else:
                return mobj.group(group)
        elif default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract %s' % _name)
        else:
            self.report_warning('unable to extract %s' % _name + bug_reports_message())
            return None

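    # Illustrative sketch (not part of this class): pulling a named group out of
    # a webpage, with a default so extraction continues on failure (the regex
    # shown is hypothetical):
    #
    #     video_id = self._search_regex(
    #         r'data-video-id="(?P<id>\d+)"', webpage, 'video id',
    #         group='id', default=None)
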
    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_netrc_login_info(self, netrc_machine=None):
        username = None
        password = None
        netrc_machine = netrc_machine or self._NETRC_MACHINE

        if self.get_param('usenetrc', False):
            try:
                netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
                if os.path.isdir(netrc_file):
                    netrc_file = os.path.join(netrc_file, '.netrc')
                info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError(
                        'No authenticators for %s' % netrc_machine)
            except (IOError, netrc.NetrcParseError) as err:
                self.report_warning(
                    'parsing .netrc: %s' % error_to_compat_str(err))

        return username, password

    def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
        """
        Get the login info as (username, password).
        First look for the manually specified credentials using username_option
        and password_option as keys in the params dictionary. If no such
        credentials are available, look in the netrc file using the
        netrc_machine or _NETRC_MACHINE value.
        If there's no info available, return (None, None).
        """

        # Attempt to use provided username and password or .netrc data
        username = self.get_param(username_option)
        if username is not None:
            password = self.get_param(password_option)
        else:
            username, password = self._get_netrc_login_info(netrc_machine)

        return username, password

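    # Illustrative example (standard netrc format, hypothetical values): a line
    # in ~/.netrc that _get_netrc_login_info would pick up for machine 'example':
    #
    #     machine example login myaccount@example.com password mypassword
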
    def _get_tfa_info(self, note='two-factor verification code'):
        """
        Get the two-factor authentication info
        TODO - asking the user will be required for sms/phone verify
        currently just uses the command line option
        If there's no info available, return None
        """

        tfa = self.get_param('twofactor')
        if tfa is not None:
            return tfa

        return compat_getpass('Type %s and press [Return]: ' % note)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
        property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
                       % {'prop': re.escape(prop)})
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    @staticmethod
    def _meta_regex(prop):
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        prop = variadic(prop)
        if name is None:
            name = 'OpenGraph %s' % prop[0]
        og_regexes = []
        for p in prop:
            og_regexes.extend(self._og_regexes(p))
        escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

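    # Illustrative sketch (not part of this class): given markup such as
    # <meta property="og:title" content="Some title">, the OpenGraph helpers
    # defined below would be used as:
    #
    #     title = self._og_search_title(webpage)
    #     thumbnail = self._og_search_thumbnail(webpage)
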
    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _og_search_url(self, html, **kargs):
        return self._og_search_property('url', html, **kargs)

    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        name = variadic(name)
        if display_name is None:
            display_name = name[0]
        return self._html_search_regex(
            [self._meta_regex(n) for n in name],
            html, display_name, fatal=fatal, group='content', **kwargs)

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower())

    def _family_friendly_search(self, html):
        # See http://schema.org/VideoObject
        family_friendly = self._html_search_meta(
            'isFamilyFriendly', html, default=None)

        if not family_friendly:
            return None

        RATING_TABLE = {
            '1': 0,
            'true': 0,
            '0': 18,
            'false': 18,
        }
        return RATING_TABLE.get(family_friendly.lower())

    def _twitter_search_player(self, html):
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')

    def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
        json_ld_list = list(re.finditer(JSON_LD_RE, html))
        default = kwargs.get('default', NO_DEFAULT)
        # JSON-LD may be malformed and thus `fatal` should be respected.
        # At the same time `default` may be passed that assumes `fatal=False`
        # for _search_regex. Let's simulate the same behavior here as well.
        fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
        json_ld = []
        for mobj in json_ld_list:
            json_ld_item = self._parse_json(
                mobj.group('json_ld'), video_id, fatal=fatal)
            if not json_ld_item:
                continue
            if isinstance(json_ld_item, dict):
                json_ld.append(json_ld_item)
            elif isinstance(json_ld_item, (list, tuple)):
                json_ld.extend(json_ld_item)
        if json_ld:
            json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
        if json_ld:
            return json_ld
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract JSON-LD')
        else:
            self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
            return {}

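    # Illustrative sketch (not part of this class): a page embedding
    # <script type="application/ld+json">{"@context": "https://schema.org",
    #  "@type": "VideoObject", "name": "...", "uploadDate": "..."}</script>
    # could be handled with:
    #
    #     info = self._search_json_ld(webpage, video_id, expected_type='VideoObject')
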
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            json_ld = [json_ld]

        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def extract_interaction_type(e):
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            interaction_statistic = e.get('interactionStatistic')
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not isinstance(is_e, dict):
                    continue
                if is_e.get('@type') != 'InteractionCounter':
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For the interaction count some sites provide a string instead
                # of an integer (as per spec) with non-digit characters
                # (e.g. ","), so extract the count with the more relaxed str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_video_object(e):
            assert e['@type'] == 'VideoObject'
            author = e.get('author')
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of 'Organization' or 'Person' types.
                # both types can have 'name' property (inherited from 'Thing' type). [1]
                # however some websites are using 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
                'filesize': float_or_none(e.get('contentSize')),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })
            extract_interaction_statistic(e)

1454 def traverse_json_ld(json_ld, at_top_level=True):
1455 for e in json_ld:
1456 if at_top_level and '@context' not in e:
1457 continue
1458 if at_top_level and set(e.keys()) == {'@context', '@graph'}:
1459 traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
1460 break
1461 item_type = e.get('@type')
1462 if expected_type is not None and expected_type != item_type:
1463 continue
1464 rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
1465 if rating is not None:
1466 info['average_rating'] = rating
1467 if item_type in ('TVEpisode', 'Episode'):
1468 episode_name = unescapeHTML(e.get('name'))
1469 info.update({
1470 'episode': episode_name,
1471 'episode_number': int_or_none(e.get('episodeNumber')),
1472 'description': unescapeHTML(e.get('description')),
1473 })
1474 if not info.get('title') and episode_name:
1475 info['title'] = episode_name
1476 part_of_season = e.get('partOfSeason')
1477 if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1478 info.update({
1479 'season': unescapeHTML(part_of_season.get('name')),
1480 'season_number': int_or_none(part_of_season.get('seasonNumber')),
1481 })
1482 part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1483 if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1484 info['series'] = unescapeHTML(part_of_series.get('name'))
1485 elif item_type == 'Movie':
1486 info.update({
1487 'title': unescapeHTML(e.get('name')),
1488 'description': unescapeHTML(e.get('description')),
1489 'duration': parse_duration(e.get('duration')),
1490 'timestamp': unified_timestamp(e.get('dateCreated')),
1491 })
1492 elif item_type in ('Article', 'NewsArticle'):
1493 info.update({
1494 'timestamp': parse_iso8601(e.get('datePublished')),
1495 'title': unescapeHTML(e.get('headline')),
1496 'description': unescapeHTML(e.get('articleBody') or e.get('description')),
1497 })
1498 elif item_type == 'VideoObject':
1499 extract_video_object(e)
1500 if expected_type is None:
1501 continue
1502 else:
1503 break
1504 video = e.get('video')
1505 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1506 extract_video_object(video)
1507 if expected_type is None:
1508 continue
1509 else:
1510 break
1511 traverse_json_ld(json_ld)
1512
1513 return {k: v for k, v in info.items() if v is not None}
1514
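# A minimal usage sketch with hypothetical values: given
#   json_ld = '''{"@context": "https://schema.org", "@type": "VideoObject",
#                 "name": "Example clip", "duration": "PT1M30S",
#                 "uploadDate": "2021-12-01"}'''
# _json_ld(json_ld, video_id, expected_type='VideoObject') would return
#   {'title': 'Example clip', 'duration': 90, 'timestamp': 1638316800}
# (None-valued fields are filtered out by the final dict comprehension).
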
1515 def _search_nextjs_data(self, webpage, video_id, **kw):
1516 return self._parse_json(
1517 self._search_regex(
1518 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1519 webpage, 'next.js data', **kw),
1520 video_id, **kw)
1521
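# A sketch of the markup this targets (hypothetical page):
#   <script id="__NEXT_DATA__" type="application/json">
#     {"props": {"pageProps": {}}}
#   </script>
# The JSON payload between the tags is extracted and parsed.
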
1522 def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__'):
1523 ''' Parses Nuxt.js metadata. This works as long as the function invoked to produce __NUXT__ is a pure function. '''
1524 # Not all websites use the default "__NUXT__" context name; it can be changed:
1525 # https://stackoverflow.com/questions/67463109/how-to-change-or-hide-nuxt-and-nuxt-keyword-in-page-source
1526 rectx = re.escape(context_name)
1527 js, arg_keys, arg_vals = self._search_regex(
1528 (r'<script>window\.%s=\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.+?)\)\);?</script>' % rectx,
1529 r'%s\(.*?\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)' % rectx),
1530 webpage, context_name, group=['js', 'arg_keys', 'arg_vals'])
1531
1532 args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
1533
1534 for key, val in args.items():
1535 if val in ('undefined', 'void 0'):
1536 args[key] = 'null'
1537
1538 return self._parse_json(js_to_json(js, args), video_id)['data'][0]
1539
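# A sketch of the pattern this targets (hypothetical page):
#   <script>window.__NUXT__=(function(a,b){return {data:[{title:a}]}}("x",1));</script>
# The argument names are zipped with the passed values, js_to_json()
# substitutes them while converting the object literal to JSON, and the
# first element of its 'data' array is returned.
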
1540 @staticmethod
1541 def _hidden_inputs(html):
1542 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1543 hidden_inputs = {}
1544 for input_el in re.findall(r'(?i)(<input[^>]+>)', html):
1545 attrs = extract_attributes(input_el)
1546 if not attrs:
1547 continue
1548 if attrs.get('type') not in ('hidden', 'submit'):
1549 continue
1550 name = attrs.get('name') or attrs.get('id')
1551 value = attrs.get('value')
1552 if name and value is not None:
1553 hidden_inputs[name] = value
1554 return hidden_inputs
1555
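# A minimal sketch with hypothetical markup: for
#   html = '<input type="hidden" name="token" value="abc123">'
# _hidden_inputs(html) returns {'token': 'abc123'}; inputs whose type is
# neither "hidden" nor "submit" are skipped.
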
1556 def _form_hidden_inputs(self, form_id, html):
1557 form = self._search_regex(
1558 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1559 html, '%s form' % form_id, group='form')
1560 return self._hidden_inputs(form)
1561
1562 class FormatSort:
1563 regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
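# Examples of sort strings accepted by the regex above (a sketch):
#   'res'      sort by resolution, higher preferred
#   '+res'     reversed: lower preferred
#   'res:1080' prefer the largest value not exceeding 1080
#   'res~1080' prefer the value closest to 1080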
1564
1565 default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
1566 'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
1567 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases
1568 ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
1569 'height', 'width', 'proto', 'vext', 'abr', 'aext',
1570 'fps', 'fs_approx', 'source', 'id')
1571
1572 settings = {
1573 'vcodec': {'type': 'ordered', 'regex': True,
1574 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1575 'acodec': {'type': 'ordered', 'regex': True,
1576 'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
1577 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
1578 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
1579 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1580 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
1581 'vext': {'type': 'ordered', 'field': 'video_ext',
1582 'order': ('mp4', 'webm', 'flv', '', 'none'),
1583 'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1584 'aext': {'type': 'ordered', 'field': 'audio_ext',
1585 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1586 'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1587 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1588 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
1589 'field': ('vcodec', 'acodec'),
1590 'function': lambda it: int(any(v != 'none' for v in it))},
1591 'ie_pref': {'priority': True, 'type': 'extractor'},
1592 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1593 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1594 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
1595 'quality': {'convert': 'float', 'default': -1},
1596 'filesize': {'convert': 'bytes'},
1597 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1598 'id': {'convert': 'string', 'field': 'format_id'},
1599 'height': {'convert': 'float_none'},
1600 'width': {'convert': 'float_none'},
1601 'fps': {'convert': 'float_none'},
1602 'tbr': {'convert': 'float_none'},
1603 'vbr': {'convert': 'float_none'},
1604 'abr': {'convert': 'float_none'},
1605 'asr': {'convert': 'float_none'},
1606 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
1607
1608 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1609 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1610 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1611 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1612 'res': {'type': 'multiple', 'field': ('height', 'width'),
1613 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
1614
1615 # For compatibility with youtube-dl
1616 'format_id': {'type': 'alias', 'field': 'id'},
1617 'preference': {'type': 'alias', 'field': 'ie_pref'},
1618 'language_preference': {'type': 'alias', 'field': 'lang'},
1619
1620 # Deprecated
1621 'dimension': {'type': 'alias', 'field': 'res'},
1622 'resolution': {'type': 'alias', 'field': 'res'},
1623 'extension': {'type': 'alias', 'field': 'ext'},
1624 'bitrate': {'type': 'alias', 'field': 'br'},
1625 'total_bitrate': {'type': 'alias', 'field': 'tbr'},
1626 'video_bitrate': {'type': 'alias', 'field': 'vbr'},
1627 'audio_bitrate': {'type': 'alias', 'field': 'abr'},
1628 'framerate': {'type': 'alias', 'field': 'fps'},
1629 'protocol': {'type': 'alias', 'field': 'proto'},
1630 'source_preference': {'type': 'alias', 'field': 'source'},
1631 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1632 'filesize_estimate': {'type': 'alias', 'field': 'size'},
1633 'samplerate': {'type': 'alias', 'field': 'asr'},
1634 'video_ext': {'type': 'alias', 'field': 'vext'},
1635 'audio_ext': {'type': 'alias', 'field': 'aext'},
1636 'video_codec': {'type': 'alias', 'field': 'vcodec'},
1637 'audio_codec': {'type': 'alias', 'field': 'acodec'},
1638 'video': {'type': 'alias', 'field': 'hasvid'},
1639 'has_video': {'type': 'alias', 'field': 'hasvid'},
1640 'audio': {'type': 'alias', 'field': 'hasaud'},
1641 'has_audio': {'type': 'alias', 'field': 'hasaud'},
1642 'extractor': {'type': 'alias', 'field': 'ie_pref'},
1643 'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
1644 }
1645
1646 def __init__(self, ie, field_preference):
1647 self._order = []
1648 self.ydl = ie._downloader
1649 self.evaluate_params(self.ydl.params, field_preference)
1650 if ie.get_param('verbose'):
1651 self.print_verbose_info(self.ydl.write_debug)
1652
1653 def _get_field_setting(self, field, key):
1654 if field not in self.settings:
1655 if key in ('forced', 'priority'):
1656 return False
1657 self.ydl.deprecation_warning(
1658 f'Using arbitrary fields ({field}) for format sorting is deprecated '
1659 'and may be removed in a future version')
1660 self.settings[field] = {}
1661 propObj = self.settings[field]
1662 if key not in propObj:
1663 type = propObj.get('type')
1664 if key == 'field':
1665 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1666 elif key == 'convert':
1667 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1668 else:
1669 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1670 propObj[key] = default
1671 return propObj[key]
1672
1673 def _resolve_field_value(self, field, value, convertNone=False):
1674 if value is None:
1675 if not convertNone:
1676 return None
1677 else:
1678 value = value.lower()
1679 conversion = self._get_field_setting(field, 'convert')
1680 if conversion == 'ignore':
1681 return None
1682 if conversion == 'string':
1683 return value
1684 elif conversion == 'float_none':
1685 return float_or_none(value)
1686 elif conversion == 'bytes':
1687 return FileDownloader.parse_bytes(value)
1688 elif conversion == 'order':
1689 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1690 use_regex = self._get_field_setting(field, 'regex')
1691 list_length = len(order_list)
1692 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1693 if use_regex and value is not None:
1694 for i, regex in enumerate(order_list):
1695 if regex and re.match(regex, value):
1696 return list_length - i
1697 return list_length - empty_pos # not in list
1698 else: # not regex or value = None
1699 return list_length - (order_list.index(value) if value in order_list else empty_pos)
1700 else:
1701 if value.isnumeric():
1702 return float(value)
1703 else:
1704 self.settings[field]['convert'] = 'string'
1705 return value
1706
1707 def evaluate_params(self, params, sort_extractor):
1708 self._use_free_order = params.get('prefer_free_formats', False)
1709 self._sort_user = params.get('format_sort', [])
1710 self._sort_extractor = sort_extractor
1711
1712 def add_item(field, reverse, closest, limit_text):
1713 field = field.lower()
1714 if field in self._order:
1715 return
1716 self._order.append(field)
1717 limit = self._resolve_field_value(field, limit_text)
1718 data = {
1719 'reverse': reverse,
1720 'closest': False if limit is None else closest,
1721 'limit_text': limit_text,
1722 'limit': limit}
1723 if field in self.settings:
1724 self.settings[field].update(data)
1725 else:
1726 self.settings[field] = data
1727
1728 sort_list = (
1729 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1730 + (tuple() if params.get('format_sort_force', False)
1731 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1732 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1733
1734 for item in sort_list:
1735 match = re.match(self.regex, item)
1736 if match is None:
1737 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1738 field = match.group('field')
1739 if field is None:
1740 continue
1741 if self._get_field_setting(field, 'type') == 'alias':
1742 alias, field = field, self._get_field_setting(field, 'field')
1743 if alias not in ('format_id', 'preference', 'language_preference'):
1744 self.ydl.deprecation_warning(
1745 f'Format sorting alias {alias} is deprecated '
1746 f'and may be removed in a future version. Please use {field} instead')
1747 reverse = match.group('reverse') is not None
1748 closest = match.group('separator') == '~'
1749 limit_text = match.group('limit')
1750
1751 has_limit = limit_text is not None
1752 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1753 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1754
1755 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1756 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
1757 limit_count = len(limits)
1758 for (i, f) in enumerate(fields):
1759 add_item(f, reverse, closest,
1760 limits[i] if i < limit_count
1761 else limits[0] if has_limit and not has_multiple_limits
1762 else None)
1763
1764 def print_verbose_info(self, write_debug):
1765 if self._sort_user:
1766 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1767 if self._sort_extractor:
1768 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1769 write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1770 '+' if self._get_field_setting(field, 'reverse') else '', field,
1771 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1772 self._get_field_setting(field, 'limit_text'),
1773 self._get_field_setting(field, 'limit'))
1774 if self._get_field_setting(field, 'limit_text') is not None else '')
1775 for field in self._order if self._get_field_setting(field, 'visible')]))
1776
1777 def _calculate_field_preference_from_value(self, format, field, type, value):
1778 reverse = self._get_field_setting(field, 'reverse')
1779 closest = self._get_field_setting(field, 'closest')
1780 limit = self._get_field_setting(field, 'limit')
1781
1782 if type == 'extractor':
1783 maximum = self._get_field_setting(field, 'max')
1784 if value is None or (maximum is not None and value >= maximum):
1785 value = -1
1786 elif type == 'boolean':
1787 in_list = self._get_field_setting(field, 'in_list')
1788 not_in_list = self._get_field_setting(field, 'not_in_list')
1789 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1790 elif type == 'ordered':
1791 value = self._resolve_field_value(field, value, True)
1792
1793 # try to convert to number
1794 val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1795 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1796 if is_num:
1797 value = val_num
1798
1799 return ((-10, 0) if value is None
1800 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
1801 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1802 else (0, value, 0) if not reverse and (limit is None or value <= limit)
1803 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1804 else (-1, value, 0))
1805
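# A sketch of the ordering above: each field yields a tuple that sort()
# compares lexicographically. E.g. with limit=720, closest=True and
# reverse=False, the values 480, 720 and 1080 map to (0, -240, 240),
# (0, 0, 0) and (0, -360, -360) respectively, so 720 ranks best,
# then 480, then 1080.
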
1806 def _calculate_field_preference(self, format, field):
1807 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
1808 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1809 if type == 'multiple':
1810 type = 'field' # Only 'field' is allowed in multiple for now
1811 actual_fields = self._get_field_setting(field, 'field')
1812
1813 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1814 else:
1815 value = get_value(field)
1816 return self._calculate_field_preference_from_value(format, field, type, value)
1817
1818 def calculate_preference(self, format):
1819 # Determine missing protocol
1820 if not format.get('protocol'):
1821 format['protocol'] = determine_protocol(format)
1822
1823 # Determine missing ext
1824 if not format.get('ext') and 'url' in format:
1825 format['ext'] = determine_ext(format['url'])
1826 if format.get('vcodec') == 'none':
1827 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1828 format['video_ext'] = 'none'
1829 else:
1830 format['video_ext'] = format['ext']
1831 format['audio_ext'] = 'none'
1832 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
1833 # format['preference'] = -1000
1834
1835 # Determine missing bitrates
1836 if format.get('tbr') is None:
1837 if format.get('vbr') is not None and format.get('abr') is not None:
1838 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1839 else:
1840 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1841 format['vbr'] = format.get('tbr') - format.get('abr', 0)
1842 if format.get('acodec') != 'none' and format.get('abr') is None:
1843 format['abr'] = format.get('tbr') - format.get('vbr', 0)
1844
1845 return tuple(self._calculate_field_preference(format, field) for field in self._order)
1846
1847 def _sort_formats(self, formats, field_preference=[]):
1848 if not formats:
1849 return
1850 format_sort = self.FormatSort(self, field_preference)
1851 formats.sort(key=lambda f: format_sort.calculate_preference(f))
1852
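# Typical extractor usage (a sketch with hypothetical formats):
#   formats = [{'url': u1, 'height': 720}, {'url': u2, 'height': 1080}]
#   self._sort_formats(formats)           # default field order
#   self._sort_formats(formats, ['res'])  # extractor-supplied order
# The list is sorted in place from worst to best, as YoutubeDL expects.
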
1853 def _check_formats(self, formats, video_id):
1854 if formats:
1855 formats[:] = filter(
1856 lambda f: self._is_valid_url(
1857 f['url'], video_id,
1858 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1859 formats)
1860
1861 @staticmethod
1862 def _remove_duplicate_formats(formats):
1863 format_urls = set()
1864 unique_formats = []
1865 for f in formats:
1866 if f['url'] not in format_urls:
1867 format_urls.add(f['url'])
1868 unique_formats.append(f)
1869 formats[:] = unique_formats
1870
1871 def _is_valid_url(self, url, video_id, item='video', headers={}):
1872 url = self._proto_relative_url(url, scheme='http:')
1873 # For now, assume non-HTTP(S) URLs are always valid
1874 if not (url.startswith('http://') or url.startswith('https://')):
1875 return True
1876 try:
1877 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1878 return True
1879 except ExtractorError as e:
1880 self.to_screen(
1881 '%s: %s URL is invalid, skipping: %s'
1882 % (video_id, item, error_to_compat_str(e.cause)))
1883 return False
1884
1885 def http_scheme(self):
1886 """ Either "http:" or "https:", depending on the user's preferences """
1887 return (
1888 'http:'
1889 if self.get_param('prefer_insecure', False)
1890 else 'https:')
1891
1892 def _proto_relative_url(self, url, scheme=None):
1893 if url is None:
1894 return url
1895 if url.startswith('//'):
1896 if scheme is None:
1897 scheme = self.http_scheme()
1898 return scheme + url
1899 else:
1900 return url
1901
1902 def _sleep(self, timeout, video_id, msg_template=None):
1903 if msg_template is None:
1904 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1905 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1906 self.to_screen(msg)
1907 time.sleep(timeout)
1908
1909 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1910 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1911 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1912 manifest = self._download_xml(
1913 manifest_url, video_id, 'Downloading f4m manifest',
1914 'Unable to download f4m manifest',
1915 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1916 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1917 transform_source=transform_source,
1918 fatal=fatal, data=data, headers=headers, query=query)
1919
1920 if manifest is False:
1921 return []
1922
1923 return self._parse_f4m_formats(
1924 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1925 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1926
1927 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1928 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1929 fatal=True, m3u8_id=None):
1930 if not isinstance(manifest, compat_etree_Element) and not fatal:
1931 return []
1932
1933 # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1934 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1935 if akamai_pv is not None and ';' in akamai_pv.text:
1936 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1937 if playerVerificationChallenge.strip() != '':
1938 return []
1939
1940 formats = []
1941 manifest_version = '1.0'
1942 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1943 if not media_nodes:
1944 manifest_version = '2.0'
1945 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1946 # Remove unsupported DRM protected media from final formats
1947 # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1948 media_nodes = remove_encrypted_media(media_nodes)
1949 if not media_nodes:
1950 return formats
1951
1952 manifest_base_url = get_base_url(manifest)
1953
1954 bootstrap_info = xpath_element(
1955 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1956 'bootstrap info', default=None)
1957
1958 vcodec = None
1959 mime_type = xpath_text(
1960 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1961 'mime type', default=None)
1962 if mime_type and mime_type.startswith('audio/'):
1963 vcodec = 'none'
1964
1965 for i, media_el in enumerate(media_nodes):
1966 tbr = int_or_none(media_el.attrib.get('bitrate'))
1967 width = int_or_none(media_el.attrib.get('width'))
1968 height = int_or_none(media_el.attrib.get('height'))
1969 format_id = join_nonempty(f4m_id, tbr or i)
1970 # If <bootstrapInfo> is present, the specified f4m is a
1971 # stream-level manifest, and only set-level manifests may refer to
1972 # external resources. See section 11.4 and section 4 of F4M spec
1973 if bootstrap_info is None:
1974 media_url = None
1975 # @href is introduced in 2.0, see section 11.6 of F4M spec
1976 if manifest_version == '2.0':
1977 media_url = media_el.attrib.get('href')
1978 if media_url is None:
1979 media_url = media_el.attrib.get('url')
1980 if not media_url:
1981 continue
1982 manifest_url = (
1983 media_url if media_url.startswith('http://') or media_url.startswith('https://')
1984 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1985 # If media_url is itself an f4m manifest, do the recursive extraction,
1986 # since bitrates in the parent manifest (this one) and the media_url
1987 # manifest may differ, making it impossible to resolve the format by
1988 # the requested bitrate in the f4m downloader
1989 ext = determine_ext(manifest_url)
1990 if ext == 'f4m':
1991 f4m_formats = self._extract_f4m_formats(
1992 manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1993 transform_source=transform_source, fatal=fatal)
1994 # Sometimes a stream-level manifest contains a single media entry that
1995 # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1996 # At the same time, the parent's media entry in the set-level manifest may
1997 # contain it. We copy it from the parent in such cases.
1998 if len(f4m_formats) == 1:
1999 f = f4m_formats[0]
2000 f.update({
2001 'tbr': f.get('tbr') or tbr,
2002 'width': f.get('width') or width,
2003 'height': f.get('height') or height,
2004 'format_id': f.get('format_id') if not tbr else format_id,
2005 'vcodec': vcodec,
2006 })
2007 formats.extend(f4m_formats)
2008 continue
2009 elif ext == 'm3u8':
2010 formats.extend(self._extract_m3u8_formats(
2011 manifest_url, video_id, 'mp4', preference=preference,
2012 quality=quality, m3u8_id=m3u8_id, fatal=fatal))
2013 continue
2014 formats.append({
2015 'format_id': format_id,
2016 'url': manifest_url,
2017 'manifest_url': manifest_url,
2018 'ext': 'flv' if bootstrap_info is not None else None,
2019 'protocol': 'f4m',
2020 'tbr': tbr,
2021 'width': width,
2022 'height': height,
2023 'vcodec': vcodec,
2024 'preference': preference,
2025 'quality': quality,
2026 })
2027 return formats
2028
2029 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
2030 return {
2031 'format_id': join_nonempty(m3u8_id, 'meta'),
2032 'url': m3u8_url,
2033 'ext': ext,
2034 'protocol': 'm3u8',
2035 'preference': preference - 100 if preference else -100,
2036 'quality': quality,
2037 'resolution': 'multiple',
2038 'format_note': 'Quality selection URL',
2039 }
2040
2041 def _report_ignoring_subs(self, name):
2042 self.report_warning(bug_reports_message(
2043 f'Ignoring subtitle tracks found in the {name} manifest; '
2044 'if any subtitle tracks are missing,'
2045 ), only_once=True)
2046
2047 def _extract_m3u8_formats(self, *args, **kwargs):
2048 fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2049 if subs:
2050 self._report_ignoring_subs('HLS')
2051 return fmts
2052
2053 def _extract_m3u8_formats_and_subtitles(
2054 self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2055 preference=None, quality=None, m3u8_id=None, note=None,
2056 errnote=None, fatal=True, live=False, data=None, headers={},
2057 query={}):
2058
2059 res = self._download_webpage_handle(
2060 m3u8_url, video_id,
2061 note='Downloading m3u8 information' if note is None else note,
2062 errnote='Failed to download m3u8 information' if errnote is None else errnote,
2063 fatal=fatal, data=data, headers=headers, query=query)
2064
2065 if res is False:
2066 return [], {}
2067
2068 m3u8_doc, urlh = res
2069 m3u8_url = urlh.geturl()
2070
2071 return self._parse_m3u8_formats_and_subtitles(
2072 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2073 preference=preference, quality=quality, m3u8_id=m3u8_id,
2074 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2075 headers=headers, query=query, video_id=video_id)
2076
2077 def _parse_m3u8_formats_and_subtitles(
2078 self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8_native',
2079 preference=None, quality=None, m3u8_id=None, live=False, note=None,
2080 errnote=None, fatal=True, data=None, headers={}, query={},
2081 video_id=None):
2082 formats, subtitles = [], {}
2083
2084 has_drm = re.search('|'.join([
2085 r'#EXT-X-FAXS-CM:', # Adobe Flash Access
2086 r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay
2087 ]), m3u8_doc)
2088
2089 def format_url(url):
2090 return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
2091
2092 if self.get_param('hls_split_discontinuity', False):
2093 def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2094 if not m3u8_doc:
2095 if not manifest_url:
2096 return []
2097 m3u8_doc = self._download_webpage(
2098 manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2099 note=False, errnote='Failed to download m3u8 playlist information')
2100 if m3u8_doc is False:
2101 return []
2102 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2103
2104 else:
2105 def _extract_m3u8_playlist_indices(*args, **kwargs):
2106 return [None]
2107
2108 # References:
2109 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2110 # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2111 # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2112
2113 # We should try extracting formats only from master playlists [1, 4.3.4],
2114 # i.e. playlists that describe available qualities. On the other hand,
2115 # media playlists [1, 4.3.3] should be returned as is since they contain
2116 # just the media without quality renditions.
2117 # Fortunately, a master playlist can easily be distinguished from a media
2118 # playlist based on the availability of particular tags. As per [1, 4.3.3, 4.3.4],
2119 # master playlist tags MUST NOT appear in a media playlist and vice versa.
2120 # As per [1, 4.3.3.1], the #EXT-X-TARGETDURATION tag is REQUIRED for every
2121 # media playlist and MUST NOT appear in a master playlist, so we can
2122 # reliably detect a media playlist by this criterion.
2123
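# A sketch of the distinction (hypothetical manifests): a master playlist
# contains lines like
#   #EXT-X-STREAM-INF:BANDWIDTH=1280000,RESOLUTION=640x360
#   low/index.m3u8
# while a media playlist contains
#   #EXT-X-TARGETDURATION:10
#   #EXTINF:9.009,
#   segment0.ts
# and is therefore caught by the #EXT-X-TARGETDURATION check below.
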
2124 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
2125 formats = [{
2126 'format_id': join_nonempty(m3u8_id, idx),
2127 'format_index': idx,
2128 'url': m3u8_url,
2129 'ext': ext,
2130 'protocol': entry_protocol,
2131 'preference': preference,
2132 'quality': quality,
2133 'has_drm': has_drm,
2134 } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2135
2136 return formats, subtitles
2137
2138 groups = {}
2139 last_stream_inf = {}
2140
2141 def extract_media(x_media_line):
2142 media = parse_m3u8_attributes(x_media_line)
2143 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2144 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2145 if not (media_type and group_id and name):
2146 return
2147 groups.setdefault(group_id, []).append(media)
2148 # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2149 if media_type == 'SUBTITLES':
2150 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2151 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2152 # However, lack of URI has been spotted in the wild.
2153 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2154 if not media.get('URI'):
2155 return
2156 url = format_url(media['URI'])
2157 sub_info = {
2158 'url': url,
2159 'ext': determine_ext(url),
2160 }
2161 if sub_info['ext'] == 'm3u8':
2162 # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2163 # files may contain is WebVTT:
2164 # <https://tools.ietf.org/html/rfc8216#section-3.1>
2165 sub_info['ext'] = 'vtt'
2166 sub_info['protocol'] = 'm3u8_native'
2167 lang = media.get('LANGUAGE') or 'und'
2168 subtitles.setdefault(lang, []).append(sub_info)
2169 if media_type not in ('VIDEO', 'AUDIO'):
2170 return
2171 media_url = media.get('URI')
2172 if media_url:
2173 manifest_url = format_url(media_url)
2174 formats.extend({
2175 'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2176 'format_note': name,
2177 'format_index': idx,
2178 'url': manifest_url,
2179 'manifest_url': m3u8_url,
2180 'language': media.get('LANGUAGE'),
2181 'ext': ext,
2182 'protocol': entry_protocol,
2183 'preference': preference,
2184 'quality': quality,
2185 'vcodec': 'none' if media_type == 'AUDIO' else None,
2186 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2187
2188 def build_stream_name():
2189 # Although the specification does not mention a NAME attribute for
2190 # the EXT-X-STREAM-INF tag, it may still sometimes be present (see [1]
2191 # or the vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2192 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2193 stream_name = last_stream_inf.get('NAME')
2194 if stream_name:
2195 return stream_name
2196 # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2197 # from corresponding rendition group
2198 stream_group_id = last_stream_inf.get('VIDEO')
2199 if not stream_group_id:
2200 return
2201 stream_group = groups.get(stream_group_id)
2202 if not stream_group:
2203 return stream_group_id
2204 rendition = stream_group[0]
2205 return rendition.get('NAME') or stream_group_id
2206
2207 # Parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have a
2208 # chance to detect video-only formats when EXT-X-STREAM-INF tags
2209 # precede EXT-X-MEDIA tags in the HLS manifest, such as in [3].
2210 for line in m3u8_doc.splitlines():
2211 if line.startswith('#EXT-X-MEDIA:'):
2212 extract_media(line)
2213
2214 for line in m3u8_doc.splitlines():
2215 if line.startswith('#EXT-X-STREAM-INF:'):
2216 last_stream_inf = parse_m3u8_attributes(line)
2217 elif line.startswith('#') or not line.strip():
2218 continue
2219 else:
2220 tbr = float_or_none(
2221 last_stream_inf.get('AVERAGE-BANDWIDTH')
2222 or last_stream_inf.get('BANDWIDTH'), scale=1000)
2223 manifest_url = format_url(line.strip())
2224
2225 for idx in _extract_m3u8_playlist_indices(manifest_url):
2226 format_id = [m3u8_id, None, idx]
2227 # Bandwidth of live streams may differ over time thus making
2228 # format_id unpredictable. So it's better to keep provided
2229 # format_id intact.
2230 if not live:
2231 stream_name = build_stream_name()
2232 format_id[1] = stream_name or '%d' % (tbr or len(formats))
2233 f = {
2234 'format_id': join_nonempty(*format_id),
2235 'format_index': idx,
2236 'url': manifest_url,
2237 'manifest_url': m3u8_url,
2238 'tbr': tbr,
2239 'ext': ext,
2240 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2241 'protocol': entry_protocol,
2242 'preference': preference,
2243 'quality': quality,
2244 }
2245 resolution = last_stream_inf.get('RESOLUTION')
2246 if resolution:
2247 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2248 if mobj:
2249 f['width'] = int(mobj.group('width'))
2250 f['height'] = int(mobj.group('height'))
2251 # Unified Streaming Platform
2252 mobj = re.search(
2253 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2254 if mobj:
2255 abr, vbr = mobj.groups()
2256 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2257 f.update({
2258 'vbr': vbr,
2259 'abr': abr,
2260 })
2261 codecs = parse_codecs(last_stream_inf.get('CODECS'))
2262 f.update(codecs)
2263 audio_group_id = last_stream_inf.get('AUDIO')
2264 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2265 # references a rendition group MUST have a CODECS attribute.
2266 # However, this is not always respected, for example, [2]
2267 # contains EXT-X-STREAM-INF tag which references AUDIO
2268 # rendition group but does not have CODECS and despite
2269 # referencing an audio group it represents a complete
2270 # (with audio and video) format. So, for such cases we will
2271 # ignore references to rendition groups and treat them
2272 # as complete formats.
2273 if audio_group_id and codecs and f.get('vcodec') != 'none':
2274 audio_group = groups.get(audio_group_id)
2275 if audio_group and audio_group[0].get('URI'):
2276 # TODO: update acodec for audio only formats with
2277 # the same GROUP-ID
2278 f['acodec'] = 'none'
2279 if not f.get('ext'):
2280 f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2281 formats.append(f)
2282
2283 # for DailyMotion
2284 progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2285 if progressive_uri:
2286 http_f = f.copy()
2287 del http_f['manifest_url']
2288 http_f.update({
2289 'format_id': f['format_id'].replace('hls-', 'http-'),
2290 'protocol': 'http',
2291 'url': progressive_uri,
2292 })
2293 formats.append(http_f)
2294
2295 last_stream_inf = {}
2296 return formats, subtitles
2297
2298 def _extract_m3u8_vod_duration(
2299 self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2300
2301 m3u8_vod = self._download_webpage(
2302 m3u8_vod_url, video_id,
2303 note='Downloading m3u8 VOD manifest' if note is None else note,
2304 errnote='Failed to download VOD manifest' if errnote is None else errnote,
2305 fatal=False, data=data, headers=headers, query=query)
2306
2307 return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2308
2309 def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2310 if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2311 return None
2312
2313 return int(sum(
2314 float(line[len('#EXTINF:'):].split(',')[0])
2315 for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2316
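# A sketch with a hypothetical playlist: a VOD manifest containing
#   #EXT-X-PLAYLIST-TYPE:VOD ... #EXTINF:4.5, ... #EXTINF:3.2, ...
# yields int(4.5 + 3.2) == 7 seconds.
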
2317 @staticmethod
2318 def _xpath_ns(path, namespace=None):
2319 if not namespace:
2320 return path
2321 out = []
2322 for c in path.split('/'):
2323 if not c or c == '.':
2324 out.append(c)
2325 else:
2326 out.append('{%s}%s' % (namespace, c))
2327 return '/'.join(out)
2328
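# A sketch (hypothetical namespace):
#   _xpath_ns('./head/meta', 'http://www.w3.org/ns/SMIL')
# returns './{http://www.w3.org/ns/SMIL}head/{http://www.w3.org/ns/SMIL}meta'
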
2329 def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2330 smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2331
2332 if smil is False:
2333 assert not fatal
2334 return []
2335
2336 namespace = self._parse_smil_namespace(smil)
2337
2338 fmts = self._parse_smil_formats(
2339 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2340 subs = self._parse_smil_subtitles(
2341 smil, namespace=namespace)
2342
2343 return fmts, subs
2344
2345 def _extract_smil_formats(self, *args, **kwargs):
2346 fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2347 if subs:
2348 self._report_ignoring_subs('SMIL')
2349 return fmts
2350
2351 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2352 smil = self._download_smil(smil_url, video_id, fatal=fatal)
2353 if smil is False:
2354 return {}
2355 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2356
2357 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2358 return self._download_xml(
2359 smil_url, video_id, 'Downloading SMIL file',
2360 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2361
2362 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2363 namespace = self._parse_smil_namespace(smil)
2364
2365 formats = self._parse_smil_formats(
2366 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2367 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2368
2369 video_id = os.path.splitext(url_basename(smil_url))[0]
2370 title = None
2371 description = None
2372 upload_date = None
2373 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2374 name = meta.attrib.get('name')
2375 content = meta.attrib.get('content')
2376 if not name or not content:
2377 continue
2378 if not title and name == 'title':
2379 title = content
2380 elif not description and name in ('description', 'abstract'):
2381 description = content
2382 elif not upload_date and name == 'date':
2383 upload_date = unified_strdate(content)
2384
2385 thumbnails = [{
2386 'id': image.get('type'),
2387 'url': image.get('src'),
2388 'width': int_or_none(image.get('width')),
2389 'height': int_or_none(image.get('height')),
2390 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2391
2392 return {
2393 'id': video_id,
2394 'title': title or video_id,
2395 'description': description,
2396 'upload_date': upload_date,
2397 'thumbnails': thumbnails,
2398 'formats': formats,
2399 'subtitles': subtitles,
2400 }
2401
2402 def _parse_smil_namespace(self, smil):
2403 return self._search_regex(
2404 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2405
2406 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2407 base = smil_url
2408 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2409 b = meta.get('base') or meta.get('httpBase')
2410 if b:
2411 base = b
2412 break
2413
2414 formats = []
2415 rtmp_count = 0
2416 http_count = 0
2417 m3u8_count = 0
2418 imgs_count = 0
2419
2420 srcs = set()
2421 media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2422 for medium in media:
2423 src = medium.get('src')
2424 if not src or src in srcs:
2425 continue
2426 srcs.add(src)
2427
2428 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2429 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2430 width = int_or_none(medium.get('width'))
2431 height = int_or_none(medium.get('height'))
2432 proto = medium.get('proto')
2433 ext = medium.get('ext')
2434 src_ext = determine_ext(src)
2435 streamer = medium.get('streamer') or base
2436
2437 if proto == 'rtmp' or streamer.startswith('rtmp'):
2438 rtmp_count += 1
2439 formats.append({
2440 'url': streamer,
2441 'play_path': src,
2442 'ext': 'flv',
2443 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2444 'tbr': bitrate,
2445 'filesize': filesize,
2446 'width': width,
2447 'height': height,
2448 })
2449 if transform_rtmp_url:
2450 streamer, src = transform_rtmp_url(streamer, src)
2451 formats[-1].update({
2452 'url': streamer,
2453 'play_path': src,
2454 })
2455 continue
2456
2457 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2458 src_url = src_url.strip()
2459
2460 if proto == 'm3u8' or src_ext == 'm3u8':
2461 m3u8_formats = self._extract_m3u8_formats(
2462 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2463 if len(m3u8_formats) == 1:
2464 m3u8_count += 1
2465 m3u8_formats[0].update({
2466 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2467 'tbr': bitrate,
2468 'width': width,
2469 'height': height,
2470 })
2471 formats.extend(m3u8_formats)
2472 elif src_ext == 'f4m':
2473 f4m_url = src_url
2474 if not f4m_params:
2475 f4m_params = {
2476 'hdcore': '3.2.0',
2477 'plugin': 'flowplayer-3.2.0.1',
2478 }
2479 f4m_url += '&' if '?' in f4m_url else '?'
2480 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2481 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2482 elif src_ext == 'mpd':
2483 formats.extend(self._extract_mpd_formats(
2484 src_url, video_id, mpd_id='dash', fatal=False))
2485 elif re.search(r'\.ism/[Mm]anifest', src_url):
2486 formats.extend(self._extract_ism_formats(
2487 src_url, video_id, ism_id='mss', fatal=False))
2488 elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2489 http_count += 1
2490 formats.append({
2491 'url': src_url,
2492 'ext': ext or src_ext or 'flv',
2493 'format_id': 'http-%d' % (bitrate or http_count),
2494 'tbr': bitrate,
2495 'filesize': filesize,
2496 'width': width,
2497 'height': height,
2498 })
2499
2500 for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2501 src = medium.get('src')
2502 if not src or src in srcs:
2503 continue
2504 srcs.add(src)
2505
2506 imgs_count += 1
2507 formats.append({
2508 'format_id': 'imagestream-%d' % (imgs_count),
2509 'url': src,
2510 'ext': mimetype2ext(medium.get('type')),
2511 'acodec': 'none',
2512 'vcodec': 'none',
2513 'width': int_or_none(medium.get('width')),
2514 'height': int_or_none(medium.get('height')),
2515 'format_note': 'SMIL storyboards',
2516 })
2517
2518 return formats
2519
2520 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2521 urls = []
2522 subtitles = {}
2523 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2524 src = textstream.get('src')
2525 if not src or src in urls:
2526 continue
2527 urls.append(src)
2528 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2529 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2530 subtitles.setdefault(lang, []).append({
2531 'url': src,
2532 'ext': ext,
2533 })
2534 return subtitles
2535
2536 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2537 xspf = self._download_xml(
2538 xspf_url, playlist_id, 'Downloading xspf playlist',
2539 'Unable to download xspf manifest', fatal=fatal)
2540 if xspf is False:
2541 return []
2542 return self._parse_xspf(
2543 xspf, playlist_id, xspf_url=xspf_url,
2544 xspf_base_url=base_url(xspf_url))
2545
2546 def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2547 NS_MAP = {
2548 'xspf': 'http://xspf.org/ns/0/',
2549 's1': 'http://static.streamone.nl/player/ns/0',
2550 }
2551
2552 entries = []
2553 for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2554 title = xpath_text(
2555 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2556 description = xpath_text(
2557 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2558 thumbnail = xpath_text(
2559 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2560 duration = float_or_none(
2561 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2562
2563 formats = []
2564 for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2565 format_url = urljoin(xspf_base_url, location.text)
2566 if not format_url:
2567 continue
2568 formats.append({
2569 'url': format_url,
2570 'manifest_url': xspf_url,
2571 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2572 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2573 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2574 })
2575 self._sort_formats(formats)
2576
2577 entries.append({
2578 'id': playlist_id,
2579 'title': title,
2580 'description': description,
2581 'thumbnail': thumbnail,
2582 'duration': duration,
2583 'formats': formats,
2584 })
2585 return entries
2586
2587 def _extract_mpd_formats(self, *args, **kwargs):
2588 fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2589 if subs:
2590 self._report_ignoring_subs('DASH')
2591 return fmts
2592
2593 def _extract_mpd_formats_and_subtitles(
2594 self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2595 fatal=True, data=None, headers={}, query={}):
2596 res = self._download_xml_handle(
2597 mpd_url, video_id,
2598 note='Downloading MPD manifest' if note is None else note,
2599 errnote='Failed to download MPD manifest' if errnote is None else errnote,
2600 fatal=fatal, data=data, headers=headers, query=query)
2601 if res is False:
2602 return [], {}
2603 mpd_doc, urlh = res
2604 if mpd_doc is None:
2605 return [], {}
2606 mpd_base_url = base_url(urlh.geturl())
2607
2608 return self._parse_mpd_formats_and_subtitles(
2609 mpd_doc, mpd_id, mpd_base_url, mpd_url)
2610
2611 def _parse_mpd_formats(self, *args, **kwargs):
2612 fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2613 if subs:
2614 self._report_ignoring_subs('DASH')
2615 return fmts
2616
2617 def _parse_mpd_formats_and_subtitles(
2618 self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2619 """
2620 Parse formats from MPD manifest.
2621 References:
2622 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2623 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2624 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2625 """
2626 if not self.get_param('dynamic_mpd', True):
2627 if mpd_doc.get('type') == 'dynamic':
2628 return [], {}
2629
2630 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2631
2632 def _add_ns(path):
2633 return self._xpath_ns(path, namespace)
2634
2635 def is_drm_protected(element):
2636 return element.find(_add_ns('ContentProtection')) is not None
2637
2638 def extract_multisegment_info(element, ms_parent_info):
2639 ms_info = ms_parent_info.copy()
2640
2641 # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2642 # common attributes and elements; we extract only those that are
2643 # relevant for us.
2644 def extract_common(source):
2645 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2646 if segment_timeline is not None:
2647 s_e = segment_timeline.findall(_add_ns('S'))
2648 if s_e:
2649 ms_info['total_number'] = 0
2650 ms_info['s'] = []
2651 for s in s_e:
2652 r = int(s.get('r', 0))
2653 ms_info['total_number'] += 1 + r
2654 ms_info['s'].append({
2655 't': int(s.get('t', 0)),
2656 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2657 'd': int(s.attrib['d']),
2658 'r': r,
2659 })
2660 start_number = source.get('startNumber')
2661 if start_number:
2662 ms_info['start_number'] = int(start_number)
2663 timescale = source.get('timescale')
2664 if timescale:
2665 ms_info['timescale'] = int(timescale)
2666 segment_duration = source.get('duration')
2667 if segment_duration:
2668 ms_info['segment_duration'] = float(segment_duration)
2669
2670 def extract_Initialization(source):
2671 initialization = source.find(_add_ns('Initialization'))
2672 if initialization is not None:
2673 ms_info['initialization_url'] = initialization.attrib['sourceURL']
2674
2675 segment_list = element.find(_add_ns('SegmentList'))
2676 if segment_list is not None:
2677 extract_common(segment_list)
2678 extract_Initialization(segment_list)
2679 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2680 if segment_urls_e:
2681 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2682 else:
2683 segment_template = element.find(_add_ns('SegmentTemplate'))
2684 if segment_template is not None:
2685 extract_common(segment_template)
2686 media = segment_template.get('media')
2687 if media:
2688 ms_info['media'] = media
2689 initialization = segment_template.get('initialization')
2690 if initialization:
2691 ms_info['initialization'] = initialization
2692 else:
2693 extract_Initialization(segment_template)
2694 return ms_info
2695
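# A sketch of the SegmentTimeline handling above (hypothetical MPD):
#   <SegmentTimeline><S t="0" d="4000" r="2"/></SegmentTimeline>
# with timescale=1000 yields ms_info['s'] == [{'t': 0, 'd': 4000, 'r': 2}]
# and total_number == 3 (one segment plus r == 2 repeats), i.e. three
# 4-second fragments.
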
2696 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2697 formats, subtitles = [], {}
2698 stream_numbers = collections.defaultdict(int)
2699 for period in mpd_doc.findall(_add_ns('Period')):
2700 period_duration = parse_duration(period.get('duration')) or mpd_duration
2701 period_ms_info = extract_multisegment_info(period, {
2702 'start_number': 1,
2703 'timescale': 1,
2704 })
2705 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2706 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2707 for representation in adaptation_set.findall(_add_ns('Representation')):
2708 representation_attrib = adaptation_set.attrib.copy()
2709 representation_attrib.update(representation.attrib)
2710 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2711 mime_type = representation_attrib['mimeType']
2712 content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2713
2714 codecs = representation_attrib.get('codecs', '')
2715 if content_type not in ('video', 'audio', 'text'):
2716 if mime_type == 'image/jpeg':
2717 content_type = mime_type
2718 elif codecs.split('.')[0] == 'stpp':
2719 content_type = 'text'
2720 elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2721 content_type = 'text'
2722 else:
2723 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2724 continue
2725
2726 base_url = ''
2727 for element in (representation, adaptation_set, period, mpd_doc):
2728 base_url_e = element.find(_add_ns('BaseURL'))
2729 if base_url_e is not None:
2730 base_url = base_url_e.text + base_url
2731 if re.match(r'^https?://', base_url):
2732 break
2733 if mpd_base_url and base_url.startswith('/'):
2734 base_url = compat_urlparse.urljoin(mpd_base_url, base_url)
2735 elif mpd_base_url and not re.match(r'^https?://', base_url):
2736 if not mpd_base_url.endswith('/'):
2737 mpd_base_url += '/'
2738 base_url = mpd_base_url + base_url
2739 representation_id = representation_attrib.get('id')
2740 lang = representation_attrib.get('lang')
2741 url_el = representation.find(_add_ns('BaseURL'))
2742 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2743 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2744 if representation_id is not None:
2745 format_id = representation_id
2746 else:
2747 format_id = content_type
2748 if mpd_id:
2749 format_id = mpd_id + '-' + format_id
2750 if content_type in ('video', 'audio'):
2751 f = {
2752 'format_id': format_id,
2753 'manifest_url': mpd_url,
2754 'ext': mimetype2ext(mime_type),
2755 'width': int_or_none(representation_attrib.get('width')),
2756 'height': int_or_none(representation_attrib.get('height')),
2757 'tbr': float_or_none(bandwidth, 1000),
2758 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2759 'fps': int_or_none(representation_attrib.get('frameRate')),
2760 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2761 'format_note': 'DASH %s' % content_type,
2762 'filesize': filesize,
2763 'container': mimetype2ext(mime_type) + '_dash',
2764 }
2765 f.update(parse_codecs(codecs))
2766 elif content_type == 'text':
2767 f = {
2768 'ext': mimetype2ext(mime_type),
2769 'manifest_url': mpd_url,
2770 'filesize': filesize,
2771 }
2772 elif content_type == 'image/jpeg':
2773 # See test case in VikiIE
2774 # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2775 f = {
2776 'format_id': format_id,
2777 'ext': 'mhtml',
2778 'manifest_url': mpd_url,
2779 'format_note': 'DASH storyboards (jpeg)',
2780 'acodec': 'none',
2781 'vcodec': 'none',
2782 }
2783 if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2784 f['has_drm'] = True
2785 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2786
2787 def prepare_template(template_name, identifiers):
2788 tmpl = representation_ms_info[template_name]
2789 # First of all, % characters outside $...$ templates
2790 # must be escaped by doubling for proper processing
2791 # by the % string-formatting operator used further below (see
2792 # https://github.com/ytdl-org/youtube-dl/issues/16867).
2793 t = ''
2794 in_template = False
2795 for c in tmpl:
2796 t += c
2797 if c == '$':
2798 in_template = not in_template
2799 elif c == '%' and not in_template:
2800 t += c
2801 # Next, $...$ templates are translated to their
2802 # %(...) counterparts to be used with % operator
2803 if representation_id is not None:
2804 t = t.replace('$RepresentationID$', representation_id)
2805 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2806 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2807 t = t.replace('$$', '$')  # unescape "$$"; str.replace returns a new string
2808 return t
2809
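# A sketch of the translation performed above (hypothetical template):
#   '$RepresentationID$/seg-$Number%05d$.m4s' with representation_id
# 'video_1' becomes 'video_1/seg-%(Number)05d.m4s', ready for
# % formatting with e.g. {'Number': 5} -> 'video_1/seg-00005.m4s'.
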
2810 # @initialization is a regular template like @media one
2811 # so it should be handled just the same way (see
2812 # https://github.com/ytdl-org/youtube-dl/issues/11605)
2813 if 'initialization' in representation_ms_info:
2814 initialization_template = prepare_template(
2815 'initialization',
2816 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2817 # $Time$ shall not be included for @initialization thus
2818 # only $Bandwidth$ remains
2819 ('Bandwidth', ))
2820 representation_ms_info['initialization_url'] = initialization_template % {
2821 'Bandwidth': bandwidth,
2822 }
2823
2824 def location_key(location):
2825 return 'url' if re.match(r'^https?://', location) else 'path'
2826
2827 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2828
2829 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2830 media_location_key = location_key(media_template)
2831
2832 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2833 # can't be used at the same time
2834 if '%(Number' in media_template and 's' not in representation_ms_info:
2835 segment_duration = None
2836 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2837 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2838 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2839 representation_ms_info['fragments'] = [{
2840 media_location_key: media_template % {
2841 'Number': segment_number,
2842 'Bandwidth': bandwidth,
2843 },
2844 'duration': segment_duration,
2845 } for segment_number in range(
2846 representation_ms_info['start_number'],
2847 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2848 else:
2849 # $Number*$ or $Time$ in media template with S list available
2850 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2851 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2852 representation_ms_info['fragments'] = []
2853 segment_time = 0
2854 segment_d = None
2855 segment_number = representation_ms_info['start_number']
2856
2857 def add_segment_url():
2858 segment_url = media_template % {
2859 'Time': segment_time,
2860 'Bandwidth': bandwidth,
2861 'Number': segment_number,
2862 }
2863 representation_ms_info['fragments'].append({
2864 media_location_key: segment_url,
2865 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2866 })
2867
2868 for num, s in enumerate(representation_ms_info['s']):
2869 segment_time = s.get('t') or segment_time
2870 segment_d = s['d']
2871 add_segment_url()
2872 segment_number += 1
2873 for r in range(s.get('r', 0)):
2874 segment_time += segment_d
2875 add_segment_url()
2876 segment_number += 1
2877 segment_time += segment_d
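# For example (hypothetical S list, not from any real manifest): with
# timescale 90000, an S element {'t': 0, 'd': 360000, 'r': 2} expands
# to three 4-second fragments whose $Time$ values are 0, 360000 and
# 720000: one for the element itself plus @r additional repeats.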
2878 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2879 # No media template
2880 # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2881 # or any YouTube dashsegments video
2882 fragments = []
2883 segment_index = 0
2884 timescale = representation_ms_info['timescale']
2885 for s in representation_ms_info['s']:
2886 duration = float_or_none(s['d'], timescale)
2887 for r in range(s.get('r', 0) + 1):
2888 segment_uri = representation_ms_info['segment_urls'][segment_index]
2889 fragments.append({
2890 location_key(segment_uri): segment_uri,
2891 'duration': duration,
2892 })
2893 segment_index += 1
2894 representation_ms_info['fragments'] = fragments
2895 elif 'segment_urls' in representation_ms_info:
2896 # Segment URLs with no SegmentTimeline
2897 # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2898 # https://github.com/ytdl-org/youtube-dl/pull/14844
2899 fragments = []
2900 segment_duration = float_or_none(
2901 representation_ms_info['segment_duration'],
2902 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2903 for segment_url in representation_ms_info['segment_urls']:
2904 fragment = {
2905 location_key(segment_url): segment_url,
2906 }
2907 if segment_duration:
2908 fragment['duration'] = segment_duration
2909 fragments.append(fragment)
2910 representation_ms_info['fragments'] = fragments
2911 # If a fragments key is available then we correctly recognized fragmented media.
2912 # Otherwise we will assume unfragmented media with direct access. Technically, this
2913 # assumption is not necessarily correct since we may simply have no support for
2914 # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2915 if 'fragments' in representation_ms_info:
2916 f.update({
2917 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2918 'url': mpd_url or base_url,
2919 'fragment_base_url': base_url,
2920 'fragments': [],
2921 'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2922 })
2923 if 'initialization_url' in representation_ms_info:
2924 initialization_url = representation_ms_info['initialization_url']
2925 if not f.get('url'):
2926 f['url'] = initialization_url
2927 f['fragments'].append({location_key(initialization_url): initialization_url})
2928 f['fragments'].extend(representation_ms_info['fragments'])
2929 else:
2930 # Assuming direct URL to unfragmented media.
2931 f['url'] = base_url
2932 if content_type in ('video', 'audio', 'image/jpeg'):
2933 f['manifest_stream_number'] = stream_numbers[f['url']]
2934 stream_numbers[f['url']] += 1
2935 formats.append(f)
2936 elif content_type == 'text':
2937 subtitles.setdefault(lang or 'und', []).append(f)
2938
2939 return formats, subtitles
2940
2941 def _extract_ism_formats(self, *args, **kwargs):
2942 fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2943 if subs:
2944 self._report_ignoring_subs('ISM')
2945 return fmts
2946
2947 def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2948 res = self._download_xml_handle(
2949 ism_url, video_id,
2950 note='Downloading ISM manifest' if note is None else note,
2951 errnote='Failed to download ISM manifest' if errnote is None else errnote,
2952 fatal=fatal, data=data, headers=headers, query=query)
2953 if res is False:
2954 return [], {}
2955 ism_doc, urlh = res
2956 if ism_doc is None:
2957 return [], {}
2958
2959 return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
2960
2961 def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
2962 """
2963 Parse formats from ISM manifest.
2964 References:
2965 1. [MS-SSTR]: Smooth Streaming Protocol,
2966 https://msdn.microsoft.com/en-us/library/ff469518.aspx
2967 """
2968 if ism_doc.get('IsLive') == 'TRUE':
2969 return [], {}
2970
2971 duration = int(ism_doc.attrib['Duration'])
2972 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2973
2974 formats = []
2975 subtitles = {}
2976 for stream in ism_doc.findall('StreamIndex'):
2977 stream_type = stream.get('Type')
2978 if stream_type not in ('video', 'audio', 'text'):
2979 continue
2980 url_pattern = stream.attrib['Url']
2981 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2982 stream_name = stream.get('Name')
2983 stream_language = stream.get('Language', 'und')
2984 for track in stream.findall('QualityLevel'):
2985 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
2986 # TODO: add support for WVC1 and WMAP
2987 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
2988 self.report_warning('%s is not a supported codec' % fourcc)
2989 continue
2990 tbr = int(track.attrib['Bitrate']) // 1000
2991 # [1] does not mention Width and Height attributes. However,
2992 # they're often present while MaxWidth and MaxHeight are
2993 # missing, so they should be used as fallbacks
2994 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2995 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2996 sampling_rate = int_or_none(track.get('SamplingRate'))
2997
2998 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2999 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
3000
3001 fragments = []
3002 fragment_ctx = {
3003 'time': 0,
3004 }
3005 stream_fragments = stream.findall('c')
3006 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3007 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3008 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3009 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3010 if not fragment_ctx['duration']:
3011 try:
3012 next_fragment_time = int(stream_fragments[stream_fragment_index + 1].attrib['t'])  # index the <c> list, not the current element's children
3013 except IndexError:
3014 next_fragment_time = duration
3015 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3016 for _ in range(fragment_repeat):
3017 fragments.append({
3018 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
3019 'duration': fragment_ctx['duration'] / stream_timescale,
3020 })
3021 fragment_ctx['time'] += fragment_ctx['duration']
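# For example (hypothetical manifest data): a <c t="0" d="20000000" r="2"/>
# element with a stream timescale of 10000000 yields exactly two 2-second
# fragments, at times 0 and 20000000. Note that, unlike DASH SegmentTimeline
# where @r counts *additional* repeats, @r is treated here as the total
# number of contiguous fragments.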
3022
3023 if stream_type == 'text':
3024 subtitles.setdefault(stream_language, []).append({
3025 'ext': 'ismt',
3026 'protocol': 'ism',
3027 'url': ism_url,
3028 'manifest_url': ism_url,
3029 'fragments': fragments,
3030 '_download_params': {
3031 'stream_type': stream_type,
3032 'duration': duration,
3033 'timescale': stream_timescale,
3034 'fourcc': fourcc,
3035 'language': stream_language,
3036 'codec_private_data': track.get('CodecPrivateData'),
3037 }
3038 })
3039 elif stream_type in ('video', 'audio'):
3040 formats.append({
3041 'format_id': join_nonempty(ism_id, stream_name, tbr),
3042 'url': ism_url,
3043 'manifest_url': ism_url,
3044 'ext': 'ismv' if stream_type == 'video' else 'isma',
3045 'width': width,
3046 'height': height,
3047 'tbr': tbr,
3048 'asr': sampling_rate,
3049 'vcodec': 'none' if stream_type == 'audio' else fourcc,
3050 'acodec': 'none' if stream_type == 'video' else fourcc,
3051 'protocol': 'ism',
3052 'fragments': fragments,
3053 'has_drm': ism_doc.find('Protection') is not None,
3054 '_download_params': {
3055 'stream_type': stream_type,
3056 'duration': duration,
3057 'timescale': stream_timescale,
3058 'width': width or 0,
3059 'height': height or 0,
3060 'fourcc': fourcc,
3061 'language': stream_language,
3062 'codec_private_data': track.get('CodecPrivateData'),
3063 'sampling_rate': sampling_rate,
3064 'channels': int_or_none(track.get('Channels', 2)),
3065 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3066 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3067 },
3068 })
3069 return formats, subtitles
3070
3071 def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
3072 def absolute_url(item_url):
3073 return urljoin(base_url, item_url)
3074
3075 def parse_content_type(content_type):
3076 if not content_type:
3077 return {}
3078 ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3079 if ctr:
3080 mimetype, codecs = ctr.groups()
3081 f = parse_codecs(codecs)
3082 f['ext'] = mimetype2ext(mimetype)
3083 return f
3084 return {}
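# For illustration (hypothetical attribute value): a type attribute like
# 'video/mp4; codecs="avc1.64001f, mp4a.40.2"' yields {'ext': 'mp4'}
# merged with the vcodec/acodec fields that parse_codecs() derives from
# the codecs string; the result is folded into the format dict below.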
3085
3086 def _media_formats(src, cur_media_type, type_info={}):
3087 full_url = absolute_url(src)
3088 ext = type_info.get('ext') or determine_ext(full_url)
3089 if ext == 'm3u8':
3090 is_plain_url = False
3091 formats = self._extract_m3u8_formats(
3092 full_url, video_id, ext='mp4',
3093 entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3094 preference=preference, quality=quality, fatal=False)
3095 elif ext == 'mpd':
3096 is_plain_url = False
3097 formats = self._extract_mpd_formats(
3098 full_url, video_id, mpd_id=mpd_id, fatal=False)
3099 else:
3100 is_plain_url = True
3101 formats = [{
3102 'url': full_url,
3103 'vcodec': 'none' if cur_media_type == 'audio' else None,
3104 }]
3105 return is_plain_url, formats
3106
3107 entries = []
3108 # amp-video and amp-audio are very similar to their HTML5 counterparts
3109 # so we will include them right here (see
3110 # https://www.ampproject.org/docs/reference/components/amp-video)
3111 # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3112 _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3113 media_tags = [(media_tag, media_tag_name, media_type, '')
3114 for media_tag, media_tag_name, media_type
3115 in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3116 media_tags.extend(re.findall(
3117 # We only allow video|audio followed by a whitespace or '>'.
3118 # Allowing more characters may result in a significant slowdown (see
3119 # https://github.com/ytdl-org/youtube-dl/issues/11979; example URL:
3120 # http://www.porntrex.com/maps/videositemap.xml).
3121 r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3122 for media_tag, _, media_type, media_content in media_tags:
3123 media_info = {
3124 'formats': [],
3125 'subtitles': {},
3126 }
3127 media_attributes = extract_attributes(media_tag)
3128 src = strip_or_none(media_attributes.get('src'))
3129 if src:
3130 _, formats = _media_formats(src, media_type)
3131 media_info['formats'].extend(formats)
3132 media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3133 if media_content:
3134 for source_tag in re.findall(r'<source[^>]+>', media_content):
3135 s_attr = extract_attributes(source_tag)
3136 # data-video-src and data-src are non-standard but seen
3137 # several times in the wild
3138 src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
3139 if not src:
3140 continue
3141 f = parse_content_type(s_attr.get('type'))
3142 is_plain_url, formats = _media_formats(src, media_type, f)
3143 if is_plain_url:
3144 # width, height, res, label and title attributes are
3145 # all non-standard but seen several times in the wild
3146 labels = [
3147 s_attr.get(lbl)
3148 for lbl in ('label', 'title')
3149 if str_or_none(s_attr.get(lbl))
3150 ]
3151 width = int_or_none(s_attr.get('width'))
3152 height = (int_or_none(s_attr.get('height'))
3153 or int_or_none(s_attr.get('res')))
3154 if not width or not height:
3155 for lbl in labels:
3156 resolution = parse_resolution(lbl)
3157 if not resolution:
3158 continue
3159 width = width or resolution.get('width')
3160 height = height or resolution.get('height')
3161 for lbl in labels:
3162 tbr = parse_bitrate(lbl)
3163 if tbr:
3164 break
3165 else:
3166 tbr = None
3167 f.update({
3168 'width': width,
3169 'height': height,
3170 'tbr': tbr,
3171 'format_id': s_attr.get('label') or s_attr.get('title'),
3172 })
3173 f.update(formats[0])
3174 media_info['formats'].append(f)
3175 else:
3176 media_info['formats'].extend(formats)
3177 for track_tag in re.findall(r'<track[^>]+>', media_content):
3178 track_attributes = extract_attributes(track_tag)
3179 kind = track_attributes.get('kind')
3180 if not kind or kind in ('subtitles', 'captions'):
3181 src = strip_or_none(track_attributes.get('src'))
3182 if not src:
3183 continue
3184 lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3185 media_info['subtitles'].setdefault(lang, []).append({
3186 'url': absolute_url(src),
3187 })
3188 for f in media_info['formats']:
3189 f.setdefault('http_headers', {})['Referer'] = base_url
3190 if media_info['formats'] or media_info['subtitles']:
3191 entries.append(media_info)
3192 return entries
3193
3194 def _extract_akamai_formats(self, *args, **kwargs):
3195 fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3196 if subs:
3197 self._report_ignoring_subs('akamai')
3198 return fmts
3199
3200 def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3201 signed = 'hdnea=' in manifest_url
3202 if not signed:
3203 # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3204 manifest_url = re.sub(
3205 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3206 '', manifest_url).strip('?')
3207
3208 formats = []
3209 subtitles = {}
3210
3211 hdcore_sign = 'hdcore=3.7.0'
3212 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3213 hds_host = hosts.get('hds')
3214 if hds_host:
3215 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3216 if 'hdcore=' not in f4m_url:
3217 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3218 f4m_formats = self._extract_f4m_formats(
3219 f4m_url, video_id, f4m_id='hds', fatal=False)
3220 for entry in f4m_formats:
3221 entry.update({'extra_param_to_segment_url': hdcore_sign})
3222 formats.extend(f4m_formats)
3223
3224 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3225 hls_host = hosts.get('hls')
3226 if hls_host:
3227 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3228 m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3229 m3u8_url, video_id, 'mp4', 'm3u8_native',
3230 m3u8_id='hls', fatal=False)
3231 formats.extend(m3u8_formats)
3232 subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3233
3234 http_host = hosts.get('http')
3235 if http_host and m3u8_formats and not signed:
3236 REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3237 qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3238 qualities_length = len(qualities)
3239 if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3240 i = 0
3241 for f in m3u8_formats:
3242 if f['vcodec'] != 'none':
3243 for protocol in ('http', 'https'):
3244 http_f = f.copy()
3245 del http_f['manifest_url']
3246 http_url = re.sub(
3247 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
3248 http_f.update({
3249 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3250 'url': http_url,
3251 'protocol': protocol,
3252 })
3253 formats.append(http_f)
3254 i += 1
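# Illustrative rewrite (hypothetical host and URL): for
# m3u8_url = 'https://example-i.akamaihd.net/i/media/video_,360,720,.mp4.csmil/master.m3u8'
# the qualities list is ['360', '720'], and each HLS video format is
# cloned into e.g. 'http://<http_host>/media/video_360.mp4', i.e. a
# direct progressive URL with the HLS wrapper stripped.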
3255
3256 return formats, subtitles
3257
3258 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3259 query = compat_urlparse.urlparse(url).query
3260 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3261 mobj = re.search(
3262 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3263 url_base = mobj.group('url')
3264 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3265 formats = []
3266
3267 def manifest_url(manifest):
3268 m_url = '%s/%s' % (http_base_url, manifest)
3269 if query:
3270 m_url += '?%s' % query
3271 return m_url
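# For example (hypothetical URL): given
# 'https://example.com/vod/mp4:sample.mp4/playlist.m3u8?token=abc',
# url_base is '//example.com/vod/mp4:sample.mp4' and
# manifest_url('manifest.mpd') yields
# 'https://example.com/vod/mp4:sample.mp4/manifest.mpd?token=abc'.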
3272
3273 if 'm3u8' not in skip_protocols:
3274 formats.extend(self._extract_m3u8_formats(
3275 manifest_url('playlist.m3u8'), video_id, 'mp4',
3276 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3277 if 'f4m' not in skip_protocols:
3278 formats.extend(self._extract_f4m_formats(
3279 manifest_url('manifest.f4m'),
3280 video_id, f4m_id='hds', fatal=False))
3281 if 'dash' not in skip_protocols:
3282 formats.extend(self._extract_mpd_formats(
3283 manifest_url('manifest.mpd'),
3284 video_id, mpd_id='dash', fatal=False))
3285 if re.search(r'(?:/smil:|\.smil)', url_base):
3286 if 'smil' not in skip_protocols:
3287 rtmp_formats = self._extract_smil_formats(
3288 manifest_url('jwplayer.smil'),
3289 video_id, fatal=False)
3290 for rtmp_format in rtmp_formats:
3291 rtsp_format = rtmp_format.copy()
3292 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3293 del rtsp_format['play_path']
3294 del rtsp_format['ext']
3295 rtsp_format.update({
3296 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3297 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3298 'protocol': 'rtsp',
3299 })
3300 formats.extend([rtmp_format, rtsp_format])
3301 else:
3302 for protocol in ('rtmp', 'rtsp'):
3303 if protocol not in skip_protocols:
3304 formats.append({
3305 'url': '%s:%s' % (protocol, url_base),
3306 'format_id': protocol,
3307 'protocol': protocol,
3308 })
3309 return formats
3310
3311 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3312 mobj = re.search(
3313 r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3314 webpage)
3315 if mobj:
3316 try:
3317 jwplayer_data = self._parse_json(mobj.group('options'),
3318 video_id=video_id,
3319 transform_source=transform_source)
3320 except ExtractorError:
3321 pass
3322 else:
3323 if isinstance(jwplayer_data, dict):
3324 return jwplayer_data
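# The regex above targets embeds of the form (hypothetical page snippet):
#   jwplayer("myPlayer").setup({"playlist": [{"sources": [...]}]});
# js_to_json() then relaxes the JavaScript object literal enough for
# _parse_json to load it. Note the (?P<options>[^)]+) group stops at the
# first ')', so setups containing nested function calls are not fully
# captured.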
3325
3326 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3327 jwplayer_data = self._find_jwplayer_data(
3328 webpage, video_id, transform_source=js_to_json)
3329 return self._parse_jwplayer_data(
3330 jwplayer_data, video_id, *args, **kwargs)
3331
3332 def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3333 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3334 # JWPlayer backward compatibility: flattened playlists
3335 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3336 if 'playlist' not in jwplayer_data:
3337 jwplayer_data = {'playlist': [jwplayer_data]}
3338
3339 entries = []
3340
3341 # JWPlayer backward compatibility: single playlist item
3342 # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3343 if not isinstance(jwplayer_data['playlist'], list):
3344 jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3345
3346 for video_data in jwplayer_data['playlist']:
3347 # JWPlayer backward compatibility: flattened sources
3348 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3349 if 'sources' not in video_data:
3350 video_data['sources'] = [video_data]
3351
3352 this_video_id = video_id or video_data['mediaid']
3353
3354 formats = self._parse_jwplayer_formats(
3355 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3356 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3357
3358 subtitles = {}
3359 tracks = video_data.get('tracks')
3360 if tracks and isinstance(tracks, list):
3361 for track in tracks:
3362 if not isinstance(track, dict):
3363 continue
3364 track_kind = track.get('kind')
3365 if not track_kind or not isinstance(track_kind, compat_str):
3366 continue
3367 if track_kind.lower() not in ('captions', 'subtitles'):
3368 continue
3369 track_url = urljoin(base_url, track.get('file'))
3370 if not track_url:
3371 continue
3372 subtitles.setdefault(track.get('label') or 'en', []).append({
3373 'url': self._proto_relative_url(track_url)
3374 })
3375
3376 entry = {
3377 'id': this_video_id,
3378 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3379 'description': clean_html(video_data.get('description')),
3380 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3381 'timestamp': int_or_none(video_data.get('pubdate')),
3382 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3383 'subtitles': subtitles,
3384 }
3385 # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3386 if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3387 entry.update({
3388 '_type': 'url_transparent',
3389 'url': formats[0]['url'],
3390 })
3391 else:
3392 self._sort_formats(formats)
3393 entry['formats'] = formats
3394 entries.append(entry)
3395 if len(entries) == 1:
3396 return entries[0]
3397 else:
3398 return self.playlist_result(entries)
3399
3400 def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3401 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3402 urls = []
3403 formats = []
3404 for source in jwplayer_sources_data:
3405 if not isinstance(source, dict):
3406 continue
3407 source_url = urljoin(
3408 base_url, self._proto_relative_url(source.get('file')))
3409 if not source_url or source_url in urls:
3410 continue
3411 urls.append(source_url)
3412 source_type = source.get('type') or ''
3413 ext = mimetype2ext(source_type) or determine_ext(source_url)
3414 if source_type == 'hls' or ext == 'm3u8':
3415 formats.extend(self._extract_m3u8_formats(
3416 source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3417 m3u8_id=m3u8_id, fatal=False))
3418 elif source_type == 'dash' or ext == 'mpd':
3419 formats.extend(self._extract_mpd_formats(
3420 source_url, video_id, mpd_id=mpd_id, fatal=False))
3421 elif ext == 'smil':
3422 formats.extend(self._extract_smil_formats(
3423 source_url, video_id, fatal=False))
3424 # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3425 elif source_type.startswith('audio') or ext in (
3426 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3427 formats.append({
3428 'url': source_url,
3429 'vcodec': 'none',
3430 'ext': ext,
3431 })
3432 else:
3433 height = int_or_none(source.get('height'))
3434 if height is None:
3435 # Often no height is provided but there is a label in
3436 # a format like "1080p", "720p SD", or 1080.
3437 height = int_or_none(self._search_regex(
3438 r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3439 'height', default=None))
3440 a_format = {
3441 'url': source_url,
3442 'width': int_or_none(source.get('width')),
3443 'height': height,
3444 'tbr': int_or_none(source.get('bitrate')),
3445 'ext': ext,
3446 }
3447 if source_url.startswith('rtmp'):
3448 a_format['ext'] = 'flv'
3449 # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3450 # of jwplayer.flash.swf
3451 rtmp_url_parts = re.split(
3452 r'((?:mp4|mp3|flv):)', source_url, 1)
3453 if len(rtmp_url_parts) == 3:
3454 rtmp_url, prefix, play_path = rtmp_url_parts
3455 a_format.update({
3456 'url': rtmp_url,
3457 'play_path': prefix + play_path,
3458 })
3459 if rtmp_params:
3460 a_format.update(rtmp_params)
3461 formats.append(a_format)
3462 return formats
3463
3464 def _live_title(self, name):
3465 self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3466 return name
3467
3468 def _int(self, v, name, fatal=False, **kwargs):
3469 res = int_or_none(v, **kwargs)
3472 if res is None:
3473 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3474 if fatal:
3475 raise ExtractorError(msg)
3476 else:
3477 self.report_warning(msg)
3478 return res
3479
3480 def _float(self, v, name, fatal=False, **kwargs):
3481 res = float_or_none(v, **kwargs)
3482 if res is None:
3483 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3484 if fatal:
3485 raise ExtractorError(msg)
3486 else:
3487 self.report_warning(msg)
3488 return res
3489
3490 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3491 path='/', secure=False, discard=False, rest={}, **kwargs):
3492 cookie = compat_cookiejar_Cookie(
3493 0, name, value, port, port is not None, domain, True,
3494 domain.startswith('.'), path, True, secure, expire_time,
3495 discard, None, None, rest)
3496 self._downloader.cookiejar.set_cookie(cookie)
3497
3498 def _get_cookies(self, url):
3499 """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3500 req = sanitized_Request(url)
3501 self._downloader.cookiejar.add_cookie_header(req)
3502 return compat_cookies_SimpleCookie(req.get_header('Cookie'))
3503
3504 def _apply_first_set_cookie_header(self, url_handle, cookie):
3505 """
3506 Apply first Set-Cookie header instead of the last. Experimental.
3507
3508 Some sites (e.g. [1-3]) may serve two cookies under the same name
3509 in the Set-Cookie header and expect the first (old) one to be set
3510 rather than the second (new) one. However, per RFC 6265 the newer
3511 cookie is the one that actually ends up in the cookie store.
3512 We work around this issue by manually resetting the cookie to
3513 the first one.
3514 1. https://new.vk.com/
3515 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3516 3. https://learning.oreilly.com/
3517 """
3518 for header, cookies in url_handle.headers.items():
3519 if header.lower() != 'set-cookie':
3520 continue
3521 if sys.version_info[0] >= 3:
3522 cookies = cookies.encode('iso-8859-1')
3523 cookies = cookies.decode('utf-8')
3524 cookie_value = re.search(
3525 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3526 if cookie_value:
3527 value, domain = cookie_value.groups()
3528 self._set_cookie(domain, cookie, value)
3529 break
3530
3531 def get_testcases(self, include_onlymatching=False):
3532 t = getattr(self, '_TEST', None)
3533 if t:
3534 assert not hasattr(self, '_TESTS'), \
3535 '%s has _TEST and _TESTS' % type(self).__name__
3536 tests = [t]
3537 else:
3538 tests = getattr(self, '_TESTS', [])
3539 for t in tests:
3540 if not include_onlymatching and t.get('only_matching', False):
3541 continue
3542 t['name'] = type(self).__name__[:-len('IE')]
3543 yield t
3544
3545 def is_suitable(self, age_limit):
3546 """ Test whether the extractor is generally suitable for the given
3547 age limit (i.e. pornographic sites are not, all others usually are) """
3548
3549 any_restricted = False
3550 for tc in self.get_testcases(include_onlymatching=False):
3551 if tc.get('playlist', []):
3552 tc = tc['playlist'][0]
3553 is_restricted = age_restricted(
3554 tc.get('info_dict', {}).get('age_limit'), age_limit)
3555 if not is_restricted:
3556 return True
3557 any_restricted = any_restricted or is_restricted
3558 return not any_restricted
3559
3560 def extract_subtitles(self, *args, **kwargs):
3561 if (self.get_param('writesubtitles', False)
3562 or self.get_param('listsubtitles')):
3563 return self._get_subtitles(*args, **kwargs)
3564 return {}
3565
3566 def _get_subtitles(self, *args, **kwargs):
3567 raise NotImplementedError('This method must be implemented by subclasses')
3568
3569 def extract_comments(self, *args, **kwargs):
3570 if not self.get_param('getcomments'):
3571 return None
3572 generator = self._get_comments(*args, **kwargs)
3573
3574 def extractor():
3575 comments = []
3576 interrupted = True
3577 try:
3578 while True:
3579 comments.append(next(generator))
3580 except StopIteration:
3581 interrupted = False
3582 except KeyboardInterrupt:
3583 self.to_screen('Interrupted by user')
3584 except Exception as e:
3585 if self.get_param('ignoreerrors') is not True:
3586 raise
3587 self._downloader.report_error(e)
3588 comment_count = len(comments)
3589 self.to_screen(f'Extracted {comment_count} comments')
3590 return {
3591 'comments': comments,
3592 'comment_count': None if interrupted else comment_count
3593 }
3594 return extractor
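# Minimal usage sketch (hypothetical extractor code): the returned
# closure defers the potentially slow comment download until it is
# invoked, typically via the info dict:
#
#   info_dict['__post_extractor'] = self.extract_comments(video_id)
#
# Because _get_comments is a generator, a KeyboardInterrupt keeps the
# comments fetched so far; comment_count is then left as None to signal
# the interruption.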
3595
3596 def _get_comments(self, *args, **kwargs):
3597 raise NotImplementedError('This method must be implemented by subclasses')
3598
3599 @staticmethod
3600 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3601 """ Merge subtitle items for one language. Items with duplicated URLs
3602 will be dropped. """
3603 list1_urls = {item['url'] for item in subtitle_list1}
3604 ret = list(subtitle_list1)
3605 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
3606 return ret
3607
3608 @classmethod
3609 def _merge_subtitles(cls, *dicts, target=None):
3610 """ Merge subtitle dictionaries, language by language. """
3611 if target is None:
3612 target = {}
3613 for d in dicts:
3614 for lang, subs in d.items():
3615 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3616 return target
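# For example (hypothetical subtitle dicts):
#
#   >>> InfoExtractor._merge_subtitles(
#   ...     {'en': [{'url': 'http://a'}]},
#   ...     {'en': [{'url': 'http://a'}, {'url': 'http://b'}]},
#   ...     {'de': [{'url': 'http://c'}]})
#   {'en': [{'url': 'http://a'}, {'url': 'http://b'}], 'de': [{'url': 'http://c'}]}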
3617
3618 def extract_automatic_captions(self, *args, **kwargs):
3619 if (self.get_param('writeautomaticsub', False)
3620 or self.get_param('listsubtitles')):
3621 return self._get_automatic_captions(*args, **kwargs)
3622 return {}
3623
3624 def _get_automatic_captions(self, *args, **kwargs):
3625 raise NotImplementedError('This method must be implemented by subclasses')
3626
3627 def mark_watched(self, *args, **kwargs):
3628 if not self.get_param('mark_watched', False):
3629 return
3630 if (self._get_login_info()[0] is not None
3631 or self.get_param('cookiefile')
3632 or self.get_param('cookiesfrombrowser')):
3633 self._mark_watched(*args, **kwargs)
3634
3635 def _mark_watched(self, *args, **kwargs):
3636 raise NotImplementedError('This method must be implemented by subclasses')
3637
3638 def geo_verification_headers(self):
3639 headers = {}
3640 geo_verification_proxy = self.get_param('geo_verification_proxy')
3641 if geo_verification_proxy:
3642 headers['Ytdl-request-proxy'] = geo_verification_proxy
3643 return headers
3644
3645 def _generic_id(self, url):
3646 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3647
3648 def _generic_title(self, url):
3649 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3650
3651 @staticmethod
3652 def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3653 all_known = all(
3654 x is not None for x in
3655 (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted))
3656 return (
3657 'private' if is_private
3658 else 'premium_only' if needs_premium
3659 else 'subscriber_only' if needs_subscription
3660 else 'needs_auth' if needs_auth
3661 else 'unlisted' if is_unlisted
3662 else 'public' if all_known
3663 else None)
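# Usage sketch (hypothetical flags): the first truthy flag wins:
#
#   >>> InfoExtractor._availability(is_private=False, needs_premium=True,
#   ...                             needs_subscription=False, needs_auth=False,
#   ...                             is_unlisted=False)
#   'premium_only'
#
# 'public' is only reported when every flag is known to be False; if any
# flag is None (unknown) and none is truthy, None is returned instead.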
3664
3665 def _configuration_arg(self, key, default=NO_DEFAULT, casesense=False):
3666 '''
3667 @returns A list of values for the extractor argument given by "key"
3668 or "default" if no such key is present
3669 @param default The default value to return when the key is not present (default: [])
3670 @param casesense When false, the values are converted to lower case
3671 '''
3672 val = traverse_obj(
3673 self._downloader.params, ('extractor_args', self.ie_key().lower(), key))
3674 if val is None:
3675 return [] if default is NO_DEFAULT else default
3676 return list(val) if casesense else [x.lower() for x in val]
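# For example (hypothetical command line): with
#   --extractor-args "youtube:player_client=android,web"
# a YoutubeIE method calling self._configuration_arg('player_client')
# receives ['android', 'web'], while an absent key yields [].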
3677
3678
3679 class SearchInfoExtractor(InfoExtractor):
3680 """
3681 Base class for paged search query extractors.
3682 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
3683 Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
3684 """
3685
3686 _MAX_RESULTS = float('inf')
3687
3688 @classmethod
3689 def _make_valid_url(cls):
3690 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3691
3692 def _real_extract(self, query):
3693 prefix, query = self._match_valid_url(query).group('prefix', 'query')
3694 if prefix == '':
3695 return self._get_n_results(query, 1)
3696 elif prefix == 'all':
3697 return self._get_n_results(query, self._MAX_RESULTS)
3698 else:
3699 n = int(prefix)
3700 if n <= 0:
3701 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
3702 elif n > self._MAX_RESULTS:
3703 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3704 n = self._MAX_RESULTS
3705 return self._get_n_results(query, n)
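# For example (hypothetical subclass): with _SEARCH_KEY = 'ytsearch',
# 'ytsearch:cats' yields a single result, 'ytsearch5:cats' up to five,
# and 'ytsearchall:cats' is capped at _MAX_RESULTS.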
3706
3707 def _get_n_results(self, query, n):
3708 """Get a specified number of results for a query.
3709 Either this function or _search_results must be overridden by subclasses """
3710 return self.playlist_result(
3711 itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
3712 query, query)
3713
3714 def _search_results(self, query):
3715 """Returns an iterator of search results"""
3716 raise NotImplementedError('This method must be implemented by subclasses')
3717
3718 @property
3719 def SEARCH_KEY(self):
3720 return self._SEARCH_KEY