yt_dlp/extractor/common.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import base64
   5 import collections
   6 import hashlib
   7 import itertools
   8 import json
   9 import netrc
  10 import os
  11 import random
  12 import re
  13 import sys
  14 import time
  15 import math
  16
  17 from ..compat import (
  18     compat_cookiejar_Cookie,
  19     compat_cookies_SimpleCookie,
  20     compat_etree_Element,
  21     compat_etree_fromstring,
  22     compat_expanduser,
  23     compat_getpass,
  24     compat_http_client,
  25     compat_os_name,
  26     compat_str,
  27     compat_urllib_error,
  28     compat_urllib_parse_unquote,
  29     compat_urllib_parse_urlencode,
  30     compat_urllib_request,
  31     compat_urlparse,
  32     compat_xml_parse_error,
  33 )
  34 from ..downloader import FileDownloader
  35 from ..downloader.f4m import (
  36     get_base_url,
  37     remove_encrypted_media,
  38 )
  39 from ..utils import (
  40     age_restricted,
  41     base_url,
  42     bug_reports_message,
  43     clean_html,
  44     compiled_regex_type,
  45     determine_ext,
  46     determine_protocol,
  47     dict_get,
  48     encode_data_uri,
  49     error_to_compat_str,
  50     extract_attributes,
  51     ExtractorError,
  52     fix_xml_ampersands,
  53     float_or_none,
  54     format_field,
  55     GeoRestrictedError,
  56     GeoUtils,
  57     int_or_none,
  58     join_nonempty,
  59     js_to_json,
  60     JSON_LD_RE,
  61     mimetype2ext,
  62     network_exceptions,
  63     NO_DEFAULT,
  64     orderedSet,
  65     parse_bitrate,
  66     parse_codecs,
  67     parse_duration,
  68     parse_iso8601,
  69     parse_m3u8_attributes,
  70     parse_resolution,
  71     RegexNotFoundError,
  72     sanitize_filename,
  73     sanitized_Request,
  74     str_or_none,
  75     str_to_int,
  76     strip_or_none,
  77     traverse_obj,
  78     try_get,
  79     unescapeHTML,
  80     UnsupportedError,
  81     unified_strdate,
  82     unified_timestamp,
  83     update_Request,
  84     update_url_query,
  85     url_basename,
  86     url_or_none,
  87     urljoin,
  88     variadic,
  89     xpath_element,
  90     xpath_text,
  91     xpath_with_ns,
  92 )
  93
  94
  95 class InfoExtractor(object):
  96     """Information Extractor class.
  97
  98     Information extractors are the classes that, given a URL, extract
  99     information about the video (or videos) the URL refers to. This
 100     information includes the real video URL, the video title, author and
 101     others. The information is stored in a dictionary which is then
 102     passed to the YoutubeDL. The YoutubeDL processes this
 103     information possibly downloading the video to the file system, among
 104     other possible outcomes.
 105
 106     The type field determines the type of the result.
 107     By far the most common value (and the default if _type is missing) is
 108     "video", which indicates a single video.
 109
 110     For a video, the dictionaries must include the following fields:
 111
 112     id:             Video identifier.
 113     title:          Video title, unescaped.
 114
 115     Additionally, it must contain either a formats entry or a url one:
 116
 117     formats:        A list of dictionaries for each format available, ordered
 118                     from worst to best quality.
 119
 120                     Potential fields:
 121                     * url        The mandatory URL representing the media:
 122                                    for plain file media - HTTP URL of this file,
 123                                    for RTMP - RTMP URL,
 124                                    for HLS - URL of the M3U8 media playlist,
 125                                    for HDS - URL of the F4M manifest,
 126                                    for DASH
 127                                      - HTTP URL to plain file media (in case of
 128                                        unfragmented media)
 129                                      - URL of the MPD manifest or base URL
 130                                        representing the media if MPD manifest
 131                                        is parsed from a string (in case of
 132                                        fragmented media)
 133                                    for MSS - URL of the ISM manifest.
 134                     * manifest_url
 135                                  The URL of the manifest file in case of
 136                                  fragmented media:
 137                                    for HLS - URL of the M3U8 master playlist,
 138                                    for HDS - URL of the F4M manifest,
 139                                    for DASH - URL of the MPD manifest,
 140                                    for MSS - URL of the ISM manifest.
 141                     * ext        Will be calculated from URL if missing
 142                     * format     A human-readable description of the format
 143                                  ("mp4 container with h264/opus").
  144                                  Calculated from the format_id, width, height,
 145                                  and format_note fields if missing.
 146                     * format_id  A short description of the format
 147                                  ("mp4_h264_opus" or "19").
 148                                 Technically optional, but strongly recommended.
 149                     * format_note Additional info about the format
 150                                  ("3D" or "DASH video")
 151                     * width      Width of the video, if known
 152                     * height     Height of the video, if known
 153                     * resolution Textual description of width and height
 154                     * dynamic_range The dynamic range of the video. One of:
  155                                  "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
 156                     * tbr        Average bitrate of audio and video in KBit/s
 157                     * abr        Average audio bitrate in KBit/s
 158                     * acodec     Name of the audio codec in use
 159                     * asr        Audio sampling rate in Hertz
 160                     * vbr        Average video bitrate in KBit/s
 161                     * fps        Frame rate
 162                     * vcodec     Name of the video codec in use
 163                     * container  Name of the container format
 164                     * filesize   The number of bytes, if known in advance
 165                     * filesize_approx  An estimate for the number of bytes
 166                     * player_url SWF Player URL (used for rtmpdump).
 167                     * protocol   The protocol that will be used for the actual
 168                                  download, lower-case. One of "http", "https" or
 169                                  one of the protocols defined in downloader.PROTOCOL_MAP
 170                     * fragment_base_url
 171                                  Base URL for fragments. Each fragment's path
 172                                  value (if present) will be relative to
 173                                  this URL.
 174                     * fragments  A list of fragments of a fragmented media.
 175                                  Each fragment entry must contain either an url
 176                                  or a path. If an url is present it should be
 177                                  considered by a client. Otherwise both path and
 178                                  fragment_base_url must be present. Here is
 179                                  the list of all potential fields:
 180                                  * "url" - fragment's URL
 181                                  * "path" - fragment's path relative to
 182                                             fragment_base_url
 183                                  * "duration" (optional, int or float)
 184                                  * "filesize" (optional, int)
 185                     * is_from_start  Is a live format that can be downloaded
 186                                 from the start. Boolean
 187                     * preference Order number of this format. If this field is
 188                                  present and not None, the formats get sorted
 189                                  by this field, regardless of all other values.
 190                                  -1 for default (order by other properties),
 191                                  -2 or smaller for less than default.
 192                                  < -1000 to hide the format (if there is
 193                                     another one which is strictly better)
 194                     * language   Language code, e.g. "de" or "en-US".
 195                     * language_preference  Is this in the language mentioned in
 196                                  the URL?
 197                                  10 if it's what the URL is about,
 198                                  -1 for default (don't know),
 199                                  -10 otherwise, other values reserved for now.
 200                     * quality    Order number of the video quality of this
 201                                  format, irrespective of the file format.
 202                                  -1 for default (order by other properties),
 203                                  -2 or smaller for less than default.
 204                     * source_preference  Order number for this video source
 205                                   (quality takes higher priority)
 206                                  -1 for default (order by other properties),
 207                                  -2 or smaller for less than default.
 208                     * http_headers  A dictionary of additional HTTP headers
 209                                  to add to the request.
 210                     * stretched_ratio  If given and not 1, indicates that the
 211                                  video's pixels are not square.
 212                                  width : height ratio as float.
 213                     * no_resume  The server does not support resuming the
 214                                  (HTTP or RTMP) download. Boolean.
 215                     * has_drm    The format has DRM and cannot be downloaded. Boolean
 216                     * downloader_options  A dictionary of downloader options as
 217                                  described in FileDownloader
 218                     RTMP formats can also have the additional fields: page_url,
 219                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 220                     rtmp_protocol, rtmp_real_time
 221
 222     url:            Final video URL.
 223     ext:            Video filename extension.
 224     format:         The video format, defaults to ext (used for --get-format)
 225     player_url:     SWF Player URL (used for rtmpdump).
 226
 227     The following fields are optional:
 228
 229     direct:         True if a direct video file was given (must only be set by GenericIE)
 230     alt_title:      A secondary title of the video.
 231     display_id      An alternative identifier for the video, not necessarily
 232                     unique, but available before title. Typically, id is
 233                     something like "4234987", title "Dancing naked mole rats",
 234                     and display_id "dancing-naked-mole-rats"
 235     thumbnails:     A list of dictionaries, with the following entries:
 236                         * "id" (optional, string) - Thumbnail format ID
 237                         * "url"
 238                         * "preference" (optional, int) - quality of the image
 239                         * "width" (optional, int)
 240                         * "height" (optional, int)
 241                         * "resolution" (optional, string "{width}x{height}",
 242                                         deprecated)
 243                         * "filesize" (optional, int)
 244                         * "http_headers" (dict) - HTTP headers for the request
 245     thumbnail:      Full URL to a video thumbnail image.
 246     description:    Full video description.
 247     uploader:       Full name of the video uploader.
 248     license:        License name the video is licensed under.
 249     creator:        The creator of the video.
 250     timestamp:      UNIX timestamp of the moment the video was uploaded
 251     upload_date:    Video upload date (YYYYMMDD).
 252                     If not explicitly set, calculated from timestamp
 253     release_timestamp: UNIX timestamp of the moment the video was released.
 254                     If it is not clear whether to use timestamp or this, use the former
 255     release_date:   The date (YYYYMMDD) when the video was released.
 256                     If not explicitly set, calculated from release_timestamp
 257     modified_timestamp: UNIX timestamp of the moment the video was last modified.
 258     modified_date:   The date (YYYYMMDD) when the video was last modified.
 259                     If not explicitly set, calculated from modified_timestamp
 260     uploader_id:    Nickname or id of the video uploader.
 261     uploader_url:   Full URL to a personal webpage of the video uploader.
 262     channel:        Full name of the channel the video is uploaded on.
 263                     Note that channel fields may or may not repeat uploader
 264                     fields. This depends on a particular extractor.
 265     channel_id:     Id of the channel.
 266     channel_url:    Full URL to a channel webpage.
 267     channel_follower_count: Number of followers of the channel.
 268     location:       Physical location where the video was filmed.
 269     subtitles:      The available subtitles as a dictionary in the format
 270                     {tag: subformats}. "tag" is usually a language code, and
 271                     "subformats" is a list sorted from lower to higher
 272                     preference, each element is a dictionary with the "ext"
 273                     entry and one of:
 274                         * "data": The subtitles file contents
 275                         * "url": A URL pointing to the subtitles file
 276                     It can optionally also have:
 277                         * "name": Name or description of the subtitles
 278                         * "http_headers": A dictionary of additional HTTP headers
 279                                   to add to the request.
 280                     "ext" will be calculated from URL if missing
 281     automatic_captions: Like 'subtitles'; contains automatically generated
 282                     captions instead of normal subtitles
 283     duration:       Length of the video in seconds, as an integer or float.
 284     view_count:     How many users have watched the video on the platform.
 285     like_count:     Number of positive ratings of the video
 286     dislike_count:  Number of negative ratings of the video
 287     repost_count:   Number of reposts of the video
  288     average_rating: Average rating given by users, the scale used depends on the webpage
 289     comment_count:  Number of comments on the video
 290     comments:       A list of comments, each with one or more of the following
 291                     properties (all but one of text or html optional):
 292                         * "author" - human-readable name of the comment author
 293                         * "author_id" - user ID of the comment author
 294                         * "author_thumbnail" - The thumbnail of the comment author
 295                         * "id" - Comment ID
 296                         * "html" - Comment as HTML
 297                         * "text" - Plain text of the comment
 298                         * "timestamp" - UNIX timestamp of comment
 299                         * "parent" - ID of the comment this one is replying to.
 300                                      Set to "root" to indicate that this is a
 301                                      comment to the original video.
 302                         * "like_count" - Number of positive ratings of the comment
 303                         * "dislike_count" - Number of negative ratings of the comment
 304                         * "is_favorited" - Whether the comment is marked as
 305                                            favorite by the video uploader
 306                         * "author_is_uploader" - Whether the comment is made by
 307                                                  the video uploader
 308     age_limit:      Age restriction for the video, as an integer (years)
 309     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 310                     should allow to get the same result again. (It will be set
 311                     by YoutubeDL if it's missing)
 312     categories:     A list of categories that the video falls in, for example
 313                     ["Sports", "Berlin"]
 314     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 315     cast:           A list of the video cast
 316     is_live:        True, False, or None (=unknown). Whether this video is a
 317                     live stream that goes on instead of a fixed-length video.
 318     was_live:       True, False, or None (=unknown). Whether this video was
 319                     originally a live stream.
 320     live_status:    'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
 321                     If absent, automatically set from is_live, was_live
 322     start_time:     Time in seconds where the reproduction should start, as
 323                     specified in the URL.
 324     end_time:       Time in seconds where the reproduction should end, as
 325                     specified in the URL.
 326     chapters:       A list of dictionaries, with the following entries:
 327                         * "start_time" - The start time of the chapter in seconds
 328                         * "end_time" - The end time of the chapter in seconds
 329                         * "title" (optional, string)
 330     playable_in_embed: Whether this video is allowed to play in embedded
 331                     players on other sites. Can be True (=always allowed),
 332                     False (=never allowed), None (=unknown), or a string
 333                     specifying the criteria for embedability (Eg: 'whitelist')
 334     availability:   Under what condition the video is available. One of
 335                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 336                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 337                     to set it
 338     __post_extractor: A function to be called just before the metadata is
 339                     written to either disk, logger or console. The function
 340                     must return a dict which will be added to the info_dict.
  341                     This is useful for additional information that is
 342                     time-consuming to extract. Note that the fields thus
 343                     extracted will not be available to output template and
 344                     match_filter. So, only "comments" and "comment_count" are
 345                     currently allowed to be extracted via this method.
 346
 347     The following fields should only be used when the video belongs to some logical
 348     chapter or section:
 349
 350     chapter:        Name or title of the chapter the video belongs to.
 351     chapter_number: Number of the chapter the video belongs to, as an integer.
 352     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 353
 354     The following fields should only be used when the video is an episode of some
 355     series, programme or podcast:
 356
 357     series:         Title of the series or programme the video episode belongs to.
 358     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 359     season:         Title of the season the video episode belongs to.
 360     season_number:  Number of the season the video episode belongs to, as an integer.
 361     season_id:      Id of the season the video episode belongs to, as a unicode string.
 362     episode:        Title of the video episode. Unlike mandatory video title field,
 363                     this field should denote the exact title of the video episode
 364                     without any kind of decoration.
 365     episode_number: Number of the video episode within a season, as an integer.
 366     episode_id:     Id of the video episode, as a unicode string.
 367
 368     The following fields should only be used when the media is a track or a part of
 369     a music album:
 370
 371     track:          Title of the track.
 372     track_number:   Number of the track within an album or a disc, as an integer.
 373     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 374                     as a unicode string.
 375     artist:         Artist(s) of the track.
 376     genre:          Genre(s) of the track.
 377     album:          Title of the album the track belongs to.
 378     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 379     album_artist:   List of all artists appeared on the album (e.g.
 380                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 381                     and compilations).
 382     disc_number:    Number of the disc or other physical medium the track belongs to,
 383                     as an integer.
 384     release_year:   Year (YYYY) when the album was released.
 385     composer:       Composer of the piece
 386
 387     Unless mentioned otherwise, the fields should be Unicode strings.
 388
 389     Unless mentioned otherwise, None is equivalent to absence of information.
 390
 391
 392     _type "playlist" indicates multiple videos.
 393     There must be a key "entries", which is a list, an iterable, or a PagedList
 394     object, each element of which is a valid dictionary by this specification.
 395
  396     Additionally, playlists can have "id", "title", and any other relevant
 397     attributes with the same semantics as videos (see above).
 398
 399     It can also have the following optional fields:
 400
 401     playlist_count: The total number of videos in a playlist. If not given,
 402                     YoutubeDL tries to calculate it from "entries"
 403
 404
 405     _type "multi_video" indicates that there are multiple videos that
  406     form a single show, for example multiple acts of an opera or TV episode.
 407     It must have an entries key like a playlist and contain all the keys
 408     required for a video at the same time.
 409
 410
 411     _type "url" indicates that the video must be extracted from another
 412     location, possibly by a different extractor. Its only required key is:
 413     "url" - the next URL to extract.
 414     The key "ie_key" can be set to the class name (minus the trailing "IE",
 415     e.g. "Youtube") if the extractor class is known in advance.
 416     Additionally, the dictionary may have any properties of the resolved entity
 417     known in advance, for example "title" if the title of the referred video is
 418     known ahead of time.
 419
 420
 421     _type "url_transparent" entities have the same specification as "url", but
 422     indicate that the given additional information is more precise than the one
 423     associated with the resolved URL.
 424     This is useful when a site employs a video service that hosts the video and
 425     its technical metadata, but that video service does not embed a useful
 426     title, description etc.
 427
 428
 429     Subclasses of this should define a _VALID_URL regexp and, re-define the
 430     _real_extract() and (optionally) _real_initialize() methods.
 431     Probably, they should also be added to the list of extractors.
 432
 433     Subclasses may also override suitable() if necessary, but ensure the function
 434     signature is preserved and that this function imports everything it needs
 435     (except other extractors), so that lazy_extractors works correctly
 436
 437     _GEO_BYPASS attribute may be set to False in order to disable
 438     geo restriction bypass mechanisms for a particular extractor.
 439     Though it won't disable explicit geo restriction bypass based on
 440     country code provided with geo_bypass_country.
 441
 442     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 443     countries for this extractor. One of these countries will be used by
 444     geo restriction bypass mechanism right away in order to bypass
 445     geo restriction, of course, if the mechanism is not disabled.
 446
 447     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 448     IP blocks in CIDR notation for this extractor. One of these IP blocks
 449     will be used by geo restriction bypass mechanism similarly
 450     to _GEO_COUNTRIES.
 451
 452     The _WORKING attribute should be set to False for broken IEs
 453     in order to warn the users and skip the tests.
 454     """
 455
    # Class-level defaults; __init__ re-assigns _ready and _x_forwarded_for_ip
    # on every instance.

    # True once _real_initialize() has been run by initialize()
    _ready = False
    # Downloader (a YoutubeDL instance, per the constructor docstring) used
    # e.g. for write_debug(); presumably assigned by set_downloader()
    _downloader = None
    # Fake IP chosen by _initialize_geo_bypass() for the X-Forwarded-For header
    _x_forwarded_for_ip = None
    # Geo restriction bypass configuration, described in the class docstring
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    # Set to False for broken extractors (see the class docstring)
    _WORKING = True

    # Messages telling the user which options provide authentication.
    # NOTE(review): keys look like the supported login method - confirm
    # against the code that reads this mapping
    _LOGIN_HINTS = {
        'any': 'Use --cookies, --username and --password, or --netrc to provide account credentials',
        'cookies': (
            'Use --cookies-from-browser or --cookies for the authentication. '
            'See  https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl  for how to manually pass cookies'),
        'password': 'Use --username and --password, or --netrc to provide account credentials',
    }
 471
 472     def __init__(self, downloader=None):
 473         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 474         If a downloader is not passed during initialization,
 475         it must be set using "set_downloader()" before "extract()" is called"""
 476         self._ready = False
 477         self._x_forwarded_for_ip = None
 478         self._printed_messages = set()
 479         self.set_downloader(downloader)
 480
 481     @classmethod
 482     def _match_valid_url(cls, url):
 483         # This does not use has/getattr intentionally - we want to know whether
 484         # we have cached the regexp for *this* class, whereas getattr would also
 485         # match the superclass
 486         if '_VALID_URL_RE' not in cls.__dict__:
 487             if '_VALID_URL' not in cls.__dict__:
 488                 cls._VALID_URL = cls._make_valid_url()
 489             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 490         return cls._VALID_URL_RE.match(url)
 491
 492     @classmethod
 493     def suitable(cls, url):
 494         """Receives a URL and returns True if suitable for this IE."""
 495         # This function must import everything it needs (except other extractors),
 496         # so that lazy_extractors works correctly
 497         return cls._match_valid_url(url) is not None
 498
 499     @classmethod
 500     def _match_id(cls, url):
 501         return cls._match_valid_url(url).group('id')
 502
 503     @classmethod
 504     def get_temp_id(cls, url):
 505         try:
 506             return cls._match_id(url)
 507         except (IndexError, AttributeError):
 508             return None
 509
 510     @classmethod
 511     def working(cls):
 512         """Getter method for _WORKING."""
 513         return cls._WORKING
 514
 515     def initialize(self):
 516         """Initializes an instance (authentication, etc)."""
 517         self._printed_messages = set()
 518         self._initialize_geo_bypass({
 519             'countries': self._GEO_COUNTRIES,
 520             'ip_blocks': self._GEO_IP_BLOCKS,
 521         })
 522         if not self._ready:
 523             self._real_initialize()
 524             self._ready = True
 525
 526     def _initialize_geo_bypass(self, geo_bypass_context):
 527         """
 528         Initialize geo restriction bypass mechanism.
 529
 530         This method is used to initialize geo bypass mechanism based on faking
 531         X-Forwarded-For HTTP header. A random country from provided country list
 532         is selected and a random IP belonging to this country is generated. This
 533         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 534         HTTP requests.
 535
 536         This method will be used for initial geo bypass mechanism initialization
 537         during the instance initialization with _GEO_COUNTRIES and
 538         _GEO_IP_BLOCKS.
 539
 540         You may also manually call it from extractor's code if geo bypass
 541         information is not available beforehand (e.g. obtained during
 542         extraction) or due to some other reason. In this case you should pass
 543         this information in geo bypass context passed as first argument. It may
 544         contain following fields:
 545
 546         countries:  List of geo unrestricted countries (similar
 547                     to _GEO_COUNTRIES)
 548         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 549                     (similar to _GEO_IP_BLOCKS)
 550
 551         """
 552         if not self._x_forwarded_for_ip:
 553
 554             # Geo bypass mechanism is explicitly disabled by user
 555             if not self.get_param('geo_bypass', True):
 556                 return
 557
 558             if not geo_bypass_context:
 559                 geo_bypass_context = {}
 560
 561             # Backward compatibility: previously _initialize_geo_bypass
 562             # expected a list of countries, some 3rd party code may still use
 563             # it this way
 564             if isinstance(geo_bypass_context, (list, tuple)):
 565                 geo_bypass_context = {
 566                     'countries': geo_bypass_context,
 567                 }
 568
 569             # The whole point of geo bypass mechanism is to fake IP
 570             # as X-Forwarded-For HTTP header based on some IP block or
 571             # country code.
 572
 573             # Path 1: bypassing based on IP block in CIDR notation
 574
 575             # Explicit IP block specified by user, use it right away
 576             # regardless of whether extractor is geo bypassable or not
 577             ip_block = self.get_param('geo_bypass_ip_block', None)
 578
 579             # Otherwise use random IP block from geo bypass context but only
 580             # if extractor is known as geo bypassable
 581             if not ip_block:
 582                 ip_blocks = geo_bypass_context.get('ip_blocks')
 583                 if self._GEO_BYPASS and ip_blocks:
 584                     ip_block = random.choice(ip_blocks)
 585
 586             if ip_block:
 587                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 588                 self._downloader.write_debug(
 589                     '[debug] Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
 590                 return
 591
 592             # Path 2: bypassing based on country code
 593
 594             # Explicit country code specified by user, use it right away
 595             # regardless of whether extractor is geo bypassable or not
 596             country = self.get_param('geo_bypass_country', None)
 597
 598             # Otherwise use random country code from geo bypass context but
 599             # only if extractor is known as geo bypassable
 600             if not country:
 601                 countries = geo_bypass_context.get('countries')
 602                 if self._GEO_BYPASS and countries:
 603                     country = random.choice(countries)
 604
 605             if country:
 606                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 607                 self._downloader.write_debug(
 608                     'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))
 609
 610     def extract(self, url):
 611         """Extracts URL information and returns it in list of dicts."""
 612         try:
 613             for _ in range(2):
 614                 try:
 615                     self.initialize()
 616                     self.write_debug('Extracting URL: %s' % url)
 617                     ie_result = self._real_extract(url)
 618                     if ie_result is None:
 619                         return None
 620                     if self._x_forwarded_for_ip:
 621                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 622                     subtitles = ie_result.get('subtitles')
 623                     if (subtitles and 'live_chat' in subtitles
 624                             and 'no-live-chat' in self.get_param('compat_opts', [])):
 625                         del subtitles['live_chat']
 626                     return ie_result
 627                 except GeoRestrictedError as e:
 628                     if self.__maybe_fake_ip_and_retry(e.countries):
 629                         continue
 630                     raise
 631         except UnsupportedError:
 632             raise
 633         except ExtractorError as e:
 634             kwargs = {
 635                 'video_id': e.video_id or self.get_temp_id(url),
 636                 'ie': self.IE_NAME,
 637                 'tb': e.traceback or sys.exc_info()[2],
 638                 'expected': e.expected,
 639                 'cause': e.cause
 640             }
 641             if hasattr(e, 'countries'):
 642                 kwargs['countries'] = e.countries
 643             raise type(e)(e.orig_msg, **kwargs)
 644         except compat_http_client.IncompleteRead as e:
 645             raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
 646         except (KeyError, StopIteration) as e:
 647             raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 648
 649     def __maybe_fake_ip_and_retry(self, countries):
 650         if (not self.get_param('geo_bypass_country', None)
 651                 and self._GEO_BYPASS
 652                 and self.get_param('geo_bypass', True)
 653                 and not self._x_forwarded_for_ip
 654                 and countries):
 655             country_code = random.choice(countries)
 656             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 657             if self._x_forwarded_for_ip:
 658                 self.report_warning(
 659                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 660                     % (self._x_forwarded_for_ip, country_code.upper()))
 661                 return True
 662         return False
 663
 664     def set_downloader(self, downloader):
 665         """Sets a YoutubeDL instance as the downloader for this IE."""
 666         self._downloader = downloader
 667
 668     def _real_initialize(self):
 669         """Real initialization process. Redefine in subclasses."""
 670         pass
 671
 672     def _real_extract(self, url):
 673         """Real extraction process. Redefine in subclasses."""
 674         raise NotImplementedError('This method must be implemented by subclasses')
 675
 676     @classmethod
 677     def ie_key(cls):
 678         """A string for getting the InfoExtractor with get_info_extractor"""
 679         return cls.__name__[:-2]
 680
 681     @property
 682     def IE_NAME(self):
 683         return compat_str(type(self).__name__[:-2])
 684
 685     @staticmethod
 686     def __can_accept_status_code(err, expected_status):
 687         assert isinstance(err, compat_urllib_error.HTTPError)
 688         if expected_status is None:
 689             return False
 690         elif callable(expected_status):
 691             return expected_status(err.code) is True
 692         else:
 693             return err.code in variadic(expected_status)
 694
 695     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
 696         """
 697         Return the response handle.
 698
 699         See _download_webpage docstring for arguments specification.
 700         """
 701         if not self._downloader._first_webpage_request:
 702             sleep_interval = self.get_param('sleep_interval_requests') or 0
 703             if sleep_interval > 0:
 704                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 705                 time.sleep(sleep_interval)
 706         else:
 707             self._downloader._first_webpage_request = False
 708
 709         if note is None:
 710             self.report_download_webpage(video_id)
 711         elif note is not False:
 712             if video_id is None:
 713                 self.to_screen('%s' % (note,))
 714             else:
 715                 self.to_screen('%s: %s' % (video_id, note))
 716
 717         # Some sites check X-Forwarded-For HTTP header in order to figure out
 718         # the origin of the client behind proxy. This allows bypassing geo
 719         # restriction by faking this header's value to IP that belongs to some
 720         # geo unrestricted country. We will do so once we encounter any
 721         # geo restriction error.
 722         if self._x_forwarded_for_ip:
 723             if 'X-Forwarded-For' not in headers:
 724                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 725
 726         if isinstance(url_or_request, compat_urllib_request.Request):
 727             url_or_request = update_Request(
 728                 url_or_request, data=data, headers=headers, query=query)
 729         else:
 730             if query:
 731                 url_or_request = update_url_query(url_or_request, query)
 732             if data is not None or headers:
 733                 url_or_request = sanitized_Request(url_or_request, data, headers)
 734         try:
 735             return self._downloader.urlopen(url_or_request)
 736         except network_exceptions as err:
 737             if isinstance(err, compat_urllib_error.HTTPError):
 738                 if self.__can_accept_status_code(err, expected_status):
 739                     # Retain reference to error to prevent file object from
 740                     # being closed before it can be read. Works around the
 741                     # effects of <https://bugs.python.org/issue15002>
 742                     # introduced in Python 3.4.1.
 743                     err.fp._error = err
 744                     return err.fp
 745
 746             if errnote is False:
 747                 return False
 748             if errnote is None:
 749                 errnote = 'Unable to download webpage'
 750
 751             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 752             if fatal:
 753                 raise ExtractorError(errmsg, cause=err)
 754             else:
 755                 self.report_warning(errmsg)
 756                 return False
 757
 758     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
 759         """
 760         Return a tuple (page content as string, URL handle).
 761
 762         See _download_webpage docstring for arguments specification.
 763         """
 764         # Strip hashes from the URL (#1038)
 765         if isinstance(url_or_request, (compat_str, str)):
 766             url_or_request = url_or_request.partition('#')[0]
 767
 768         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 769         if urlh is False:
 770             assert not fatal
 771             return False
 772         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 773         return (content, urlh)
 774
 775     @staticmethod
 776     def _guess_encoding_from_content(content_type, webpage_bytes):
 777         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 778         if m:
 779             encoding = m.group(1)
 780         else:
 781             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 782                           webpage_bytes[:1024])
 783             if m:
 784                 encoding = m.group(1).decode('ascii')
 785             elif webpage_bytes.startswith(b'\xff\xfe'):
 786                 encoding = 'utf-16'
 787             else:
 788                 encoding = 'utf-8'
 789
 790         return encoding
 791
 792     def __check_blocked(self, content):
 793         first_block = content[:512]
 794         if ('<title>Access to this site is blocked</title>' in content
 795                 and 'Websense' in first_block):
 796             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 797             blocked_iframe = self._html_search_regex(
 798                 r'<iframe src="([^"]+)"', content,
 799                 'Websense information URL', default=None)
 800             if blocked_iframe:
 801                 msg += ' Visit %s for more details' % blocked_iframe
 802             raise ExtractorError(msg, expected=True)
 803         if '<title>The URL you requested has been blocked</title>' in first_block:
 804             msg = (
 805                 'Access to this webpage has been blocked by Indian censorship. '
 806                 'Use a VPN or proxy server (with --proxy) to route around it.')
 807             block_msg = self._html_search_regex(
 808                 r'</h1><p>(.*?)</p>',
 809                 content, 'block message', default=None)
 810             if block_msg:
 811                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 812             raise ExtractorError(msg, expected=True)
 813         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 814                 and 'blocklist.rkn.gov.ru' in content):
 815             raise ExtractorError(
 816                 'Access to this webpage has been blocked by decision of the Russian government. '
 817                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 818                 expected=True)
 819
 820     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 821         content_type = urlh.headers.get('Content-Type', '')
 822         webpage_bytes = urlh.read()
 823         if prefix is not None:
 824             webpage_bytes = prefix + webpage_bytes
 825         if not encoding:
 826             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 827         if self.get_param('dump_intermediate_pages', False):
 828             self.to_screen('Dumping request to ' + urlh.geturl())
 829             dump = base64.b64encode(webpage_bytes).decode('ascii')
 830             self._downloader.to_screen(dump)
 831         if self.get_param('write_pages', False):
 832             basen = '%s_%s' % (video_id, urlh.geturl())
 833             trim_length = self.get_param('trim_file_name') or 240
 834             if len(basen) > trim_length:
 835                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 836                 basen = basen[:trim_length - len(h)] + h
 837             raw_filename = basen + '.dump'
 838             filename = sanitize_filename(raw_filename, restricted=True)
 839             self.to_screen('Saving request to ' + filename)
 840             # Working around MAX_PATH limitation on Windows (see
 841             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 842             if compat_os_name == 'nt':
 843                 absfilepath = os.path.abspath(filename)
 844                 if len(absfilepath) > 259:
 845                     filename = '\\\\?\\' + absfilepath
 846             with open(filename, 'wb') as outf:
 847                 outf.write(webpage_bytes)
 848
 849         try:
 850             content = webpage_bytes.decode(encoding, 'replace')
 851         except LookupError:
 852             content = webpage_bytes.decode('utf-8', 'replace')
 853
 854         self.__check_blocked(content)
 855
 856         return content
 857
 858     def _download_webpage(
 859             self, url_or_request, video_id, note=None, errnote=None,
 860             fatal=True, tries=1, timeout=5, encoding=None, data=None,
 861             headers={}, query={}, expected_status=None):
 862         """
 863         Return the data of the page as a string.
 864
 865         Arguments:
 866         url_or_request -- plain text URL as a string or
 867             a compat_urllib_request.Requestobject
 868         video_id -- Video/playlist/item identifier (string)
 869
 870         Keyword arguments:
 871         note -- note printed before downloading (string)
 872         errnote -- note printed in case of an error (string)
 873         fatal -- flag denoting whether error should be considered fatal,
 874             i.e. whether it should cause ExtractionError to be raised,
 875             otherwise a warning will be reported and extraction continued
 876         tries -- number of tries
 877         timeout -- sleep interval between tries
 878         encoding -- encoding for a page content decoding, guessed automatically
 879             when not explicitly specified
 880         data -- POST data (bytes)
 881         headers -- HTTP headers (dict)
 882         query -- URL query (dict)
 883         expected_status -- allows to accept failed HTTP requests (non 2xx
 884             status code) by explicitly specifying a set of accepted status
 885             codes. Can be any of the following entities:
 886                 - an integer type specifying an exact failed status code to
 887                   accept
 888                 - a list or a tuple of integer types specifying a list of
 889                   failed status codes to accept
 890                 - a callable accepting an actual failed status code and
 891                   returning True if it should be accepted
 892             Note that this argument does not affect success status codes (2xx)
 893             which are always accepted.
 894         """
 895
 896         success = False
 897         try_count = 0
 898         while success is False:
 899             try:
 900                 res = self._download_webpage_handle(
 901                     url_or_request, video_id, note, errnote, fatal,
 902                     encoding=encoding, data=data, headers=headers, query=query,
 903                     expected_status=expected_status)
 904                 success = True
 905             except compat_http_client.IncompleteRead as e:
 906                 try_count += 1
 907                 if try_count >= tries:
 908                     raise e
 909                 self._sleep(timeout, video_id)
 910         if res is False:
 911             return res
 912         else:
 913             content, _ = res
 914             return content
 915
 916     def _download_xml_handle(
 917             self, url_or_request, video_id, note='Downloading XML',
 918             errnote='Unable to download XML', transform_source=None,
 919             fatal=True, encoding=None, data=None, headers={}, query={},
 920             expected_status=None):
 921         """
 922         Return a tuple (xml as an compat_etree_Element, URL handle).
 923
 924         See _download_webpage docstring for arguments specification.
 925         """
 926         res = self._download_webpage_handle(
 927             url_or_request, video_id, note, errnote, fatal=fatal,
 928             encoding=encoding, data=data, headers=headers, query=query,
 929             expected_status=expected_status)
 930         if res is False:
 931             return res
 932         xml_string, urlh = res
 933         return self._parse_xml(
 934             xml_string, video_id, transform_source=transform_source,
 935             fatal=fatal), urlh
 936
 937     def _download_xml(
 938             self, url_or_request, video_id,
 939             note='Downloading XML', errnote='Unable to download XML',
 940             transform_source=None, fatal=True, encoding=None,
 941             data=None, headers={}, query={}, expected_status=None):
 942         """
 943         Return the xml as an compat_etree_Element.
 944
 945         See _download_webpage docstring for arguments specification.
 946         """
 947         res = self._download_xml_handle(
 948             url_or_request, video_id, note=note, errnote=errnote,
 949             transform_source=transform_source, fatal=fatal, encoding=encoding,
 950             data=data, headers=headers, query=query,
 951             expected_status=expected_status)
 952         return res if res is False else res[0]
 953
 954     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 955         if transform_source:
 956             xml_string = transform_source(xml_string)
 957         try:
 958             return compat_etree_fromstring(xml_string.encode('utf-8'))
 959         except compat_xml_parse_error as ve:
 960             errmsg = '%s: Failed to parse XML ' % video_id
 961             if fatal:
 962                 raise ExtractorError(errmsg, cause=ve)
 963             else:
 964                 self.report_warning(errmsg + str(ve))
 965
 966     def _download_json_handle(
 967             self, url_or_request, video_id, note='Downloading JSON metadata',
 968             errnote='Unable to download JSON metadata', transform_source=None,
 969             fatal=True, encoding=None, data=None, headers={}, query={},
 970             expected_status=None):
 971         """
 972         Return a tuple (JSON object, URL handle).
 973
 974         See _download_webpage docstring for arguments specification.
 975         """
 976         res = self._download_webpage_handle(
 977             url_or_request, video_id, note, errnote, fatal=fatal,
 978             encoding=encoding, data=data, headers=headers, query=query,
 979             expected_status=expected_status)
 980         if res is False:
 981             return res
 982         json_string, urlh = res
 983         return self._parse_json(
 984             json_string, video_id, transform_source=transform_source,
 985             fatal=fatal), urlh
 986
 987     def _download_json(
 988             self, url_or_request, video_id, note='Downloading JSON metadata',
 989             errnote='Unable to download JSON metadata', transform_source=None,
 990             fatal=True, encoding=None, data=None, headers={}, query={},
 991             expected_status=None):
 992         """
 993         Return the JSON object as a dict.
 994
 995         See _download_webpage docstring for arguments specification.
 996         """
 997         res = self._download_json_handle(
 998             url_or_request, video_id, note=note, errnote=errnote,
 999             transform_source=transform_source, fatal=fatal, encoding=encoding,
1000             data=data, headers=headers, query=query,
1001             expected_status=expected_status)
1002         return res if res is False else res[0]
1003
1004     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
1005         if transform_source:
1006             json_string = transform_source(json_string)
1007         try:
1008             return json.loads(json_string)
1009         except ValueError as ve:
1010             errmsg = '%s: Failed to parse JSON ' % video_id
1011             if fatal:
1012                 raise ExtractorError(errmsg, cause=ve)
1013             else:
1014                 self.report_warning(errmsg + str(ve))
1015
1016     def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
1017         return self._parse_json(
1018             data[data.find('{'):data.rfind('}') + 1],
1019             video_id, transform_source, fatal)
1020
1021     def _download_socket_json_handle(
1022             self, url_or_request, video_id, note='Polling socket',
1023             errnote='Unable to poll socket', transform_source=None,
1024             fatal=True, encoding=None, data=None, headers={}, query={},
1025             expected_status=None):
1026         """
1027         Return a tuple (JSON object, URL handle).
1028
1029         See _download_webpage docstring for arguments specification.
1030         """
1031         res = self._download_webpage_handle(
1032             url_or_request, video_id, note, errnote, fatal=fatal,
1033             encoding=encoding, data=data, headers=headers, query=query,
1034             expected_status=expected_status)
1035         if res is False:
1036             return res
1037         webpage, urlh = res
1038         return self._parse_socket_response_as_json(
1039             webpage, video_id, transform_source=transform_source,
1040             fatal=fatal), urlh
1041
1042     def _download_socket_json(
1043             self, url_or_request, video_id, note='Polling socket',
1044             errnote='Unable to poll socket', transform_source=None,
1045             fatal=True, encoding=None, data=None, headers={}, query={},
1046             expected_status=None):
1047         """
1048         Return the JSON object as a dict.
1049
1050         See _download_webpage docstring for arguments specification.
1051         """
1052         res = self._download_socket_json_handle(
1053             url_or_request, video_id, note=note, errnote=errnote,
1054             transform_source=transform_source, fatal=fatal, encoding=encoding,
1055             data=data, headers=headers, query=query,
1056             expected_status=expected_status)
1057         return res if res is False else res[0]
1058
1059     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1060         idstr = format_field(video_id, template='%s: ')
1061         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1062         if only_once:
1063             if f'WARNING: {msg}' in self._printed_messages:
1064                 return
1065             self._printed_messages.add(f'WARNING: {msg}')
1066         self._downloader.report_warning(msg, *args, **kwargs)
1067
1068     def to_screen(self, msg, *args, **kwargs):
1069         """Print msg to screen, prefixing it with '[ie_name]'"""
1070         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1071
1072     def write_debug(self, msg, *args, **kwargs):
1073         self._downloader.write_debug('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1074
1075     def get_param(self, name, default=None, *args, **kwargs):
1076         if self._downloader:
1077             return self._downloader.params.get(name, default, *args, **kwargs)
1078         return default
1079
1080     def report_drm(self, video_id, partial=False):
1081         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1082
1083     def report_extraction(self, id_or_name):
1084         """Report information extraction."""
1085         self.to_screen('%s: Extracting information' % id_or_name)
1086
1087     def report_download_webpage(self, video_id):
1088         """Report webpage download."""
1089         self.to_screen('%s: Downloading webpage' % video_id)
1090
1091     def report_age_confirmation(self):
1092         """Report attempt to confirm age."""
1093         self.to_screen('Confirming age')
1094
1095     def report_login(self):
1096         """Report attempt to log in."""
1097         self.to_screen('Logging in')
1098
1099     def raise_login_required(
1100             self, msg='This video is only available for registered users',
1101             metadata_available=False, method='any'):
1102         if metadata_available and (
1103                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1104             self.report_warning(msg)
1105             return
1106         if method is not None:
1107             msg = '%s. %s' % (msg, self._LOGIN_HINTS[method])
1108         raise ExtractorError(msg, expected=True)
1109
1110     def raise_geo_restricted(
1111             self, msg='This video is not available from your location due to geo restriction',
1112             countries=None, metadata_available=False):
1113         if metadata_available and (
1114                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1115             self.report_warning(msg)
1116         else:
1117             raise GeoRestrictedError(msg, countries=countries)
1118
1119     def raise_no_formats(self, msg, expected=False, video_id=None):
1120         if expected and (
1121                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1122             self.report_warning(msg, video_id)
1123         elif isinstance(msg, ExtractorError):
1124             raise msg
1125         else:
1126             raise ExtractorError(msg, expected=expected, video_id=video_id)
1127
1128     # Methods for following #608
1129     @staticmethod
1130     def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1131         """Returns a URL that points to a page that should be processed"""
1132         if ie is not None:
1133             kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1134         if video_id is not None:
1135             kwargs['id'] = video_id
1136         if video_title is not None:
1137             kwargs['title'] = video_title
1138         return {
1139             **kwargs,
1140             '_type': 'url_transparent' if url_transparent else 'url',
1141             'url': url,
1142         }
1143
1144     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None, video_kwargs=None, **kwargs):
1145         urls = (self.url_result(self._proto_relative_url(m), ie, **(video_kwargs or {}))
1146                 for m in orderedSet(map(getter, matches) if getter else matches))
1147         return self.playlist_result(urls, playlist_id, playlist_title, **kwargs)
1148
1149     @staticmethod
1150     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1151         """Returns a playlist"""
1152         if playlist_id:
1153             kwargs['id'] = playlist_id
1154         if playlist_title:
1155             kwargs['title'] = playlist_title
1156         if playlist_description is not None:
1157             kwargs['description'] = playlist_description
1158         return {
1159             **kwargs,
1160             '_type': 'multi_video' if multi_video else 'playlist',
1161             'entries': entries,
1162         }
1163
1164     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1165         """
1166         Perform a regex search on the given string, using a single or a list of
1167         patterns returning the first matching group.
1168         In case of failure return a default value or raise a WARNING or a
1169         RegexNotFoundError, depending on fatal, specifying the field name.
1170         """
1171         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
1172             mobj = re.search(pattern, string, flags)
1173         else:
1174             for p in pattern:
1175                 mobj = re.search(p, string, flags)
1176                 if mobj:
1177                     break
1178
1179         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1180
1181         if mobj:
1182             if group is None:
1183                 # return the first matching group
1184                 return next(g for g in mobj.groups() if g is not None)
1185             elif isinstance(group, (list, tuple)):
1186                 return tuple(mobj.group(g) for g in group)
1187             else:
1188                 return mobj.group(group)
1189         elif default is not NO_DEFAULT:
1190             return default
1191         elif fatal:
1192             raise RegexNotFoundError('Unable to extract %s' % _name)
1193         else:
1194             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1195             return None
1196
1197     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1198         """
1199         Like _search_regex, but strips HTML tags and unescapes entities.
1200         """
1201         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1202         if res:
1203             return clean_html(res).strip()
1204         else:
1205             return res
1206
1207     def _get_netrc_login_info(self, netrc_machine=None):
1208         username = None
1209         password = None
1210         netrc_machine = netrc_machine or self._NETRC_MACHINE
1211
1212         if self.get_param('usenetrc', False):
1213             try:
1214                 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1215                 if os.path.isdir(netrc_file):
1216                     netrc_file = os.path.join(netrc_file, '.netrc')
1217                 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1218                 if info is not None:
1219                     username = info[0]
1220                     password = info[2]
1221                 else:
1222                     raise netrc.NetrcParseError(
1223                         'No authenticators for %s' % netrc_machine)
1224             except (IOError, netrc.NetrcParseError) as err:
1225                 self.report_warning(
1226                     'parsing .netrc: %s' % error_to_compat_str(err))
1227
1228         return username, password
1229
1230     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1231         """
1232         Get the login info as (username, password)
1233         First look for the manually specified credentials using username_option
1234         and password_option as keys in params dictionary. If no such credentials
1235         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1236         value.
1237         If there's no info available, return (None, None)
1238         """
1239
1240         # Attempt to use provided username and password or .netrc data
1241         username = self.get_param(username_option)
1242         if username is not None:
1243             password = self.get_param(password_option)
1244         else:
1245             username, password = self._get_netrc_login_info(netrc_machine)
1246
1247         return username, password
1248
1249     def _get_tfa_info(self, note='two-factor verification code'):
1250         """
1251         Get the two-factor authentication info
1252         TODO - asking the user will be required for sms/phone verify
1253         currently just uses the command line option
1254         If there's no info available, return None
1255         """
1256
1257         tfa = self.get_param('twofactor')
1258         if tfa is not None:
1259             return tfa
1260
1261         return compat_getpass('Type %s and press [Return]: ' % note)
1262
1263     # Helper functions for extracting OpenGraph info
1264     @staticmethod
1265     def _og_regexes(prop):
1266         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1267         property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
1268                        % {'prop': re.escape(prop)})
1269         template = r'<meta[^>]+?%s[^>]+?%s'
1270         return [
1271             template % (property_re, content_re),
1272             template % (content_re, property_re),
1273         ]
1274
1275     @staticmethod
1276     def _meta_regex(prop):
1277         return r'''(?isx)<meta
1278                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1279                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1280
1281     def _og_search_property(self, prop, html, name=None, **kargs):
1282         prop = variadic(prop)
1283         if name is None:
1284             name = 'OpenGraph %s' % prop[0]
1285         og_regexes = []
1286         for p in prop:
1287             og_regexes.extend(self._og_regexes(p))
1288         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1289         if escaped is None:
1290             return None
1291         return unescapeHTML(escaped)
1292
1293     def _og_search_thumbnail(self, html, **kargs):
1294         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1295
1296     def _og_search_description(self, html, **kargs):
1297         return self._og_search_property('description', html, fatal=False, **kargs)
1298
1299     def _og_search_title(self, html, **kargs):
1300         kargs.setdefault('fatal', False)
1301         return self._og_search_property('title', html, **kargs)
1302
1303     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1304         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1305         if secure:
1306             regexes = self._og_regexes('video:secure_url') + regexes
1307         return self._html_search_regex(regexes, html, name, **kargs)
1308
1309     def _og_search_url(self, html, **kargs):
1310         return self._og_search_property('url', html, **kargs)
1311
1312     def _html_extract_title(self, html, name, **kwargs):
1313         return self._html_search_regex(
1314             r'(?s)<title>(.*?)</title>', html, name, **kwargs)
1315
1316     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1317         name = variadic(name)
1318         if display_name is None:
1319             display_name = name[0]
1320         return self._html_search_regex(
1321             [self._meta_regex(n) for n in name],
1322             html, display_name, fatal=fatal, group='content', **kwargs)
1323
1324     def _dc_search_uploader(self, html):
1325         return self._html_search_meta('dc.creator', html, 'uploader')
1326
1327     def _rta_search(self, html):
1328         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1329         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1330                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1331                      html):
1332             return 18
1333         return 0
1334
1335     def _media_rating_search(self, html):
1336         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1337         rating = self._html_search_meta('rating', html)
1338
1339         if not rating:
1340             return None
1341
1342         RATING_TABLE = {
1343             'safe for kids': 0,
1344             'general': 8,
1345             '14 years': 14,
1346             'mature': 17,
1347             'restricted': 19,
1348         }
1349         return RATING_TABLE.get(rating.lower())
1350
1351     def _family_friendly_search(self, html):
1352         # See http://schema.org/VideoObject
1353         family_friendly = self._html_search_meta(
1354             'isFamilyFriendly', html, default=None)
1355
1356         if not family_friendly:
1357             return None
1358
1359         RATING_TABLE = {
1360             '1': 0,
1361             'true': 0,
1362             '0': 18,
1363             'false': 18,
1364         }
1365         return RATING_TABLE.get(family_friendly.lower())
1366
1367     def _twitter_search_player(self, html):
1368         return self._html_search_meta('twitter:player', html,
1369                                       'twitter card player')
1370
1371     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1372         json_ld_list = list(re.finditer(JSON_LD_RE, html))
1373         default = kwargs.get('default', NO_DEFAULT)
1374         # JSON-LD may be malformed and thus `fatal` should be respected.
1375         # At the same time `default` may be passed that assumes `fatal=False`
1376         # for _search_regex. Let's simulate the same behavior here as well.
1377         fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
1378         json_ld = []
1379         for mobj in json_ld_list:
1380             json_ld_item = self._parse_json(
1381                 mobj.group('json_ld'), video_id, fatal=fatal)
1382             if not json_ld_item:
1383                 continue
1384             if isinstance(json_ld_item, dict):
1385                 json_ld.append(json_ld_item)
1386             elif isinstance(json_ld_item, (list, tuple)):
1387                 json_ld.extend(json_ld_item)
1388         if json_ld:
1389             json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1390         if json_ld:
1391             return json_ld
1392         if default is not NO_DEFAULT:
1393             return default
1394         elif fatal:
1395             raise RegexNotFoundError('Unable to extract JSON-LD')
1396         else:
1397             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1398             return {}
1399
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """
        Extract an info dict from JSON-LD data.

        `json_ld` may be a string (parsed with _parse_json, honouring
        `fatal`), a single dict, or a list/tuple of entries. When
        `expected_type` is given, only entries whose '@type' equals it are
        considered and traversal stops after the first such entry.
        Returns the collected fields with None values removed; empty or
        unsupported input yields an empty dict.
        """
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        # Shared accumulator that all nested helpers below write into
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            json_ld = [json_ld]

        # Last path component of 'interactionType' -> prefix of the '<kind>_count' info key
        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def extract_interaction_type(e):
            """Return 'interactionType' as a string; a dict value is reduced to its '@type'"""
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            """Fill the '<kind>_count' fields of info from the 'interactionStatistic' of e"""
            interaction_statistic = e.get('interactionStatistic')
            # A single counter may be given directly instead of a list
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not isinstance(is_e, dict):
                    continue
                if is_e.get('@type') != 'InteractionCounter':
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For interaction count some sites provide string instead of
                # an integer (as per spec) with non digit characters (e.g. ",")
                # so extracting count with more relaxed str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                # Only the part after the last '/' is looked up, so URL-style types work too
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                # A count that is already present is never overwritten
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_chapter_information(e):
            """Build info['chapters'] from the 'hasPart' entries of e whose '@type' is 'Clip'"""
            chapters = [{
                'title': part.get('name'),
                'start_time': part.get('startOffset'),
                'end_time': part.get('endOffset'),
            } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
            # Walk (previous, current, next) triples; a dummy predecessor ending at 0
            # precedes the first chapter. Since zip() stops at the shortest input
            # (chapters[1:]), the last chapter is not visited by this loop.
            for idx, (last_c, current_c, next_c) in enumerate(zip(
                    [{'end_time': 0}] + chapters, chapters, chapters[1:])):
                # Missing (falsy) boundaries are borrowed from the neighbouring chapters
                current_c['end_time'] = current_c['end_time'] or next_c['start_time']
                current_c['start_time'] = current_c['start_time'] or last_c['end_time']
                if None in current_c.values():
                    self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
                    return
            if chapters:
                # The last chapter falls back to the duration stored in info
                chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
                info['chapters'] = chapters

        def extract_video_object(e):
            """Copy the fields of a 'VideoObject' entry into info, then its statistics and chapters"""
            assert e['@type'] == 'VideoObject'
            author = e.get('author')
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnails': [{'url': url_or_none(url)}
                               for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))],
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of 'Organization' or 'Person' types.
                # both types can have 'name' property(inherited from 'Thing' type). [1]
                # however some websites are using 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
                'filesize': float_or_none(e.get('contentSize')),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })
            extract_interaction_statistic(e)
            # Runs after info.update() above, which provides the info['duration'] it reads
            extract_chapter_information(e)

        def traverse_json_ld(json_ld, at_top_level=True):
            """Process the entries of json_ld in order, recursing once into '@graph' containers"""
            for e in json_ld:
                # Top-level entries need an '@context'; entries inside a graph do not
                if at_top_level and '@context' not in e:
                    continue
                # An entry holding nothing but '@context' and '@graph' is a container:
                # its graph is traversed and the remaining top-level entries are skipped
                if at_top_level and set(e.keys()) == {'@context', '@graph'}:
                    traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
                    break
                item_type = e.get('@type')
                if expected_type is not None and expected_type != item_type:
                    continue
                rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
                if rating is not None:
                    info['average_rating'] = rating
                if item_type in ('TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    # The episode name only serves as title if none was found so far
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type == 'Movie':
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif item_type in ('Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody') or e.get('description')),
                    })
                    # Handles 'video' given as a list; only its first element is used
                    if traverse_obj(e, ('video', 0, '@type')) == 'VideoObject':
                        extract_video_object(e['video'][0])
                elif item_type == 'VideoObject':
                    extract_video_object(e)
                    # Without an expected type keep scanning; otherwise the first hit is final
                    if expected_type is None:
                        continue
                    else:
                        break
                # Any entry type may embed a single VideoObject under 'video'
                video = e.get('video')
                if isinstance(video, dict) and video.get('@type') == 'VideoObject':
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break
        traverse_json_ld(json_ld)

        # Fields that could not be extracted are left out of the result
        return dict((k, v) for k, v in info.items() if v is not None)
1560
1561     def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1562         return self._parse_json(
1563             self._search_regex(
1564                 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1565                 webpage, 'next.js data', fatal=fatal, **kw),
1566             video_id, transform_source=transform_source, fatal=fatal)
1567
1568     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__'):
1569         ''' Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function. '''
1570         # not all website do this, but it can be changed
1571         # https://stackoverflow.com/questions/67463109/how-to-change-or-hide-nuxt-and-nuxt-keyword-in-page-source
1572         rectx = re.escape(context_name)
1573         js, arg_keys, arg_vals = self._search_regex(
1574             (r'<script>window\.%s=\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.+?)\)\);?</script>' % rectx,
1575              r'%s\(.*?\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)' % rectx),
1576             webpage, context_name, group=['js', 'arg_keys', 'arg_vals'])
1577
1578         args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
1579
1580         for key, val in args.items():
1581             if val in ('undefined', 'void 0'):
1582                 args[key] = 'null'
1583
1584         return self._parse_json(js_to_json(js, args), video_id)['data'][0]
1585
1586     @staticmethod
1587     def _hidden_inputs(html):
1588         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1589         hidden_inputs = {}
1590         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1591             attrs = extract_attributes(input)
1592             if not input:
1593                 continue
1594             if attrs.get('type') not in ('hidden', 'submit'):
1595                 continue
1596             name = attrs.get('name') or attrs.get('id')
1597             value = attrs.get('value')
1598             if name and value is not None:
1599                 hidden_inputs[name] = value
1600         return hidden_inputs
1601
1602     def _form_hidden_inputs(self, form_id, html):
1603         form = self._search_regex(
1604             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1605             html, '%s form' % form_id, group='form')
1606         return self._hidden_inputs(form)
1607
    class FormatSort:
        # Pattern for one sort token: an optional '+' prefix (group 'reverse'),
        # a field name of letters/digits/underscores (group 'field') and,
        # optionally, a ':' or '~' separator followed by a limit. Surrounding
        # spaces are allowed, and an empty token matches with field=None.
        regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

        # Default sort tokens. evaluate_params() appends them after the user's
        # and the extractor's preferences, and also scans them for 'forced'
        # and 'priority' fields.
        default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
                   'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
                   'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
        # NOTE(review): presumably the youtube-dl compatible ordering; it is not
        # referenced by the methods visible in this part of the file - confirm usage
        ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                        'height', 'width', 'proto', 'vext', 'abr', 'aext',
                        'fps', 'fs_approx', 'source', 'id')

        # Per-field configuration, read through _get_field_setting(), which
        # computes and stores a default for any key that is missing here.
        # Keys used below:
        #   type        - 'ordered', 'extractor', 'multiple', 'boolean',
        #                 'combined' or 'alias' ('field' when omitted)
        #   field       - name(s) of the underlying field(s); for an alias,
        #                 the sort field it stands for
        #   convert     - how a limit text is converted ('float', 'float_none',
        #                 'bytes', 'string')
        #   order / order_free - ranking lists for 'ordered' fields; entries
        #                 are matched as regexes when 'regex' is set
        #   forced / priority  - fields that evaluate_params() places ahead of
        #                 the user's preferences
        settings = {
            'vcodec': {'type': 'ordered', 'regex': True,
                       'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
            'acodec': {'type': 'ordered', 'regex': True,
                       'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
            'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                    'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
            'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                      'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
            'vext': {'type': 'ordered', 'field': 'video_ext',
                     'order': ('mp4', 'webm', 'flv', '', 'none'),
                     'order_free': ('webm', 'mp4', 'flv', '', 'none')},
            'aext': {'type': 'ordered', 'field': 'audio_ext',
                     'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
                     'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
            'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
            'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                           'field': ('vcodec', 'acodec'),
                           'function': lambda it: int(any(v != 'none' for v in it))},
            'ie_pref': {'priority': True, 'type': 'extractor'},
            'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
            'quality': {'convert': 'float', 'default': -1},
            'filesize': {'convert': 'bytes'},
            'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
            'id': {'convert': 'string', 'field': 'format_id'},
            'height': {'convert': 'float_none'},
            'width': {'convert': 'float_none'},
            'fps': {'convert': 'float_none'},
            'tbr': {'convert': 'float_none'},
            'vbr': {'convert': 'float_none'},
            'abr': {'convert': 'float_none'},
            'asr': {'convert': 'float_none'},
            'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

            # 'combined' entries expand to several sort fields in evaluate_params();
            # with 'same_limit' a given limit is applied to each of them unsplit
            'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
            'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
            'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
            'ext': {'type': 'combined', 'field': ('vext', 'aext')},
            # 'res' yields the smallest non-zero value among height/width, or 0 if there is none
            'res': {'type': 'multiple', 'field': ('height', 'width'),
                    'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

            # For compatibility with youtube-dl
            'format_id': {'type': 'alias', 'field': 'id'},
            'preference': {'type': 'alias', 'field': 'ie_pref'},
            'language_preference': {'type': 'alias', 'field': 'lang'},
            'source_preference': {'type': 'alias', 'field': 'source'},
            'protocol': {'type': 'alias', 'field': 'proto'},
            'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},

            # Deprecated aliases: evaluate_params() emits a deprecation warning when one is used
            'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
            'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
            'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
            'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
            'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
            'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
            'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
            'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
            'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
            'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
            'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
            'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
            'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
            'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
            'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
            'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
            'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
            'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
            'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
            'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
        }
1691
1692         def __init__(self, ie, field_preference):
1693             self._order = []
1694             self.ydl = ie._downloader
1695             self.evaluate_params(self.ydl.params, field_preference)
1696             if ie.get_param('verbose'):
1697                 self.print_verbose_info(self.ydl.write_debug)
1698
1699         def _get_field_setting(self, field, key):
1700             if field not in self.settings:
1701                 if key in ('forced', 'priority'):
1702                     return False
1703                 self.ydl.deprecation_warning(
1704                     f'Using arbitrary fields ({field}) for format sorting is deprecated '
1705                     'and may be removed in a future version')
1706                 self.settings[field] = {}
1707             propObj = self.settings[field]
1708             if key not in propObj:
1709                 type = propObj.get('type')
1710                 if key == 'field':
1711                     default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1712                 elif key == 'convert':
1713                     default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1714                 else:
1715                     default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1716                 propObj[key] = default
1717             return propObj[key]
1718
1719         def _resolve_field_value(self, field, value, convertNone=False):
1720             if value is None:
1721                 if not convertNone:
1722                     return None
1723             else:
1724                 value = value.lower()
1725             conversion = self._get_field_setting(field, 'convert')
1726             if conversion == 'ignore':
1727                 return None
1728             if conversion == 'string':
1729                 return value
1730             elif conversion == 'float_none':
1731                 return float_or_none(value)
1732             elif conversion == 'bytes':
1733                 return FileDownloader.parse_bytes(value)
1734             elif conversion == 'order':
1735                 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1736                 use_regex = self._get_field_setting(field, 'regex')
1737                 list_length = len(order_list)
1738                 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1739                 if use_regex and value is not None:
1740                     for i, regex in enumerate(order_list):
1741                         if regex and re.match(regex, value):
1742                             return list_length - i
1743                     return list_length - empty_pos  # not in list
1744                 else:  # not regex or  value = None
1745                     return list_length - (order_list.index(value) if value in order_list else empty_pos)
1746             else:
1747                 if value.isnumeric():
1748                     return float(value)
1749                 else:
1750                     self.settings[field]['convert'] = 'string'
1751                     return value
1752
1753         def evaluate_params(self, params, sort_extractor):
1754             self._use_free_order = params.get('prefer_free_formats', False)
1755             self._sort_user = params.get('format_sort', [])
1756             self._sort_extractor = sort_extractor
1757
1758             def add_item(field, reverse, closest, limit_text):
1759                 field = field.lower()
1760                 if field in self._order:
1761                     return
1762                 self._order.append(field)
1763                 limit = self._resolve_field_value(field, limit_text)
1764                 data = {
1765                     'reverse': reverse,
1766                     'closest': False if limit is None else closest,
1767                     'limit_text': limit_text,
1768                     'limit': limit}
1769                 if field in self.settings:
1770                     self.settings[field].update(data)
1771                 else:
1772                     self.settings[field] = data
1773
1774             sort_list = (
1775                 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1776                 + (tuple() if params.get('format_sort_force', False)
1777                    else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1778                 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1779
1780             for item in sort_list:
1781                 match = re.match(self.regex, item)
1782                 if match is None:
1783                     raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1784                 field = match.group('field')
1785                 if field is None:
1786                     continue
1787                 if self._get_field_setting(field, 'type') == 'alias':
1788                     alias, field = field, self._get_field_setting(field, 'field')
1789                     if self._get_field_setting(alias, 'deprecated'):
1790                         self.ydl.deprecation_warning(
1791                             f'Format sorting alias {alias} is deprecated '
1792                             f'and may be removed in a future version. Please use {field} instead')
1793                 reverse = match.group('reverse') is not None
1794                 closest = match.group('separator') == '~'
1795                 limit_text = match.group('limit')
1796
1797                 has_limit = limit_text is not None
1798                 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1799                 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1800
1801                 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1802                 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
1803                 limit_count = len(limits)
1804                 for (i, f) in enumerate(fields):
1805                     add_item(f, reverse, closest,
1806                              limits[i] if i < limit_count
1807                              else limits[0] if has_limit and not has_multiple_limits
1808                              else None)
1809
1810         def print_verbose_info(self, write_debug):
1811             if self._sort_user:
1812                 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1813             if self._sort_extractor:
1814                 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1815             write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1816                 '+' if self._get_field_setting(field, 'reverse') else '', field,
1817                 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1818                               self._get_field_setting(field, 'limit_text'),
1819                               self._get_field_setting(field, 'limit'))
1820                 if self._get_field_setting(field, 'limit_text') is not None else '')
1821                 for field in self._order if self._get_field_setting(field, 'visible')]))
1822
1823         def _calculate_field_preference_from_value(self, format, field, type, value):
1824             reverse = self._get_field_setting(field, 'reverse')
1825             closest = self._get_field_setting(field, 'closest')
1826             limit = self._get_field_setting(field, 'limit')
1827
1828             if type == 'extractor':
1829                 maximum = self._get_field_setting(field, 'max')
1830                 if value is None or (maximum is not None and value >= maximum):
1831                     value = -1
1832             elif type == 'boolean':
1833                 in_list = self._get_field_setting(field, 'in_list')
1834                 not_in_list = self._get_field_setting(field, 'not_in_list')
1835                 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1836             elif type == 'ordered':
1837                 value = self._resolve_field_value(field, value, True)
1838
1839             # try to convert to number
1840             val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1841             is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1842             if is_num:
1843                 value = val_num
1844
1845             return ((-10, 0) if value is None
1846                     else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
1847                     else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1848                     else (0, value, 0) if not reverse and (limit is None or value <= limit)
1849                     else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1850                     else (-1, value, 0))
1851
1852         def _calculate_field_preference(self, format, field):
1853             type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
1854             get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1855             if type == 'multiple':
1856                 type = 'field'  # Only 'field' is allowed in multiple for now
1857                 actual_fields = self._get_field_setting(field, 'field')
1858
1859                 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1860             else:
1861                 value = get_value(field)
1862             return self._calculate_field_preference_from_value(format, field, type, value)
1863
1864         def calculate_preference(self, format):
1865             # Determine missing protocol
1866             if not format.get('protocol'):
1867                 format['protocol'] = determine_protocol(format)
1868
1869             # Determine missing ext
1870             if not format.get('ext') and 'url' in format:
1871                 format['ext'] = determine_ext(format['url'])
1872             if format.get('vcodec') == 'none':
1873                 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1874                 format['video_ext'] = 'none'
1875             else:
1876                 format['video_ext'] = format['ext']
1877                 format['audio_ext'] = 'none'
1878             # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
1879             #    format['preference'] = -1000
1880
1881             # Determine missing bitrates
1882             if format.get('tbr') is None:
1883                 if format.get('vbr') is not None and format.get('abr') is not None:
1884                     format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1885             else:
1886                 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1887                     format['vbr'] = format.get('tbr') - format.get('abr', 0)
1888                 if format.get('acodec') != 'none' and format.get('abr') is None:
1889                     format['abr'] = format.get('tbr') - format.get('vbr', 0)
1890
1891             return tuple(self._calculate_field_preference(format, field) for field in self._order)
1892
1893     def _sort_formats(self, formats, field_preference=[]):
1894         if not formats:
1895             return
1896         format_sort = self.FormatSort(self, field_preference)
1897         formats.sort(key=lambda f: format_sort.calculate_preference(f))
1898
1899     def _check_formats(self, formats, video_id):
1900         if formats:
1901             formats[:] = filter(
1902                 lambda f: self._is_valid_url(
1903                     f['url'], video_id,
1904                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1905                 formats)
1906
1907     @staticmethod
1908     def _remove_duplicate_formats(formats):
1909         format_urls = set()
1910         unique_formats = []
1911         for f in formats:
1912             if f['url'] not in format_urls:
1913                 format_urls.add(f['url'])
1914                 unique_formats.append(f)
1915         formats[:] = unique_formats
1916
1917     def _is_valid_url(self, url, video_id, item='video', headers={}):
1918         url = self._proto_relative_url(url, scheme='http:')
1919         # For now assume non HTTP(S) URLs always valid
1920         if not (url.startswith('http://') or url.startswith('https://')):
1921             return True
1922         try:
1923             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1924             return True
1925         except ExtractorError as e:
1926             self.to_screen(
1927                 '%s: %s URL is invalid, skipping: %s'
1928                 % (video_id, item, error_to_compat_str(e.cause)))
1929             return False
1930
1931     def http_scheme(self):
1932         """ Either "http:" or "https:", depending on the user's preferences """
1933         return (
1934             'http:'
1935             if self.get_param('prefer_insecure', False)
1936             else 'https:')
1937
1938     def _proto_relative_url(self, url, scheme=None):
1939         if url is None:
1940             return url
1941         if url.startswith('//'):
1942             if scheme is None:
1943                 scheme = self.http_scheme()
1944             return scheme + url
1945         else:
1946             return url
1947
1948     def _sleep(self, timeout, video_id, msg_template=None):
1949         if msg_template is None:
1950             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1951         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1952         self.to_screen(msg)
1953         time.sleep(timeout)
1954
1955     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1956                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1957                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1958         manifest = self._download_xml(
1959             manifest_url, video_id, 'Downloading f4m manifest',
1960             'Unable to download f4m manifest',
1961             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1962             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1963             transform_source=transform_source,
1964             fatal=fatal, data=data, headers=headers, query=query)
1965
1966         if manifest is False:
1967             return []
1968
1969         return self._parse_f4m_formats(
1970             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1971             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1972
1973     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1974                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1975                            fatal=True, m3u8_id=None):
1976         if not isinstance(manifest, compat_etree_Element) and not fatal:
1977             return []
1978
1979         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1980         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1981         if akamai_pv is not None and ';' in akamai_pv.text:
1982             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1983             if playerVerificationChallenge.strip() != '':
1984                 return []
1985
1986         formats = []
1987         manifest_version = '1.0'
1988         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1989         if not media_nodes:
1990             manifest_version = '2.0'
1991             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1992         # Remove unsupported DRM protected media from final formats
1993         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1994         media_nodes = remove_encrypted_media(media_nodes)
1995         if not media_nodes:
1996             return formats
1997
1998         manifest_base_url = get_base_url(manifest)
1999
2000         bootstrap_info = xpath_element(
2001             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
2002             'bootstrap info', default=None)
2003
2004         vcodec = None
2005         mime_type = xpath_text(
2006             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
2007             'base URL', default=None)
2008         if mime_type and mime_type.startswith('audio/'):
2009             vcodec = 'none'
2010
2011         for i, media_el in enumerate(media_nodes):
2012             tbr = int_or_none(media_el.attrib.get('bitrate'))
2013             width = int_or_none(media_el.attrib.get('width'))
2014             height = int_or_none(media_el.attrib.get('height'))
2015             format_id = join_nonempty(f4m_id, tbr or i)
2016             # If <bootstrapInfo> is present, the specified f4m is a
2017             # stream-level manifest, and only set-level manifests may refer to
2018             # external resources.  See section 11.4 and section 4 of F4M spec
2019             if bootstrap_info is None:
2020                 media_url = None
2021                 # @href is introduced in 2.0, see section 11.6 of F4M spec
2022                 if manifest_version == '2.0':
2023                     media_url = media_el.attrib.get('href')
2024                 if media_url is None:
2025                     media_url = media_el.attrib.get('url')
2026                 if not media_url:
2027                     continue
2028                 manifest_url = (
2029                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
2030                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
2031                 # If media_url is itself a f4m manifest do the recursive extraction
2032                 # since bitrates in parent manifest (this one) and media_url manifest
2033                 # may differ leading to inability to resolve the format by requested
2034                 # bitrate in f4m downloader
2035                 ext = determine_ext(manifest_url)
2036                 if ext == 'f4m':
2037                     f4m_formats = self._extract_f4m_formats(
2038                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2039                         transform_source=transform_source, fatal=fatal)
2040                     # Sometimes stream-level manifest contains single media entry that
2041                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
2042                     # At the same time parent's media entry in set-level manifest may
2043                     # contain it. We will copy it from parent in such cases.
2044                     if len(f4m_formats) == 1:
2045                         f = f4m_formats[0]
2046                         f.update({
2047                             'tbr': f.get('tbr') or tbr,
2048                             'width': f.get('width') or width,
2049                             'height': f.get('height') or height,
2050                             'format_id': f.get('format_id') if not tbr else format_id,
2051                             'vcodec': vcodec,
2052                         })
2053                     formats.extend(f4m_formats)
2054                     continue
2055                 elif ext == 'm3u8':
2056                     formats.extend(self._extract_m3u8_formats(
2057                         manifest_url, video_id, 'mp4', preference=preference,
2058                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
2059                     continue
2060             formats.append({
2061                 'format_id': format_id,
2062                 'url': manifest_url,
2063                 'manifest_url': manifest_url,
2064                 'ext': 'flv' if bootstrap_info is not None else None,
2065                 'protocol': 'f4m',
2066                 'tbr': tbr,
2067                 'width': width,
2068                 'height': height,
2069                 'vcodec': vcodec,
2070                 'preference': preference,
2071                 'quality': quality,
2072             })
2073         return formats
2074
2075     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
2076         return {
2077             'format_id': join_nonempty(m3u8_id, 'meta'),
2078             'url': m3u8_url,
2079             'ext': ext,
2080             'protocol': 'm3u8',
2081             'preference': preference - 100 if preference else -100,
2082             'quality': quality,
2083             'resolution': 'multiple',
2084             'format_note': 'Quality selection URL',
2085         }
2086
2087     def _report_ignoring_subs(self, name):
2088         self.report_warning(bug_reports_message(
2089             f'Ignoring subtitle tracks found in the {name} manifest; '
2090             'if any subtitle tracks are missing,'
2091         ), only_once=True)
2092
2093     def _extract_m3u8_formats(self, *args, **kwargs):
2094         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2095         if subs:
2096             self._report_ignoring_subs('HLS')
2097         return fmts
2098
2099     def _extract_m3u8_formats_and_subtitles(
2100             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2101             preference=None, quality=None, m3u8_id=None, note=None,
2102             errnote=None, fatal=True, live=False, data=None, headers={},
2103             query={}):
2104
2105         res = self._download_webpage_handle(
2106             m3u8_url, video_id,
2107             note='Downloading m3u8 information' if note is None else note,
2108             errnote='Failed to download m3u8 information' if errnote is None else errnote,
2109             fatal=fatal, data=data, headers=headers, query=query)
2110
2111         if res is False:
2112             return [], {}
2113
2114         m3u8_doc, urlh = res
2115         m3u8_url = urlh.geturl()
2116
2117         return self._parse_m3u8_formats_and_subtitles(
2118             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2119             preference=preference, quality=quality, m3u8_id=m3u8_id,
2120             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2121             headers=headers, query=query, video_id=video_id)
2122
2123     def _parse_m3u8_formats_and_subtitles(
2124             self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
2125             preference=None, quality=None, m3u8_id=None, live=False, note=None,
2126             errnote=None, fatal=True, data=None, headers={}, query={},
2127             video_id=None):
2128         formats, subtitles = [], {}
2129
2130         has_drm = re.search('|'.join([
2131             r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
2132             r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
2133         ]), m3u8_doc)
2134
2135         def format_url(url):
2136             return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
2137
2138         if self.get_param('hls_split_discontinuity', False):
2139             def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2140                 if not m3u8_doc:
2141                     if not manifest_url:
2142                         return []
2143                     m3u8_doc = self._download_webpage(
2144                         manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2145                         note=False, errnote='Failed to download m3u8 playlist information')
2146                     if m3u8_doc is False:
2147                         return []
2148                 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2149
2150         else:
2151             def _extract_m3u8_playlist_indices(*args, **kwargs):
2152                 return [None]
2153
2154         # References:
2155         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2156         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2157         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2158
2159         # We should try extracting formats only from master playlists [1, 4.3.4],
2160         # i.e. playlists that describe available qualities. On the other hand
2161         # media playlists [1, 4.3.3] should be returned as is since they contain
2162         # just the media without qualities renditions.
2163         # Fortunately, master playlist can be easily distinguished from media
2164         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
2165         # master playlist tags MUST NOT appear in a media playlist and vice versa.
2166         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2167         # media playlist and MUST NOT appear in master playlist thus we can
2168         # clearly detect media playlist with this criterion.
2169
2170         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
2171             formats = [{
2172                 'format_id': join_nonempty(m3u8_id, idx),
2173                 'format_index': idx,
2174                 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
2175                 'ext': ext,
2176                 'protocol': entry_protocol,
2177                 'preference': preference,
2178                 'quality': quality,
2179                 'has_drm': has_drm,
2180             } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2181
2182             return formats, subtitles
2183
2184         groups = {}
2185         last_stream_inf = {}
2186
2187         def extract_media(x_media_line):
2188             media = parse_m3u8_attributes(x_media_line)
2189             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2190             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2191             if not (media_type and group_id and name):
2192                 return
2193             groups.setdefault(group_id, []).append(media)
2194             # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2195             if media_type == 'SUBTITLES':
2196                 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2197                 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2198                 # However, lack of URI has been spotted in the wild.
2199                 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2200                 if not media.get('URI'):
2201                     return
2202                 url = format_url(media['URI'])
2203                 sub_info = {
2204                     'url': url,
2205                     'ext': determine_ext(url),
2206                 }
2207                 if sub_info['ext'] == 'm3u8':
2208                     # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2209                     # files may contain is WebVTT:
2210                     # <https://tools.ietf.org/html/rfc8216#section-3.1>
2211                     sub_info['ext'] = 'vtt'
2212                     sub_info['protocol'] = 'm3u8_native'
2213                 lang = media.get('LANGUAGE') or 'und'
2214                 subtitles.setdefault(lang, []).append(sub_info)
2215             if media_type not in ('VIDEO', 'AUDIO'):
2216                 return
2217             media_url = media.get('URI')
2218             if media_url:
2219                 manifest_url = format_url(media_url)
2220                 formats.extend({
2221                     'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2222                     'format_note': name,
2223                     'format_index': idx,
2224                     'url': manifest_url,
2225                     'manifest_url': m3u8_url,
2226                     'language': media.get('LANGUAGE'),
2227                     'ext': ext,
2228                     'protocol': entry_protocol,
2229                     'preference': preference,
2230                     'quality': quality,
2231                     'vcodec': 'none' if media_type == 'AUDIO' else None,
2232                 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2233
2234         def build_stream_name():
2235             # Despite specification does not mention NAME attribute for
2236             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2237             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2238             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2239             stream_name = last_stream_inf.get('NAME')
2240             if stream_name:
2241                 return stream_name
2242             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2243             # from corresponding rendition group
2244             stream_group_id = last_stream_inf.get('VIDEO')
2245             if not stream_group_id:
2246                 return
2247             stream_group = groups.get(stream_group_id)
2248             if not stream_group:
2249                 return stream_group_id
2250             rendition = stream_group[0]
2251             return rendition.get('NAME') or stream_group_id
2252
2253         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2254         # chance to detect video only formats when EXT-X-STREAM-INF tags
2255         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2256         for line in m3u8_doc.splitlines():
2257             if line.startswith('#EXT-X-MEDIA:'):
2258                 extract_media(line)
2259
2260         for line in m3u8_doc.splitlines():
2261             if line.startswith('#EXT-X-STREAM-INF:'):
2262                 last_stream_inf = parse_m3u8_attributes(line)
2263             elif line.startswith('#') or not line.strip():
2264                 continue
2265             else:
2266                 tbr = float_or_none(
2267                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2268                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2269                 manifest_url = format_url(line.strip())
2270
2271                 for idx in _extract_m3u8_playlist_indices(manifest_url):
2272                     format_id = [m3u8_id, None, idx]
2273                     # Bandwidth of live streams may differ over time thus making
2274                     # format_id unpredictable. So it's better to keep provided
2275                     # format_id intact.
2276                     if not live:
2277                         stream_name = build_stream_name()
2278                         format_id[1] = stream_name or '%d' % (tbr or len(formats))
2279                     f = {
2280                         'format_id': join_nonempty(*format_id),
2281                         'format_index': idx,
2282                         'url': manifest_url,
2283                         'manifest_url': m3u8_url,
2284                         'tbr': tbr,
2285                         'ext': ext,
2286                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2287                         'protocol': entry_protocol,
2288                         'preference': preference,
2289                         'quality': quality,
2290                     }
2291                     resolution = last_stream_inf.get('RESOLUTION')
2292                     if resolution:
2293                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2294                         if mobj:
2295                             f['width'] = int(mobj.group('width'))
2296                             f['height'] = int(mobj.group('height'))
2297                     # Unified Streaming Platform
2298                     mobj = re.search(
2299                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2300                     if mobj:
2301                         abr, vbr = mobj.groups()
2302                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2303                         f.update({
2304                             'vbr': vbr,
2305                             'abr': abr,
2306                         })
2307                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2308                     f.update(codecs)
2309                     audio_group_id = last_stream_inf.get('AUDIO')
2310                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2311                     # references a rendition group MUST have a CODECS attribute.
2312                     # However, this is not always respected, for example, [2]
2313                     # contains EXT-X-STREAM-INF tag which references AUDIO
2314                     # rendition group but does not have CODECS and despite
2315                     # referencing an audio group it represents a complete
2316                     # (with audio and video) format. So, for such cases we will
2317                     # ignore references to rendition groups and treat them
2318                     # as complete formats.
2319                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2320                         audio_group = groups.get(audio_group_id)
2321                         if audio_group and audio_group[0].get('URI'):
2322                             # TODO: update acodec for audio only formats with
2323                             # the same GROUP-ID
2324                             f['acodec'] = 'none'
2325                     if not f.get('ext'):
2326                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2327                     formats.append(f)
2328
2329                     # for DailyMotion
2330                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2331                     if progressive_uri:
2332                         http_f = f.copy()
2333                         del http_f['manifest_url']
2334                         http_f.update({
2335                             'format_id': f['format_id'].replace('hls-', 'http-'),
2336                             'protocol': 'http',
2337                             'url': progressive_uri,
2338                         })
2339                         formats.append(http_f)
2340
2341                 last_stream_inf = {}
2342         return formats, subtitles
2343
2344     def _extract_m3u8_vod_duration(
2345             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2346
2347         m3u8_vod = self._download_webpage(
2348             m3u8_vod_url, video_id,
2349             note='Downloading m3u8 VOD manifest' if note is None else note,
2350             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2351             fatal=False, data=data, headers=headers, query=query)
2352
2353         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2354
2355     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2356         if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2357             return None
2358
2359         return int(sum(
2360             float(line[len('#EXTINF:'):].split(',')[0])
2361             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2362
2363     @staticmethod
2364     def _xpath_ns(path, namespace=None):
2365         if not namespace:
2366             return path
2367         out = []
2368         for c in path.split('/'):
2369             if not c or c == '.':
2370                 out.append(c)
2371             else:
2372                 out.append('{%s}%s' % (namespace, c))
2373         return '/'.join(out)
2374
2375     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2376         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2377
2378         if smil is False:
2379             assert not fatal
2380             return [], {}
2381
2382         namespace = self._parse_smil_namespace(smil)
2383
2384         fmts = self._parse_smil_formats(
2385             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2386         subs = self._parse_smil_subtitles(
2387             smil, namespace=namespace)
2388
2389         return fmts, subs
2390
2391     def _extract_smil_formats(self, *args, **kwargs):
2392         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2393         if subs:
2394             self._report_ignoring_subs('SMIL')
2395         return fmts
2396
2397     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2398         smil = self._download_smil(smil_url, video_id, fatal=fatal)
2399         if smil is False:
2400             return {}
2401         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2402
2403     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2404         return self._download_xml(
2405             smil_url, video_id, 'Downloading SMIL file',
2406             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2407
2408     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2409         namespace = self._parse_smil_namespace(smil)
2410
2411         formats = self._parse_smil_formats(
2412             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2413         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2414
2415         video_id = os.path.splitext(url_basename(smil_url))[0]
2416         title = None
2417         description = None
2418         upload_date = None
2419         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2420             name = meta.attrib.get('name')
2421             content = meta.attrib.get('content')
2422             if not name or not content:
2423                 continue
2424             if not title and name == 'title':
2425                 title = content
2426             elif not description and name in ('description', 'abstract'):
2427                 description = content
2428             elif not upload_date and name == 'date':
2429                 upload_date = unified_strdate(content)
2430
2431         thumbnails = [{
2432             'id': image.get('type'),
2433             'url': image.get('src'),
2434             'width': int_or_none(image.get('width')),
2435             'height': int_or_none(image.get('height')),
2436         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2437
2438         return {
2439             'id': video_id,
2440             'title': title or video_id,
2441             'description': description,
2442             'upload_date': upload_date,
2443             'thumbnails': thumbnails,
2444             'formats': formats,
2445             'subtitles': subtitles,
2446         }
2447
2448     def _parse_smil_namespace(self, smil):
2449         return self._search_regex(
2450             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2451
2452     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2453         base = smil_url
2454         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2455             b = meta.get('base') or meta.get('httpBase')
2456             if b:
2457                 base = b
2458                 break
2459
2460         formats = []
2461         rtmp_count = 0
2462         http_count = 0
2463         m3u8_count = 0
2464         imgs_count = 0
2465
2466         srcs = set()
2467         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2468         for medium in media:
2469             src = medium.get('src')
2470             if not src or src in srcs:
2471                 continue
2472             srcs.add(src)
2473
2474             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2475             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2476             width = int_or_none(medium.get('width'))
2477             height = int_or_none(medium.get('height'))
2478             proto = medium.get('proto')
2479             ext = medium.get('ext')
2480             src_ext = determine_ext(src)
2481             streamer = medium.get('streamer') or base
2482
2483             if proto == 'rtmp' or streamer.startswith('rtmp'):
2484                 rtmp_count += 1
2485                 formats.append({
2486                     'url': streamer,
2487                     'play_path': src,
2488                     'ext': 'flv',
2489                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2490                     'tbr': bitrate,
2491                     'filesize': filesize,
2492                     'width': width,
2493                     'height': height,
2494                 })
2495                 if transform_rtmp_url:
2496                     streamer, src = transform_rtmp_url(streamer, src)
2497                     formats[-1].update({
2498                         'url': streamer,
2499                         'play_path': src,
2500                     })
2501                 continue
2502
2503             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2504             src_url = src_url.strip()
2505
2506             if proto == 'm3u8' or src_ext == 'm3u8':
2507                 m3u8_formats = self._extract_m3u8_formats(
2508                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2509                 if len(m3u8_formats) == 1:
2510                     m3u8_count += 1
2511                     m3u8_formats[0].update({
2512                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2513                         'tbr': bitrate,
2514                         'width': width,
2515                         'height': height,
2516                     })
2517                 formats.extend(m3u8_formats)
2518             elif src_ext == 'f4m':
2519                 f4m_url = src_url
2520                 if not f4m_params:
2521                     f4m_params = {
2522                         'hdcore': '3.2.0',
2523                         'plugin': 'flowplayer-3.2.0.1',
2524                     }
2525                 f4m_url += '&' if '?' in f4m_url else '?'
2526                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2527                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2528             elif src_ext == 'mpd':
2529                 formats.extend(self._extract_mpd_formats(
2530                     src_url, video_id, mpd_id='dash', fatal=False))
2531             elif re.search(r'\.ism/[Mm]anifest', src_url):
2532                 formats.extend(self._extract_ism_formats(
2533                     src_url, video_id, ism_id='mss', fatal=False))
2534             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2535                 http_count += 1
2536                 formats.append({
2537                     'url': src_url,
2538                     'ext': ext or src_ext or 'flv',
2539                     'format_id': 'http-%d' % (bitrate or http_count),
2540                     'tbr': bitrate,
2541                     'filesize': filesize,
2542                     'width': width,
2543                     'height': height,
2544                 })
2545
2546         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2547             src = medium.get('src')
2548             if not src or src in srcs:
2549                 continue
2550             srcs.add(src)
2551
2552             imgs_count += 1
2553             formats.append({
2554                 'format_id': 'imagestream-%d' % (imgs_count),
2555                 'url': src,
2556                 'ext': mimetype2ext(medium.get('type')),
2557                 'acodec': 'none',
2558                 'vcodec': 'none',
2559                 'width': int_or_none(medium.get('width')),
2560                 'height': int_or_none(medium.get('height')),
2561                 'format_note': 'SMIL storyboards',
2562             })
2563
2564         return formats
2565
2566     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2567         urls = []
2568         subtitles = {}
2569         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2570             src = textstream.get('src')
2571             if not src or src in urls:
2572                 continue
2573             urls.append(src)
2574             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2575             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2576             subtitles.setdefault(lang, []).append({
2577                 'url': src,
2578                 'ext': ext,
2579             })
2580         return subtitles
2581
2582     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2583         xspf = self._download_xml(
2584             xspf_url, playlist_id, 'Downloading xpsf playlist',
2585             'Unable to download xspf manifest', fatal=fatal)
2586         if xspf is False:
2587             return []
2588         return self._parse_xspf(
2589             xspf, playlist_id, xspf_url=xspf_url,
2590             xspf_base_url=base_url(xspf_url))
2591
2592     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2593         NS_MAP = {
2594             'xspf': 'http://xspf.org/ns/0/',
2595             's1': 'http://static.streamone.nl/player/ns/0',
2596         }
2597
2598         entries = []
2599         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2600             title = xpath_text(
2601                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2602             description = xpath_text(
2603                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2604             thumbnail = xpath_text(
2605                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2606             duration = float_or_none(
2607                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2608
2609             formats = []
2610             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2611                 format_url = urljoin(xspf_base_url, location.text)
2612                 if not format_url:
2613                     continue
2614                 formats.append({
2615                     'url': format_url,
2616                     'manifest_url': xspf_url,
2617                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2618                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2619                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2620                 })
2621             self._sort_formats(formats)
2622
2623             entries.append({
2624                 'id': playlist_id,
2625                 'title': title,
2626                 'description': description,
2627                 'thumbnail': thumbnail,
2628                 'duration': duration,
2629                 'formats': formats,
2630             })
2631         return entries
2632
2633     def _extract_mpd_formats(self, *args, **kwargs):
2634         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2635         if subs:
2636             self._report_ignoring_subs('DASH')
2637         return fmts
2638
2639     def _extract_mpd_formats_and_subtitles(
2640             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2641             fatal=True, data=None, headers={}, query={}):
2642         res = self._download_xml_handle(
2643             mpd_url, video_id,
2644             note='Downloading MPD manifest' if note is None else note,
2645             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2646             fatal=fatal, data=data, headers=headers, query=query)
2647         if res is False:
2648             return [], {}
2649         mpd_doc, urlh = res
2650         if mpd_doc is None:
2651             return [], {}
2652         mpd_base_url = base_url(urlh.geturl())
2653
2654         return self._parse_mpd_formats_and_subtitles(
2655             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2656
2657     def _parse_mpd_formats(self, *args, **kwargs):
2658         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2659         if subs:
2660             self._report_ignoring_subs('DASH')
2661         return fmts
2662
2663     def _parse_mpd_formats_and_subtitles(
2664             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2665         """
2666         Parse formats from MPD manifest.
2667         References:
2668          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2669             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2670          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2671         """
2672         if not self.get_param('dynamic_mpd', True):
2673             if mpd_doc.get('type') == 'dynamic':
2674                 return [], {}
2675
2676         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2677
2678         def _add_ns(path):
2679             return self._xpath_ns(path, namespace)
2680
2681         def is_drm_protected(element):
2682             return element.find(_add_ns('ContentProtection')) is not None
2683
2684         def extract_multisegment_info(element, ms_parent_info):
2685             ms_info = ms_parent_info.copy()
2686
2687             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2688             # common attributes and elements.  We will only extract relevant
2689             # for us.
2690             def extract_common(source):
2691                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2692                 if segment_timeline is not None:
2693                     s_e = segment_timeline.findall(_add_ns('S'))
2694                     if s_e:
2695                         ms_info['total_number'] = 0
2696                         ms_info['s'] = []
2697                         for s in s_e:
2698                             r = int(s.get('r', 0))
2699                             ms_info['total_number'] += 1 + r
2700                             ms_info['s'].append({
2701                                 't': int(s.get('t', 0)),
2702                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2703                                 'd': int(s.attrib['d']),
2704                                 'r': r,
2705                             })
2706                 start_number = source.get('startNumber')
2707                 if start_number:
2708                     ms_info['start_number'] = int(start_number)
2709                 timescale = source.get('timescale')
2710                 if timescale:
2711                     ms_info['timescale'] = int(timescale)
2712                 segment_duration = source.get('duration')
2713                 if segment_duration:
2714                     ms_info['segment_duration'] = float(segment_duration)
2715
2716             def extract_Initialization(source):
2717                 initialization = source.find(_add_ns('Initialization'))
2718                 if initialization is not None:
2719                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2720
2721             segment_list = element.find(_add_ns('SegmentList'))
2722             if segment_list is not None:
2723                 extract_common(segment_list)
2724                 extract_Initialization(segment_list)
2725                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2726                 if segment_urls_e:
2727                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2728             else:
2729                 segment_template = element.find(_add_ns('SegmentTemplate'))
2730                 if segment_template is not None:
2731                     extract_common(segment_template)
2732                     media = segment_template.get('media')
2733                     if media:
2734                         ms_info['media'] = media
2735                     initialization = segment_template.get('initialization')
2736                     if initialization:
2737                         ms_info['initialization'] = initialization
2738                     else:
2739                         extract_Initialization(segment_template)
2740             return ms_info
2741
2742         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2743         formats, subtitles = [], {}
2744         stream_numbers = collections.defaultdict(int)
2745         for period in mpd_doc.findall(_add_ns('Period')):
2746             period_duration = parse_duration(period.get('duration')) or mpd_duration
2747             period_ms_info = extract_multisegment_info(period, {
2748                 'start_number': 1,
2749                 'timescale': 1,
2750             })
2751             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2752                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2753                 for representation in adaptation_set.findall(_add_ns('Representation')):
2754                     representation_attrib = adaptation_set.attrib.copy()
2755                     representation_attrib.update(representation.attrib)
2756                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2757                     mime_type = representation_attrib['mimeType']
2758                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2759
2760                     codecs = parse_codecs(representation_attrib.get('codecs', ''))
2761                     if content_type not in ('video', 'audio', 'text'):
2762                         if mime_type == 'image/jpeg':
2763                             content_type = mime_type
2764                         elif codecs['vcodec'] != 'none':
2765                             content_type = 'video'
2766                         elif codecs['acodec'] != 'none':
2767                             content_type = 'audio'
2768                         elif codecs.get('tcodec', 'none') != 'none':
2769                             content_type = 'text'
2770                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2771                             content_type = 'text'
2772                         else:
2773                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2774                             continue
2775
2776                     base_url = ''
2777                     for element in (representation, adaptation_set, period, mpd_doc):
2778                         base_url_e = element.find(_add_ns('BaseURL'))
2779                         if base_url_e is not None:
2780                             base_url = base_url_e.text + base_url
2781                             if re.match(r'^https?://', base_url):
2782                                 break
2783                     if mpd_base_url and base_url.startswith('/'):
2784                         base_url = compat_urlparse.urljoin(mpd_base_url, base_url)
2785                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2786                         if not mpd_base_url.endswith('/'):
2787                             mpd_base_url += '/'
2788                         base_url = mpd_base_url + base_url
2789                     representation_id = representation_attrib.get('id')
2790                     lang = representation_attrib.get('lang')
2791                     url_el = representation.find(_add_ns('BaseURL'))
2792                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2793                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2794                     if representation_id is not None:
2795                         format_id = representation_id
2796                     else:
2797                         format_id = content_type
2798                     if mpd_id:
2799                         format_id = mpd_id + '-' + format_id
2800                     if content_type in ('video', 'audio'):
2801                         f = {
2802                             'format_id': format_id,
2803                             'manifest_url': mpd_url,
2804                             'ext': mimetype2ext(mime_type),
2805                             'width': int_or_none(representation_attrib.get('width')),
2806                             'height': int_or_none(representation_attrib.get('height')),
2807                             'tbr': float_or_none(bandwidth, 1000),
2808                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2809                             'fps': int_or_none(representation_attrib.get('frameRate')),
2810                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2811                             'format_note': 'DASH %s' % content_type,
2812                             'filesize': filesize,
2813                             'container': mimetype2ext(mime_type) + '_dash',
2814                             **codecs
2815                         }
2816                     elif content_type == 'text':
2817                         f = {
2818                             'ext': mimetype2ext(mime_type),
2819                             'manifest_url': mpd_url,
2820                             'filesize': filesize,
2821                         }
2822                     elif content_type == 'image/jpeg':
2823                         # See test case in VikiIE
2824                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2825                         f = {
2826                             'format_id': format_id,
2827                             'ext': 'mhtml',
2828                             'manifest_url': mpd_url,
2829                             'format_note': 'DASH storyboards (jpeg)',
2830                             'acodec': 'none',
2831                             'vcodec': 'none',
2832                         }
2833                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2834                         f['has_drm'] = True
2835                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2836
2837                     def prepare_template(template_name, identifiers):
2838                         tmpl = representation_ms_info[template_name]
2839                         # First of, % characters outside $...$ templates
2840                         # must be escaped by doubling for proper processing
2841                         # by % operator string formatting used further (see
2842                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2843                         t = ''
2844                         in_template = False
2845                         for c in tmpl:
2846                             t += c
2847                             if c == '$':
2848                                 in_template = not in_template
2849                             elif c == '%' and not in_template:
2850                                 t += c
2851                         # Next, $...$ templates are translated to their
2852                         # %(...) counterparts to be used with % operator
2853                         if representation_id is not None:
2854                             t = t.replace('$RepresentationID$', representation_id)
2855                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2856                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2857                         t.replace('$$', '$')
2858                         return t
2859
2860                     # @initialization is a regular template like @media one
2861                     # so it should be handled just the same way (see
2862                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2863                     if 'initialization' in representation_ms_info:
2864                         initialization_template = prepare_template(
2865                             'initialization',
2866                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2867                             # $Time$ shall not be included for @initialization thus
2868                             # only $Bandwidth$ remains
2869                             ('Bandwidth', ))
2870                         representation_ms_info['initialization_url'] = initialization_template % {
2871                             'Bandwidth': bandwidth,
2872                         }
2873
2874                     def location_key(location):
2875                         return 'url' if re.match(r'^https?://', location) else 'path'
2876
2877                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2878
2879                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2880                         media_location_key = location_key(media_template)
2881
2882                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2883                         # can't be used at the same time
2884                         if '%(Number' in media_template and 's' not in representation_ms_info:
2885                             segment_duration = None
2886                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2887                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2888                                 representation_ms_info['total_number'] = int(math.ceil(
2889                                     float_or_none(period_duration, segment_duration, default=0)))
2890                             representation_ms_info['fragments'] = [{
2891                                 media_location_key: media_template % {
2892                                     'Number': segment_number,
2893                                     'Bandwidth': bandwidth,
2894                                 },
2895                                 'duration': segment_duration,
2896                             } for segment_number in range(
2897                                 representation_ms_info['start_number'],
2898                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2899                         else:
2900                             # $Number*$ or $Time$ in media template with S list available
2901                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2902                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2903                             representation_ms_info['fragments'] = []
2904                             segment_time = 0
2905                             segment_d = None
2906                             segment_number = representation_ms_info['start_number']
2907
2908                             def add_segment_url():
2909                                 segment_url = media_template % {
2910                                     'Time': segment_time,
2911                                     'Bandwidth': bandwidth,
2912                                     'Number': segment_number,
2913                                 }
2914                                 representation_ms_info['fragments'].append({
2915                                     media_location_key: segment_url,
2916                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2917                                 })
2918
2919                             for num, s in enumerate(representation_ms_info['s']):
2920                                 segment_time = s.get('t') or segment_time
2921                                 segment_d = s['d']
2922                                 add_segment_url()
2923                                 segment_number += 1
2924                                 for r in range(s.get('r', 0)):
2925                                     segment_time += segment_d
2926                                     add_segment_url()
2927                                     segment_number += 1
2928                                 segment_time += segment_d
2929                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2930                         # No media template
2931                         # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2932                         # or any YouTube dashsegments video
2933                         fragments = []
2934                         segment_index = 0
2935                         timescale = representation_ms_info['timescale']
2936                         for s in representation_ms_info['s']:
2937                             duration = float_or_none(s['d'], timescale)
2938                             for r in range(s.get('r', 0) + 1):
2939                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
2940                                 fragments.append({
2941                                     location_key(segment_uri): segment_uri,
2942                                     'duration': duration,
2943                                 })
2944                                 segment_index += 1
2945                         representation_ms_info['fragments'] = fragments
2946                     elif 'segment_urls' in representation_ms_info:
2947                         # Segment URLs with no SegmentTimeline
2948                         # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2949                         # https://github.com/ytdl-org/youtube-dl/pull/14844
2950                         fragments = []
2951                         segment_duration = float_or_none(
2952                             representation_ms_info['segment_duration'],
2953                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2954                         for segment_url in representation_ms_info['segment_urls']:
2955                             fragment = {
2956                                 location_key(segment_url): segment_url,
2957                             }
2958                             if segment_duration:
2959                                 fragment['duration'] = segment_duration
2960                             fragments.append(fragment)
2961                         representation_ms_info['fragments'] = fragments
2962                     # If there is a fragments key available then we correctly recognized fragmented media.
2963                     # Otherwise we will assume unfragmented media with direct access. Technically, such
2964                     # assumption is not necessarily correct since we may simply have no support for
2965                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2966                     if 'fragments' in representation_ms_info:
2967                         f.update({
2968                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
2969                             'url': mpd_url or base_url,
2970                             'fragment_base_url': base_url,
2971                             'fragments': [],
2972                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2973                         })
2974                         if 'initialization_url' in representation_ms_info:
2975                             initialization_url = representation_ms_info['initialization_url']
2976                             if not f.get('url'):
2977                                 f['url'] = initialization_url
2978                             f['fragments'].append({location_key(initialization_url): initialization_url})
2979                         f['fragments'].extend(representation_ms_info['fragments'])
2980                         if not period_duration:
2981                             period_duration = try_get(
2982                                 representation_ms_info,
2983                                 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
2984                     else:
2985                         # Assuming direct URL to unfragmented media.
2986                         f['url'] = base_url
2987                     if content_type in ('video', 'audio', 'image/jpeg'):
2988                         f['manifest_stream_number'] = stream_numbers[f['url']]
2989                         stream_numbers[f['url']] += 1
2990                         formats.append(f)
2991                     elif content_type == 'text':
2992                         subtitles.setdefault(lang or 'und', []).append(f)
2993
2994         return formats, subtitles
2995
2996     def _extract_ism_formats(self, *args, **kwargs):
2997         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2998         if subs:
2999             self._report_ignoring_subs('ISM')
3000         return fmts
3001
3002     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
3003         res = self._download_xml_handle(
3004             ism_url, video_id,
3005             note='Downloading ISM manifest' if note is None else note,
3006             errnote='Failed to download ISM manifest' if errnote is None else errnote,
3007             fatal=fatal, data=data, headers=headers, query=query)
3008         if res is False:
3009             return [], {}
3010         ism_doc, urlh = res
3011         if ism_doc is None:
3012             return [], {}
3013
3014         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
3015
3016     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
3017         """
3018         Parse formats from ISM manifest.
3019         References:
3020          1. [MS-SSTR]: Smooth Streaming Protocol,
3021             https://msdn.microsoft.com/en-us/library/ff469518.aspx
3022         """
3023         if ism_doc.get('IsLive') == 'TRUE':
3024             return [], {}
3025
3026         duration = int(ism_doc.attrib['Duration'])
3027         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
3028
3029         formats = []
3030         subtitles = {}
3031         for stream in ism_doc.findall('StreamIndex'):
3032             stream_type = stream.get('Type')
3033             if stream_type not in ('video', 'audio', 'text'):
3034                 continue
3035             url_pattern = stream.attrib['Url']
3036             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
3037             stream_name = stream.get('Name')
3038             stream_language = stream.get('Language', 'und')
3039             for track in stream.findall('QualityLevel'):
3040                 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
3041                 # TODO: add support for WVC1 and WMAP
3042                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
3043                     self.report_warning('%s is not a supported codec' % fourcc)
3044                     continue
3045                 tbr = int(track.attrib['Bitrate']) // 1000
3046                 # [1] does not mention Width and Height attributes. However,
3047                 # they're often present while MaxWidth and MaxHeight are
3048                 # missing, so should be used as fallbacks
3049                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3050                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3051                 sampling_rate = int_or_none(track.get('SamplingRate'))
3052
3053                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3054                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
3055
3056                 fragments = []
3057                 fragment_ctx = {
3058                     'time': 0,
3059                 }
3060                 stream_fragments = stream.findall('c')
3061                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3062                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3063                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3064                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3065                     if not fragment_ctx['duration']:
3066                         try:
3067                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
3068                         except IndexError:
3069                             next_fragment_time = duration
3070                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3071                     for _ in range(fragment_repeat):
3072                         fragments.append({
3073                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
3074                             'duration': fragment_ctx['duration'] / stream_timescale,
3075                         })
3076                         fragment_ctx['time'] += fragment_ctx['duration']
3077
3078                 if stream_type == 'text':
3079                     subtitles.setdefault(stream_language, []).append({
3080                         'ext': 'ismt',
3081                         'protocol': 'ism',
3082                         'url': ism_url,
3083                         'manifest_url': ism_url,
3084                         'fragments': fragments,
3085                         '_download_params': {
3086                             'stream_type': stream_type,
3087                             'duration': duration,
3088                             'timescale': stream_timescale,
3089                             'fourcc': fourcc,
3090                             'language': stream_language,
3091                             'codec_private_data': track.get('CodecPrivateData'),
3092                         }
3093                     })
3094                 elif stream_type in ('video', 'audio'):
3095                     formats.append({
3096                         'format_id': join_nonempty(ism_id, stream_name, tbr),
3097                         'url': ism_url,
3098                         'manifest_url': ism_url,
3099                         'ext': 'ismv' if stream_type == 'video' else 'isma',
3100                         'width': width,
3101                         'height': height,
3102                         'tbr': tbr,
3103                         'asr': sampling_rate,
3104                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
3105                         'acodec': 'none' if stream_type == 'video' else fourcc,
3106                         'protocol': 'ism',
3107                         'fragments': fragments,
3108                         'has_drm': ism_doc.find('Protection') is not None,
3109                         '_download_params': {
3110                             'stream_type': stream_type,
3111                             'duration': duration,
3112                             'timescale': stream_timescale,
3113                             'width': width or 0,
3114                             'height': height or 0,
3115                             'fourcc': fourcc,
3116                             'language': stream_language,
3117                             'codec_private_data': track.get('CodecPrivateData'),
3118                             'sampling_rate': sampling_rate,
3119                             'channels': int_or_none(track.get('Channels', 2)),
3120                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3121                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3122                         },
3123                     })
3124         return formats, subtitles
3125
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
        """
        Collect media entries from the HTML5-style media tags of a webpage.

        Looks for <video>/<audio> tags (including their amp-* and dl8-*
        variants) in webpage and builds one entry per tag from its src
        attribute and its nested <source> and <track> tags. Relative URLs
        are resolved against base_url, which is also set as the Referer
        header of every format.

        Returns a list of dicts with 'formats', 'subtitles' and 'thumbnail'
        keys; tags yielding neither formats nor subtitles are left out.
        """
        def absolute_url(item_url):
            """Resolve item_url against base_url"""
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            """Derive codec info and 'ext' from a type attribute; {} when absent or unparsable"""
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type, type_info={}):
            """
            Return (is_plain_url, formats) for a media URL.

            Manifest URLs (m3u8/mpd) are expanded non-fatally and reported
            with is_plain_url False; anything else becomes a single format
            pointing at the URL itself.
            """
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, quality=quality, fatal=False)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                }]
            return is_plain_url, formats

        entries = []
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we will include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
        # Self-closing tags have no content, hence the empty 4th tuple item
        media_tags = [(media_tag, media_tag_name, media_type, '')
                      for media_tag, media_tag_name, media_type
                      in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
        for media_tag, _, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            src = strip_or_none(media_attributes.get('src'))
            if src:
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            if media_content:
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    s_attr = extract_attributes(source_tag)
                    # data-video-src and data-src are non standard but seen
                    # several times in the wild
                    src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
                    if not src:
                        continue
                    f = parse_content_type(s_attr.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                    if is_plain_url:
                        # width, height, res, label and title attributes are
                        # all not standard but seen several times in the wild
                        labels = [
                            s_attr.get(lbl)
                            for lbl in ('label', 'title')
                            if str_or_none(s_attr.get(lbl))
                        ]
                        width = int_or_none(s_attr.get('width'))
                        height = (int_or_none(s_attr.get('height'))
                                  or int_or_none(s_attr.get('res')))
                        if not width or not height:
                            # Fill the missing dimensions from the labels
                            for lbl in labels:
                                resolution = parse_resolution(lbl)
                                if not resolution:
                                    continue
                                width = width or resolution.get('width')
                                height = height or resolution.get('height')
                        # The first label that parses as a bitrate wins
                        for lbl in labels:
                            tbr = parse_bitrate(lbl)
                            if tbr:
                                break
                        else:
                            tbr = None
                        f.update({
                            'width': width,
                            'height': height,
                            'tbr': tbr,
                            'format_id': s_attr.get('label') or s_attr.get('title'),
                        })
                        # Applied last, so the 'url' and 'vcodec' of the plain
                        # format override whatever the type attribute implied
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    # Tracks without a kind are treated like subtitles
                    if not kind or kind in ('subtitles', 'captions'):
                        src = strip_or_none(track_attributes.get('src'))
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            for f in media_info['formats']:
                f.setdefault('http_headers', {})['Referer'] = base_url
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
3248
3249     def _extract_akamai_formats(self, *args, **kwargs):
3250         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3251         if subs:
3252             self._report_ignoring_subs('akamai')
3253         return fmts
3254
3255     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3256         signed = 'hdnea=' in manifest_url
3257         if not signed:
3258             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3259             manifest_url = re.sub(
3260                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3261                 '', manifest_url).strip('?')
3262
3263         formats = []
3264         subtitles = {}
3265
3266         hdcore_sign = 'hdcore=3.7.0'
3267         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3268         hds_host = hosts.get('hds')
3269         if hds_host:
3270             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3271         if 'hdcore=' not in f4m_url:
3272             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3273         f4m_formats = self._extract_f4m_formats(
3274             f4m_url, video_id, f4m_id='hds', fatal=False)
3275         for entry in f4m_formats:
3276             entry.update({'extra_param_to_segment_url': hdcore_sign})
3277         formats.extend(f4m_formats)
3278
3279         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3280         hls_host = hosts.get('hls')
3281         if hls_host:
3282             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3283         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3284             m3u8_url, video_id, 'mp4', 'm3u8_native',
3285             m3u8_id='hls', fatal=False)
3286         formats.extend(m3u8_formats)
3287         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3288
3289         http_host = hosts.get('http')
3290         if http_host and m3u8_formats and not signed:
3291             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3292             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3293             qualities_length = len(qualities)
3294             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3295                 i = 0
3296                 for f in m3u8_formats:
3297                     if f['vcodec'] != 'none':
3298                         for protocol in ('http', 'https'):
3299                             http_f = f.copy()
3300                             del http_f['manifest_url']
3301                             http_url = re.sub(
3302                                 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
3303                             http_f.update({
3304                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3305                                 'url': http_url,
3306                                 'protocol': protocol,
3307                             })
3308                             formats.append(http_f)
3309                         i += 1
3310
3311         return formats, subtitles
3312
3313     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3314         query = compat_urlparse.urlparse(url).query
3315         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3316         mobj = re.search(
3317             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3318         url_base = mobj.group('url')
3319         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3320         formats = []
3321
3322         def manifest_url(manifest):
3323             m_url = '%s/%s' % (http_base_url, manifest)
3324             if query:
3325                 m_url += '?%s' % query
3326             return m_url
3327
3328         if 'm3u8' not in skip_protocols:
3329             formats.extend(self._extract_m3u8_formats(
3330                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3331                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3332         if 'f4m' not in skip_protocols:
3333             formats.extend(self._extract_f4m_formats(
3334                 manifest_url('manifest.f4m'),
3335                 video_id, f4m_id='hds', fatal=False))
3336         if 'dash' not in skip_protocols:
3337             formats.extend(self._extract_mpd_formats(
3338                 manifest_url('manifest.mpd'),
3339                 video_id, mpd_id='dash', fatal=False))
3340         if re.search(r'(?:/smil:|\.smil)', url_base):
3341             if 'smil' not in skip_protocols:
3342                 rtmp_formats = self._extract_smil_formats(
3343                     manifest_url('jwplayer.smil'),
3344                     video_id, fatal=False)
3345                 for rtmp_format in rtmp_formats:
3346                     rtsp_format = rtmp_format.copy()
3347                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3348                     del rtsp_format['play_path']
3349                     del rtsp_format['ext']
3350                     rtsp_format.update({
3351                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3352                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3353                         'protocol': 'rtsp',
3354                     })
3355                     formats.extend([rtmp_format, rtsp_format])
3356         else:
3357             for protocol in ('rtmp', 'rtsp'):
3358                 if protocol not in skip_protocols:
3359                     formats.append({
3360                         'url': '%s:%s' % (protocol, url_base),
3361                         'format_id': protocol,
3362                         'protocol': protocol,
3363                     })
3364         return formats
3365
3366     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3367         mobj = re.search(
3368             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3369             webpage)
3370         if mobj:
3371             try:
3372                 jwplayer_data = self._parse_json(mobj.group('options'),
3373                                                  video_id=video_id,
3374                                                  transform_source=transform_source)
3375             except ExtractorError:
3376                 pass
3377             else:
3378                 if isinstance(jwplayer_data, dict):
3379                     return jwplayer_data
3380
3381     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3382         jwplayer_data = self._find_jwplayer_data(
3383             webpage, video_id, transform_source=js_to_json)
3384         return self._parse_jwplayer_data(
3385             jwplayer_data, video_id, *args, **kwargs)
3386
3387     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3388                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3389         # JWPlayer backward compatibility: flattened playlists
3390         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3391         if 'playlist' not in jwplayer_data:
3392             jwplayer_data = {'playlist': [jwplayer_data]}
3393
3394         entries = []
3395
3396         # JWPlayer backward compatibility: single playlist item
3397         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3398         if not isinstance(jwplayer_data['playlist'], list):
3399             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3400
3401         for video_data in jwplayer_data['playlist']:
3402             # JWPlayer backward compatibility: flattened sources
3403             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3404             if 'sources' not in video_data:
3405                 video_data['sources'] = [video_data]
3406
3407             this_video_id = video_id or video_data['mediaid']
3408
3409             formats = self._parse_jwplayer_formats(
3410                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3411                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3412
3413             subtitles = {}
3414             tracks = video_data.get('tracks')
3415             if tracks and isinstance(tracks, list):
3416                 for track in tracks:
3417                     if not isinstance(track, dict):
3418                         continue
3419                     track_kind = track.get('kind')
3420                     if not track_kind or not isinstance(track_kind, compat_str):
3421                         continue
3422                     if track_kind.lower() not in ('captions', 'subtitles'):
3423                         continue
3424                     track_url = urljoin(base_url, track.get('file'))
3425                     if not track_url:
3426                         continue
3427                     subtitles.setdefault(track.get('label') or 'en', []).append({
3428                         'url': self._proto_relative_url(track_url)
3429                     })
3430
3431             entry = {
3432                 'id': this_video_id,
3433                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3434                 'description': clean_html(video_data.get('description')),
3435                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3436                 'timestamp': int_or_none(video_data.get('pubdate')),
3437                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3438                 'subtitles': subtitles,
3439             }
3440             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3441             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3442                 entry.update({
3443                     '_type': 'url_transparent',
3444                     'url': formats[0]['url'],
3445                 })
3446             else:
3447                 self._sort_formats(formats)
3448                 entry['formats'] = formats
3449             entries.append(entry)
3450         if len(entries) == 1:
3451             return entries[0]
3452         else:
3453             return self.playlist_result(entries)
3454
3455     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3456                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3457         urls = []
3458         formats = []
3459         for source in jwplayer_sources_data:
3460             if not isinstance(source, dict):
3461                 continue
3462             source_url = urljoin(
3463                 base_url, self._proto_relative_url(source.get('file')))
3464             if not source_url or source_url in urls:
3465                 continue
3466             urls.append(source_url)
3467             source_type = source.get('type') or ''
3468             ext = mimetype2ext(source_type) or determine_ext(source_url)
3469             if source_type == 'hls' or ext == 'm3u8':
3470                 formats.extend(self._extract_m3u8_formats(
3471                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3472                     m3u8_id=m3u8_id, fatal=False))
3473             elif source_type == 'dash' or ext == 'mpd':
3474                 formats.extend(self._extract_mpd_formats(
3475                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3476             elif ext == 'smil':
3477                 formats.extend(self._extract_smil_formats(
3478                     source_url, video_id, fatal=False))
3479             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3480             elif source_type.startswith('audio') or ext in (
3481                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3482                 formats.append({
3483                     'url': source_url,
3484                     'vcodec': 'none',
3485                     'ext': ext,
3486                 })
3487             else:
3488                 height = int_or_none(source.get('height'))
3489                 if height is None:
3490                     # Often no height is provided but there is a label in
3491                     # format like "1080p", "720p SD", or 1080.
3492                     height = int_or_none(self._search_regex(
3493                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3494                         'height', default=None))
3495                 a_format = {
3496                     'url': source_url,
3497                     'width': int_or_none(source.get('width')),
3498                     'height': height,
3499                     'tbr': int_or_none(source.get('bitrate')),
3500                     'ext': ext,
3501                 }
3502                 if source_url.startswith('rtmp'):
3503                     a_format['ext'] = 'flv'
3504                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3505                     # of jwplayer.flash.swf
3506                     rtmp_url_parts = re.split(
3507                         r'((?:mp4|mp3|flv):)', source_url, 1)
3508                     if len(rtmp_url_parts) == 3:
3509                         rtmp_url, prefix, play_path = rtmp_url_parts
3510                         a_format.update({
3511                             'url': rtmp_url,
3512                             'play_path': prefix + play_path,
3513                         })
3514                     if rtmp_params:
3515                         a_format.update(rtmp_params)
3516                 formats.append(a_format)
3517         return formats
3518
3519     def _live_title(self, name):
3520         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3521         return name
3522
3523     def _int(self, v, name, fatal=False, **kwargs):
3524         res = int_or_none(v, **kwargs)
3525         if res is None:
3526             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3527             if fatal:
3528                 raise ExtractorError(msg)
3529             else:
3530                 self.report_warning(msg)
3531         return res
3532
3533     def _float(self, v, name, fatal=False, **kwargs):
3534         res = float_or_none(v, **kwargs)
3535         if res is None:
3536             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3537             if fatal:
3538                 raise ExtractorError(msg)
3539             else:
3540                 self.report_warning(msg)
3541         return res
3542
3543     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3544                     path='/', secure=False, discard=False, rest={}, **kwargs):
3545         cookie = compat_cookiejar_Cookie(
3546             0, name, value, port, port is not None, domain, True,
3547             domain.startswith('.'), path, True, secure, expire_time,
3548             discard, None, None, rest)
3549         self._downloader.cookiejar.set_cookie(cookie)
3550
3551     def _get_cookies(self, url):
3552         """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3553         req = sanitized_Request(url)
3554         self._downloader.cookiejar.add_cookie_header(req)
3555         return compat_cookies_SimpleCookie(req.get_header('Cookie'))
3556
3557     def _apply_first_set_cookie_header(self, url_handle, cookie):
3558         """
3559         Apply first Set-Cookie header instead of the last. Experimental.
3560
3561         Some sites (e.g. [1-3]) may serve two cookies under the same name
3562         in Set-Cookie header and expect the first (old) one to be set rather
3563         than second (new). However, as of RFC6265 the newer one cookie
3564         should be set into cookie store what actually happens.
3565         We will workaround this issue by resetting the cookie to
3566         the first one manually.
3567         1. https://new.vk.com/
3568         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3569         3. https://learning.oreilly.com/
3570         """
3571         for header, cookies in url_handle.headers.items():
3572             if header.lower() != 'set-cookie':
3573                 continue
3574             if sys.version_info[0] >= 3:
3575                 cookies = cookies.encode('iso-8859-1')
3576             cookies = cookies.decode('utf-8')
3577             cookie_value = re.search(
3578                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3579             if cookie_value:
3580                 value, domain = cookie_value.groups()
3581                 self._set_cookie(domain, cookie, value)
3582                 break
3583
3584     def get_testcases(self, include_onlymatching=False):
3585         t = getattr(self, '_TEST', None)
3586         if t:
3587             assert not hasattr(self, '_TESTS'), \
3588                 '%s has _TEST and _TESTS' % type(self).__name__
3589             tests = [t]
3590         else:
3591             tests = getattr(self, '_TESTS', [])
3592         for t in tests:
3593             if not include_onlymatching and t.get('only_matching', False):
3594                 continue
3595             t['name'] = type(self).__name__[:-len('IE')]
3596             yield t
3597
3598     def is_suitable(self, age_limit):
3599         """ Test whether the extractor is generally suitable for the given
3600         age limit (i.e. pornographic sites are not, all others usually are) """
3601
3602         any_restricted = False
3603         for tc in self.get_testcases(include_onlymatching=False):
3604             if tc.get('playlist', []):
3605                 tc = tc['playlist'][0]
3606             is_restricted = age_restricted(
3607                 tc.get('info_dict', {}).get('age_limit'), age_limit)
3608             if not is_restricted:
3609                 return True
3610             any_restricted = any_restricted or is_restricted
3611         return not any_restricted
3612
3613     def extract_subtitles(self, *args, **kwargs):
3614         if (self.get_param('writesubtitles', False)
3615                 or self.get_param('listsubtitles')):
3616             return self._get_subtitles(*args, **kwargs)
3617         return {}
3618
3619     def _get_subtitles(self, *args, **kwargs):
3620         raise NotImplementedError('This method must be implemented by subclasses')
3621
3622     def extract_comments(self, *args, **kwargs):
3623         if not self.get_param('getcomments'):
3624             return None
3625         generator = self._get_comments(*args, **kwargs)
3626
3627         def extractor():
3628             comments = []
3629             interrupted = True
3630             try:
3631                 while True:
3632                     comments.append(next(generator))
3633             except StopIteration:
3634                 interrupted = False
3635             except KeyboardInterrupt:
3636                 self.to_screen('Interrupted by user')
3637             except Exception as e:
3638                 if self.get_param('ignoreerrors') is not True:
3639                     raise
3640                 self._downloader.report_error(e)
3641             comment_count = len(comments)
3642             self.to_screen(f'Extracted {comment_count} comments')
3643             return {
3644                 'comments': comments,
3645                 'comment_count': None if interrupted else comment_count
3646             }
3647         return extractor
3648
3649     def _get_comments(self, *args, **kwargs):
3650         raise NotImplementedError('This method must be implemented by subclasses')
3651
3652     @staticmethod
3653     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3654         """ Merge subtitle items for one language. Items with duplicated URLs
3655         will be dropped. """
3656         list1_urls = set([item['url'] for item in subtitle_list1])
3657         ret = list(subtitle_list1)
3658         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
3659         return ret
3660
3661     @classmethod
3662     def _merge_subtitles(cls, *dicts, target=None):
3663         """ Merge subtitle dictionaries, language by language. """
3664         if target is None:
3665             target = {}
3666         for d in dicts:
3667             for lang, subs in d.items():
3668                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3669         return target
3670
3671     def extract_automatic_captions(self, *args, **kwargs):
3672         if (self.get_param('writeautomaticsub', False)
3673                 or self.get_param('listsubtitles')):
3674             return self._get_automatic_captions(*args, **kwargs)
3675         return {}
3676
3677     def _get_automatic_captions(self, *args, **kwargs):
3678         raise NotImplementedError('This method must be implemented by subclasses')
3679
3680     def mark_watched(self, *args, **kwargs):
3681         if not self.get_param('mark_watched', False):
3682             return
3683         if (hasattr(self, '_NETRC_MACHINE') and self._get_login_info()[0] is not None
3684                 or self.get_param('cookiefile')
3685                 or self.get_param('cookiesfrombrowser')):
3686             self._mark_watched(*args, **kwargs)
3687
3688     def _mark_watched(self, *args, **kwargs):
3689         raise NotImplementedError('This method must be implemented by subclasses')
3690
3691     def geo_verification_headers(self):
3692         headers = {}
3693         geo_verification_proxy = self.get_param('geo_verification_proxy')
3694         if geo_verification_proxy:
3695             headers['Ytdl-request-proxy'] = geo_verification_proxy
3696         return headers
3697
3698     def _generic_id(self, url):
3699         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3700
3701     def _generic_title(self, url):
3702         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3703
3704     @staticmethod
3705     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3706         all_known = all(map(
3707             lambda x: x is not None,
3708             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3709         return (
3710             'private' if is_private
3711             else 'premium_only' if needs_premium
3712             else 'subscriber_only' if needs_subscription
3713             else 'needs_auth' if needs_auth
3714             else 'unlisted' if is_unlisted
3715             else 'public' if all_known
3716             else None)
3717
3718     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3719         '''
3720         @returns            A list of values for the extractor argument given by "key"
3721                             or "default" if no such key is present
3722         @param default      The default value to return when the key is not present (default: [])
3723         @param casesense    When false, the values are converted to lower case
3724         '''
3725         val = traverse_obj(
3726             self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key))
3727         if val is None:
3728             return [] if default is NO_DEFAULT else default
3729         return list(val) if casesense else [x.lower() for x in val]
3730
3731     def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3732         if not playlist_id or not video_id:
3733             return not video_id
3734
3735         no_playlist = (smuggled_data or {}).get('force_noplaylist')
3736         if no_playlist is not None:
3737             return not no_playlist
3738
3739         video_id = '' if video_id is True else f' {video_id}'
3740         playlist_id = '' if playlist_id is True else f' {playlist_id}'
3741         if self.get_param('noplaylist'):
3742             self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3743             return False
3744         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3745         return True
3746
3747
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search query extractors.

    Queries have the form _SEARCH_KEY<prefix>:<query>, where <prefix> is
    empty (one result), a positive integer (that many results) or "all"
    (up to _MAX_RESULTS results).
    Subclasses should define _SEARCH_KEY and optionally _MAX_RESULTS.
    """

    # Upper bound on the number of results; unlimited unless overridden
    _MAX_RESULTS = float('inf')

    @classmethod
    def _make_valid_url(cls):
        """Build the regex that matches search queries for this extractor"""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    def _real_extract(self, query):
        """Split the query into prefix and search terms and fetch the
        number of results that the prefix asks for"""
        prefix, query = self._match_valid_url(query).group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        requested = int(prefix)
        if requested <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (requested, query))
        if requested > self._MAX_RESULTS:
            # Clamp to what the service can deliver, but tell the user
            self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, requested))
            requested = self._MAX_RESULTS
        return self._get_n_results(query, requested)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        results = self._search_results(query)
        # islice() needs None, not infinity, to mean "no limit"
        limit = None if n == float('inf') else n
        return self.playlist_result(itertools.islice(results, 0, limit), query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        """Read-only access to the class's _SEARCH_KEY"""
        return self._SEARCH_KEY