yt_dlp/extractor/common.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import base64
   5 import collections
   6 import hashlib
   7 import itertools
   8 import json
   9 import netrc
  10 import os
  11 import random
  12 import re
  13 import sys
  14 import time
  15 import math
  16
  17 from ..compat import (
  18     compat_cookiejar_Cookie,
  19     compat_cookies_SimpleCookie,
  20     compat_etree_Element,
  21     compat_etree_fromstring,
  22     compat_expanduser,
  23     compat_getpass,
  24     compat_http_client,
  25     compat_os_name,
  26     compat_str,
  27     compat_urllib_error,
  28     compat_urllib_parse_unquote,
  29     compat_urllib_parse_urlencode,
  30     compat_urllib_request,
  31     compat_urlparse,
  32     compat_xml_parse_error,
  33 )
  34 from ..downloader import FileDownloader
  35 from ..downloader.f4m import (
  36     get_base_url,
  37     remove_encrypted_media,
  38 )
  39 from ..utils import (
  40     age_restricted,
  41     base_url,
  42     bug_reports_message,
  43     clean_html,
  44     compiled_regex_type,
  45     determine_ext,
  46     determine_protocol,
  47     dict_get,
  48     encode_data_uri,
  49     error_to_compat_str,
  50     extract_attributes,
  51     ExtractorError,
  52     fix_xml_ampersands,
  53     float_or_none,
  54     format_field,
  55     GeoRestrictedError,
  56     GeoUtils,
  57     int_or_none,
  58     join_nonempty,
  59     js_to_json,
  60     JSON_LD_RE,
  61     mimetype2ext,
  62     network_exceptions,
  63     NO_DEFAULT,
  64     orderedSet,
  65     parse_bitrate,
  66     parse_codecs,
  67     parse_duration,
  68     parse_iso8601,
  69     parse_m3u8_attributes,
  70     parse_resolution,
  71     RegexNotFoundError,
  72     sanitize_filename,
  73     sanitized_Request,
  74     str_or_none,
  75     str_to_int,
  76     strip_or_none,
  77     traverse_obj,
  78     unescapeHTML,
  79     UnsupportedError,
  80     unified_strdate,
  81     unified_timestamp,
  82     update_Request,
  83     update_url_query,
  84     url_basename,
  85     url_or_none,
  86     urljoin,
  87     variadic,
  88     xpath_element,
  89     xpath_text,
  90     xpath_with_ns,
  91 )
  92
  93
  94 class InfoExtractor(object):
  95     """Information Extractor class.
  96
  97     Information extractors are the classes that, given a URL, extract
  98     information about the video (or videos) the URL refers to. This
  99     information includes the real video URL, the video title, author and
 100     others. The information is stored in a dictionary which is then
 101     passed to the YoutubeDL. The YoutubeDL processes this
 102     information possibly downloading the video to the file system, among
 103     other possible outcomes.
 104
 105     The type field determines the type of the result.
 106     By far the most common value (and the default if _type is missing) is
 107     "video", which indicates a single video.
 108
 109     For a video, the dictionaries must include the following fields:
 110
 111     id:             Video identifier.
 112     title:          Video title, unescaped.
 113
 114     Additionally, it must contain either a formats entry or a url one:
 115
 116     formats:        A list of dictionaries for each format available, ordered
 117                     from worst to best quality.
 118
 119                     Potential fields:
 120                     * url        The mandatory URL representing the media:
 121                                    for plain file media - HTTP URL of this file,
 122                                    for RTMP - RTMP URL,
 123                                    for HLS - URL of the M3U8 media playlist,
 124                                    for HDS - URL of the F4M manifest,
 125                                    for DASH
 126                                      - HTTP URL to plain file media (in case of
 127                                        unfragmented media)
 128                                      - URL of the MPD manifest or base URL
 129                                        representing the media if MPD manifest
 130                                        is parsed from a string (in case of
 131                                        fragmented media)
 132                                    for MSS - URL of the ISM manifest.
 133                     * manifest_url
 134                                  The URL of the manifest file in case of
 135                                  fragmented media:
 136                                    for HLS - URL of the M3U8 master playlist,
 137                                    for HDS - URL of the F4M manifest,
 138                                    for DASH - URL of the MPD manifest,
 139                                    for MSS - URL of the ISM manifest.
 140                     * ext        Will be calculated from URL if missing
 141                     * format     A human-readable description of the format
 142                                  ("mp4 container with h264/opus").
 143                                  Calculated from the format_id, width, height.
 144                                  and format_note fields if missing.
 145                     * format_id  A short description of the format
 146                                  ("mp4_h264_opus" or "19").
 147                                 Technically optional, but strongly recommended.
 148                     * format_note Additional info about the format
 149                                  ("3D" or "DASH video")
 150                     * width      Width of the video, if known
 151                     * height     Height of the video, if known
 152                     * resolution Textual description of width and height
 153                     * dynamic_range The dynamic range of the video. One of:
 154                                  "SDR" (None), "HDR10", "HDR10+, "HDR12", "HLG, "DV"
 155                     * tbr        Average bitrate of audio and video in KBit/s
 156                     * abr        Average audio bitrate in KBit/s
 157                     * acodec     Name of the audio codec in use
 158                     * asr        Audio sampling rate in Hertz
 159                     * vbr        Average video bitrate in KBit/s
 160                     * fps        Frame rate
 161                     * vcodec     Name of the video codec in use
 162                     * container  Name of the container format
 163                     * filesize   The number of bytes, if known in advance
 164                     * filesize_approx  An estimate for the number of bytes
 165                     * player_url SWF Player URL (used for rtmpdump).
 166                     * protocol   The protocol that will be used for the actual
 167                                  download, lower-case. One of "http", "https" or
 168                                  one of the protocols defined in downloader.PROTOCOL_MAP
 169                     * fragment_base_url
 170                                  Base URL for fragments. Each fragment's path
 171                                  value (if present) will be relative to
 172                                  this URL.
 173                     * fragments  A list of fragments of a fragmented media.
 174                                  Each fragment entry must contain either an url
 175                                  or a path. If an url is present it should be
 176                                  considered by a client. Otherwise both path and
 177                                  fragment_base_url must be present. Here is
 178                                  the list of all potential fields:
 179                                  * "url" - fragment's URL
 180                                  * "path" - fragment's path relative to
 181                                             fragment_base_url
 182                                  * "duration" (optional, int or float)
 183                                  * "filesize" (optional, int)
 184                     * is_from_start  Is a live format that can be downloaded
 185                                 from the start. Boolean
 186                     * preference Order number of this format. If this field is
 187                                  present and not None, the formats get sorted
 188                                  by this field, regardless of all other values.
 189                                  -1 for default (order by other properties),
 190                                  -2 or smaller for less than default.
 191                                  < -1000 to hide the format (if there is
 192                                     another one which is strictly better)
 193                     * language   Language code, e.g. "de" or "en-US".
 194                     * language_preference  Is this in the language mentioned in
 195                                  the URL?
 196                                  10 if it's what the URL is about,
 197                                  -1 for default (don't know),
 198                                  -10 otherwise, other values reserved for now.
 199                     * quality    Order number of the video quality of this
 200                                  format, irrespective of the file format.
 201                                  -1 for default (order by other properties),
 202                                  -2 or smaller for less than default.
 203                     * source_preference  Order number for this video source
 204                                   (quality takes higher priority)
 205                                  -1 for default (order by other properties),
 206                                  -2 or smaller for less than default.
 207                     * http_headers  A dictionary of additional HTTP headers
 208                                  to add to the request.
 209                     * stretched_ratio  If given and not 1, indicates that the
 210                                  video's pixels are not square.
 211                                  width : height ratio as float.
 212                     * no_resume  The server does not support resuming the
 213                                  (HTTP or RTMP) download. Boolean.
 214                     * has_drm    The format has DRM and cannot be downloaded. Boolean
 215                     * downloader_options  A dictionary of downloader options as
 216                                  described in FileDownloader
 217                     RTMP formats can also have the additional fields: page_url,
 218                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 219                     rtmp_protocol, rtmp_real_time
 220
 221     url:            Final video URL.
 222     ext:            Video filename extension.
 223     format:         The video format, defaults to ext (used for --get-format)
 224     player_url:     SWF Player URL (used for rtmpdump).
 225
 226     The following fields are optional:
 227
 228     alt_title:      A secondary title of the video.
 229     display_id      An alternative identifier for the video, not necessarily
 230                     unique, but available before title. Typically, id is
 231                     something like "4234987", title "Dancing naked mole rats",
 232                     and display_id "dancing-naked-mole-rats"
 233     thumbnails:     A list of dictionaries, with the following entries:
 234                         * "id" (optional, string) - Thumbnail format ID
 235                         * "url"
 236                         * "preference" (optional, int) - quality of the image
 237                         * "width" (optional, int)
 238                         * "height" (optional, int)
 239                         * "resolution" (optional, string "{width}x{height}",
 240                                         deprecated)
 241                         * "filesize" (optional, int)
 242     thumbnail:      Full URL to a video thumbnail image.
 243     description:    Full video description.
 244     uploader:       Full name of the video uploader.
 245     license:        License name the video is licensed under.
 246     creator:        The creator of the video.
 247     timestamp:      UNIX timestamp of the moment the video was uploaded
 248     upload_date:    Video upload date (YYYYMMDD).
 249                     If not explicitly set, calculated from timestamp
 250     release_timestamp: UNIX timestamp of the moment the video was released.
 251                     If it is not clear whether to use timestamp or this, use the former
 252     release_date:   The date (YYYYMMDD) when the video was released.
 253                     If not explicitly set, calculated from release_timestamp
 254     modified_timestamp: UNIX timestamp of the moment the video was last modified.
 255     modified_date:   The date (YYYYMMDD) when the video was last modified.
 256                     If not explicitly set, calculated from modified_timestamp
 257     uploader_id:    Nickname or id of the video uploader.
 258     uploader_url:   Full URL to a personal webpage of the video uploader.
 259     channel:        Full name of the channel the video is uploaded on.
 260                     Note that channel fields may or may not repeat uploader
 261                     fields. This depends on a particular extractor.
 262     channel_id:     Id of the channel.
 263     channel_url:    Full URL to a channel webpage.
 264     channel_follower_count: Number of followers of the channel.
 265     location:       Physical location where the video was filmed.
 266     subtitles:      The available subtitles as a dictionary in the format
 267                     {tag: subformats}. "tag" is usually a language code, and
 268                     "subformats" is a list sorted from lower to higher
 269                     preference, each element is a dictionary with the "ext"
 270                     entry and one of:
 271                         * "data": The subtitles file contents
 272                         * "url": A URL pointing to the subtitles file
 273                     It can optionally also have:
 274                         * "name": Name or description of the subtitles
 275                     "ext" will be calculated from URL if missing
 276     automatic_captions: Like 'subtitles'; contains automatically generated
 277                     captions instead of normal subtitles
 278     duration:       Length of the video in seconds, as an integer or float.
 279     view_count:     How many users have watched the video on the platform.
 280     like_count:     Number of positive ratings of the video
 281     dislike_count:  Number of negative ratings of the video
 282     repost_count:   Number of reposts of the video
 283     average_rating: Average rating give by users, the scale used depends on the webpage
 284     comment_count:  Number of comments on the video
 285     comments:       A list of comments, each with one or more of the following
 286                     properties (all but one of text or html optional):
 287                         * "author" - human-readable name of the comment author
 288                         * "author_id" - user ID of the comment author
 289                         * "author_thumbnail" - The thumbnail of the comment author
 290                         * "id" - Comment ID
 291                         * "html" - Comment as HTML
 292                         * "text" - Plain text of the comment
 293                         * "timestamp" - UNIX timestamp of comment
 294                         * "parent" - ID of the comment this one is replying to.
 295                                      Set to "root" to indicate that this is a
 296                                      comment to the original video.
 297                         * "like_count" - Number of positive ratings of the comment
 298                         * "dislike_count" - Number of negative ratings of the comment
 299                         * "is_favorited" - Whether the comment is marked as
 300                                            favorite by the video uploader
 301                         * "author_is_uploader" - Whether the comment is made by
 302                                                  the video uploader
 303     age_limit:      Age restriction for the video, as an integer (years)
 304     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 305                     should allow to get the same result again. (It will be set
 306                     by YoutubeDL if it's missing)
 307     categories:     A list of categories that the video falls in, for example
 308                     ["Sports", "Berlin"]
 309     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 310     cast:           A list of the video cast
 311     is_live:        True, False, or None (=unknown). Whether this video is a
 312                     live stream that goes on instead of a fixed-length video.
 313     was_live:       True, False, or None (=unknown). Whether this video was
 314                     originally a live stream.
 315     live_status:    'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
 316                     If absent, automatically set from is_live, was_live
 317     start_time:     Time in seconds where the reproduction should start, as
 318                     specified in the URL.
 319     end_time:       Time in seconds where the reproduction should end, as
 320                     specified in the URL.
 321     chapters:       A list of dictionaries, with the following entries:
 322                         * "start_time" - The start time of the chapter in seconds
 323                         * "end_time" - The end time of the chapter in seconds
 324                         * "title" (optional, string)
 325     playable_in_embed: Whether this video is allowed to play in embedded
 326                     players on other sites. Can be True (=always allowed),
 327                     False (=never allowed), None (=unknown), or a string
 328                     specifying the criteria for embedability (Eg: 'whitelist')
 329     availability:   Under what condition the video is available. One of
 330                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 331                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 332                     to set it
 333     __post_extractor: A function to be called just before the metadata is
 334                     written to either disk, logger or console. The function
 335                     must return a dict which will be added to the info_dict.
 336                     This is usefull for additional information that is
 337                     time-consuming to extract. Note that the fields thus
 338                     extracted will not be available to output template and
 339                     match_filter. So, only "comments" and "comment_count" are
 340                     currently allowed to be extracted via this method.
 341
 342     The following fields should only be used when the video belongs to some logical
 343     chapter or section:
 344
 345     chapter:        Name or title of the chapter the video belongs to.
 346     chapter_number: Number of the chapter the video belongs to, as an integer.
 347     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 348
 349     The following fields should only be used when the video is an episode of some
 350     series, programme or podcast:
 351
 352     series:         Title of the series or programme the video episode belongs to.
 353     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 354     season:         Title of the season the video episode belongs to.
 355     season_number:  Number of the season the video episode belongs to, as an integer.
 356     season_id:      Id of the season the video episode belongs to, as a unicode string.
 357     episode:        Title of the video episode. Unlike mandatory video title field,
 358                     this field should denote the exact title of the video episode
 359                     without any kind of decoration.
 360     episode_number: Number of the video episode within a season, as an integer.
 361     episode_id:     Id of the video episode, as a unicode string.
 362
 363     The following fields should only be used when the media is a track or a part of
 364     a music album:
 365
 366     track:          Title of the track.
 367     track_number:   Number of the track within an album or a disc, as an integer.
 368     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 369                     as a unicode string.
 370     artist:         Artist(s) of the track.
 371     genre:          Genre(s) of the track.
 372     album:          Title of the album the track belongs to.
 373     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 374     album_artist:   List of all artists appeared on the album (e.g.
 375                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 376                     and compilations).
 377     disc_number:    Number of the disc or other physical medium the track belongs to,
 378                     as an integer.
 379     release_year:   Year (YYYY) when the album was released.
 380     composer:       Composer of the piece
 381
 382     Unless mentioned otherwise, the fields should be Unicode strings.
 383
 384     Unless mentioned otherwise, None is equivalent to absence of information.
 385
 386
 387     _type "playlist" indicates multiple videos.
 388     There must be a key "entries", which is a list, an iterable, or a PagedList
 389     object, each element of which is a valid dictionary by this specification.
 390
 391     Additionally, playlists can have "id", "title", and any other relevent
 392     attributes with the same semantics as videos (see above).
 393
 394     It can also have the following optional fields:
 395
 396     playlist_count: The total number of videos in a playlist. If not given,
 397                     YoutubeDL tries to calculate it from "entries"
 398
 399
 400     _type "multi_video" indicates that there are multiple videos that
 401     form a single show, for examples multiple acts of an opera or TV episode.
 402     It must have an entries key like a playlist and contain all the keys
 403     required for a video at the same time.
 404
 405
 406     _type "url" indicates that the video must be extracted from another
 407     location, possibly by a different extractor. Its only required key is:
 408     "url" - the next URL to extract.
 409     The key "ie_key" can be set to the class name (minus the trailing "IE",
 410     e.g. "Youtube") if the extractor class is known in advance.
 411     Additionally, the dictionary may have any properties of the resolved entity
 412     known in advance, for example "title" if the title of the referred video is
 413     known ahead of time.
 414
 415
 416     _type "url_transparent" entities have the same specification as "url", but
 417     indicate that the given additional information is more precise than the one
 418     associated with the resolved URL.
 419     This is useful when a site employs a video service that hosts the video and
 420     its technical metadata, but that video service does not embed a useful
 421     title, description etc.
 422
 423
 424     Subclasses of this one should re-define the _real_initialize() and
 425     _real_extract() methods and define a _VALID_URL regexp.
 426     Probably, they should also be added to the list of extractors.
 427
 428     Subclasses may also override suitable() if necessary, but ensure the function
 429     signature is preserved and that this function imports everything it needs
 430     (except other extractors), so that lazy_extractors works correctly
 431
 432     _GEO_BYPASS attribute may be set to False in order to disable
 433     geo restriction bypass mechanisms for a particular extractor.
 434     Though it won't disable explicit geo restriction bypass based on
 435     country code provided with geo_bypass_country.
 436
 437     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 438     countries for this extractor. One of these countries will be used by
 439     geo restriction bypass mechanism right away in order to bypass
 440     geo restriction, of course, if the mechanism is not disabled.
 441
 442     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 443     IP blocks in CIDR notation for this extractor. One of these IP blocks
 444     will be used by geo restriction bypass mechanism similarly
 445     to _GEO_COUNTRIES.
 446
 447     The _WORKING attribute should be set to False for broken IEs
 448     in order to warn the users and skip the tests.
 449     """
 450
 451     _ready = False
 452     _downloader = None
 453     _x_forwarded_for_ip = None
 454     _GEO_BYPASS = True
 455     _GEO_COUNTRIES = None
 456     _GEO_IP_BLOCKS = None
 457     _WORKING = True
 458
 459     _LOGIN_HINTS = {
 460         'any': 'Use --cookies, --username and --password, or --netrc to provide account credentials',
 461         'cookies': (
 462             'Use --cookies-from-browser or --cookies for the authentication. '
 463             'See  https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl  for how to manually pass cookies'),
 464         'password': 'Use --username and --password, or --netrc to provide account credentials',
 465     }
 466
 467     def __init__(self, downloader=None):
 468         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 469         If a downloader is not passed during initialization,
 470         it must be set using "set_downloader()" before "extract()" is called"""
 471         self._ready = False
 472         self._x_forwarded_for_ip = None
 473         self._printed_messages = set()
 474         self.set_downloader(downloader)
 475
 476     @classmethod
 477     def _match_valid_url(cls, url):
 478         # This does not use has/getattr intentionally - we want to know whether
 479         # we have cached the regexp for *this* class, whereas getattr would also
 480         # match the superclass
 481         if '_VALID_URL_RE' not in cls.__dict__:
 482             if '_VALID_URL' not in cls.__dict__:
 483                 cls._VALID_URL = cls._make_valid_url()
 484             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 485         return cls._VALID_URL_RE.match(url)
 486
 487     @classmethod
 488     def suitable(cls, url):
 489         """Receives a URL and returns True if suitable for this IE."""
 490         # This function must import everything it needs (except other extractors),
 491         # so that lazy_extractors works correctly
 492         return cls._match_valid_url(url) is not None
 493
 494     @classmethod
 495     def _match_id(cls, url):
 496         return cls._match_valid_url(url).group('id')
 497
 498     @classmethod
 499     def get_temp_id(cls, url):
 500         try:
 501             return cls._match_id(url)
 502         except (IndexError, AttributeError):
 503             return None
 504
 505     @classmethod
 506     def working(cls):
 507         """Getter method for _WORKING."""
 508         return cls._WORKING
 509
 510     def initialize(self):
 511         """Initializes an instance (authentication, etc)."""
 512         self._printed_messages = set()
 513         self._initialize_geo_bypass({
 514             'countries': self._GEO_COUNTRIES,
 515             'ip_blocks': self._GEO_IP_BLOCKS,
 516         })
 517         if not self._ready:
 518             self._real_initialize()
 519             self._ready = True
 520
 521     def _initialize_geo_bypass(self, geo_bypass_context):
 522         """
 523         Initialize geo restriction bypass mechanism.
 524
 525         This method is used to initialize geo bypass mechanism based on faking
 526         X-Forwarded-For HTTP header. A random country from provided country list
 527         is selected and a random IP belonging to this country is generated. This
 528         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 529         HTTP requests.
 530
 531         This method will be used for initial geo bypass mechanism initialization
 532         during the instance initialization with _GEO_COUNTRIES and
 533         _GEO_IP_BLOCKS.
 534
 535         You may also manually call it from extractor's code if geo bypass
 536         information is not available beforehand (e.g. obtained during
 537         extraction) or due to some other reason. In this case you should pass
 538         this information in geo bypass context passed as first argument. It may
 539         contain following fields:
 540
 541         countries:  List of geo unrestricted countries (similar
 542                     to _GEO_COUNTRIES)
 543         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 544                     (similar to _GEO_IP_BLOCKS)
 545
 546         """
 547         if not self._x_forwarded_for_ip:
 548
 549             # Geo bypass mechanism is explicitly disabled by user
 550             if not self.get_param('geo_bypass', True):
 551                 return
 552
 553             if not geo_bypass_context:
 554                 geo_bypass_context = {}
 555
 556             # Backward compatibility: previously _initialize_geo_bypass
 557             # expected a list of countries, some 3rd party code may still use
 558             # it this way
 559             if isinstance(geo_bypass_context, (list, tuple)):
 560                 geo_bypass_context = {
 561                     'countries': geo_bypass_context,
 562                 }
 563
 564             # The whole point of geo bypass mechanism is to fake IP
 565             # as X-Forwarded-For HTTP header based on some IP block or
 566             # country code.
 567
 568             # Path 1: bypassing based on IP block in CIDR notation
 569
 570             # Explicit IP block specified by user, use it right away
 571             # regardless of whether extractor is geo bypassable or not
 572             ip_block = self.get_param('geo_bypass_ip_block', None)
 573
 574             # Otherwise use random IP block from geo bypass context but only
 575             # if extractor is known as geo bypassable
 576             if not ip_block:
 577                 ip_blocks = geo_bypass_context.get('ip_blocks')
 578                 if self._GEO_BYPASS and ip_blocks:
 579                     ip_block = random.choice(ip_blocks)
 580
 581             if ip_block:
 582                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 583                 self._downloader.write_debug(
 584                     '[debug] Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
 585                 return
 586
 587             # Path 2: bypassing based on country code
 588
 589             # Explicit country code specified by user, use it right away
 590             # regardless of whether extractor is geo bypassable or not
 591             country = self.get_param('geo_bypass_country', None)
 592
 593             # Otherwise use random country code from geo bypass context but
 594             # only if extractor is known as geo bypassable
 595             if not country:
 596                 countries = geo_bypass_context.get('countries')
 597                 if self._GEO_BYPASS and countries:
 598                     country = random.choice(countries)
 599
 600             if country:
 601                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 602                 self._downloader.write_debug(
 603                     'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))
 604
 605     def extract(self, url):
 606         """Extracts URL information and returns it in list of dicts."""
 607         try:
 608             for _ in range(2):
 609                 try:
 610                     self.initialize()
 611                     self.write_debug('Extracting URL: %s' % url)
 612                     ie_result = self._real_extract(url)
 613                     if ie_result is None:
 614                         return None
 615                     if self._x_forwarded_for_ip:
 616                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 617                     subtitles = ie_result.get('subtitles')
 618                     if (subtitles and 'live_chat' in subtitles
 619                             and 'no-live-chat' in self.get_param('compat_opts', [])):
 620                         del subtitles['live_chat']
 621                     return ie_result
 622                 except GeoRestrictedError as e:
 623                     if self.__maybe_fake_ip_and_retry(e.countries):
 624                         continue
 625                     raise
 626         except UnsupportedError:
 627             raise
 628         except ExtractorError as e:
 629             kwargs = {
 630                 'video_id': e.video_id or self.get_temp_id(url),
 631                 'ie': self.IE_NAME,
 632                 'tb': e.traceback or sys.exc_info()[2],
 633                 'expected': e.expected,
 634                 'cause': e.cause
 635             }
 636             if hasattr(e, 'countries'):
 637                 kwargs['countries'] = e.countries
 638             raise type(e)(e.msg, **kwargs)
 639         except compat_http_client.IncompleteRead as e:
 640             raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
 641         except (KeyError, StopIteration) as e:
 642             raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 643
 644     def __maybe_fake_ip_and_retry(self, countries):
 645         if (not self.get_param('geo_bypass_country', None)
 646                 and self._GEO_BYPASS
 647                 and self.get_param('geo_bypass', True)
 648                 and not self._x_forwarded_for_ip
 649                 and countries):
 650             country_code = random.choice(countries)
 651             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 652             if self._x_forwarded_for_ip:
 653                 self.report_warning(
 654                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 655                     % (self._x_forwarded_for_ip, country_code.upper()))
 656                 return True
 657         return False
 658
 659     def set_downloader(self, downloader):
 660         """Sets the downloader for this IE."""
 661         self._downloader = downloader
 662
 663     def _real_initialize(self):
 664         """Real initialization process. Redefine in subclasses."""
 665         pass
 666
 667     def _real_extract(self, url):
 668         """Real extraction process. Redefine in subclasses."""
 669         pass
 670
 671     @classmethod
 672     def ie_key(cls):
 673         """A string for getting the InfoExtractor with get_info_extractor"""
 674         return cls.__name__[:-2]
 675
 676     @property
 677     def IE_NAME(self):
 678         return compat_str(type(self).__name__[:-2])
 679
 680     @staticmethod
 681     def __can_accept_status_code(err, expected_status):
 682         assert isinstance(err, compat_urllib_error.HTTPError)
 683         if expected_status is None:
 684             return False
 685         elif callable(expected_status):
 686             return expected_status(err.code) is True
 687         else:
 688             return err.code in variadic(expected_status)
 689
 690     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
 691         """
 692         Return the response handle.
 693
 694         See _download_webpage docstring for arguments specification.
 695         """
 696         if not self._downloader._first_webpage_request:
 697             sleep_interval = self.get_param('sleep_interval_requests') or 0
 698             if sleep_interval > 0:
 699                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 700                 time.sleep(sleep_interval)
 701         else:
 702             self._downloader._first_webpage_request = False
 703
 704         if note is None:
 705             self.report_download_webpage(video_id)
 706         elif note is not False:
 707             if video_id is None:
 708                 self.to_screen('%s' % (note,))
 709             else:
 710                 self.to_screen('%s: %s' % (video_id, note))
 711
 712         # Some sites check X-Forwarded-For HTTP header in order to figure out
 713         # the origin of the client behind proxy. This allows bypassing geo
 714         # restriction by faking this header's value to IP that belongs to some
 715         # geo unrestricted country. We will do so once we encounter any
 716         # geo restriction error.
 717         if self._x_forwarded_for_ip:
 718             if 'X-Forwarded-For' not in headers:
 719                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 720
 721         if isinstance(url_or_request, compat_urllib_request.Request):
 722             url_or_request = update_Request(
 723                 url_or_request, data=data, headers=headers, query=query)
 724         else:
 725             if query:
 726                 url_or_request = update_url_query(url_or_request, query)
 727             if data is not None or headers:
 728                 url_or_request = sanitized_Request(url_or_request, data, headers)
 729         try:
 730             return self._downloader.urlopen(url_or_request)
 731         except network_exceptions as err:
 732             if isinstance(err, compat_urllib_error.HTTPError):
 733                 if self.__can_accept_status_code(err, expected_status):
 734                     # Retain reference to error to prevent file object from
 735                     # being closed before it can be read. Works around the
 736                     # effects of <https://bugs.python.org/issue15002>
 737                     # introduced in Python 3.4.1.
 738                     err.fp._error = err
 739                     return err.fp
 740
 741             if errnote is False:
 742                 return False
 743             if errnote is None:
 744                 errnote = 'Unable to download webpage'
 745
 746             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 747             if fatal:
 748                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 749             else:
 750                 self.report_warning(errmsg)
 751                 return False
 752
 753     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
 754         """
 755         Return a tuple (page content as string, URL handle).
 756
 757         See _download_webpage docstring for arguments specification.
 758         """
 759         # Strip hashes from the URL (#1038)
 760         if isinstance(url_or_request, (compat_str, str)):
 761             url_or_request = url_or_request.partition('#')[0]
 762
 763         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 764         if urlh is False:
 765             assert not fatal
 766             return False
 767         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 768         return (content, urlh)
 769
 770     @staticmethod
 771     def _guess_encoding_from_content(content_type, webpage_bytes):
 772         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 773         if m:
 774             encoding = m.group(1)
 775         else:
 776             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 777                           webpage_bytes[:1024])
 778             if m:
 779                 encoding = m.group(1).decode('ascii')
 780             elif webpage_bytes.startswith(b'\xff\xfe'):
 781                 encoding = 'utf-16'
 782             else:
 783                 encoding = 'utf-8'
 784
 785         return encoding
 786
 787     def __check_blocked(self, content):
 788         first_block = content[:512]
 789         if ('<title>Access to this site is blocked</title>' in content
 790                 and 'Websense' in first_block):
 791             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 792             blocked_iframe = self._html_search_regex(
 793                 r'<iframe src="([^"]+)"', content,
 794                 'Websense information URL', default=None)
 795             if blocked_iframe:
 796                 msg += ' Visit %s for more details' % blocked_iframe
 797             raise ExtractorError(msg, expected=True)
 798         if '<title>The URL you requested has been blocked</title>' in first_block:
 799             msg = (
 800                 'Access to this webpage has been blocked by Indian censorship. '
 801                 'Use a VPN or proxy server (with --proxy) to route around it.')
 802             block_msg = self._html_search_regex(
 803                 r'</h1><p>(.*?)</p>',
 804                 content, 'block message', default=None)
 805             if block_msg:
 806                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 807             raise ExtractorError(msg, expected=True)
 808         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 809                 and 'blocklist.rkn.gov.ru' in content):
 810             raise ExtractorError(
 811                 'Access to this webpage has been blocked by decision of the Russian government. '
 812                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 813                 expected=True)
 814
 815     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 816         content_type = urlh.headers.get('Content-Type', '')
 817         webpage_bytes = urlh.read()
 818         if prefix is not None:
 819             webpage_bytes = prefix + webpage_bytes
 820         if not encoding:
 821             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 822         if self.get_param('dump_intermediate_pages', False):
 823             self.to_screen('Dumping request to ' + urlh.geturl())
 824             dump = base64.b64encode(webpage_bytes).decode('ascii')
 825             self._downloader.to_screen(dump)
 826         if self.get_param('write_pages', False):
 827             basen = '%s_%s' % (video_id, urlh.geturl())
 828             trim_length = self.get_param('trim_file_name') or 240
 829             if len(basen) > trim_length:
 830                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 831                 basen = basen[:trim_length - len(h)] + h
 832             raw_filename = basen + '.dump'
 833             filename = sanitize_filename(raw_filename, restricted=True)
 834             self.to_screen('Saving request to ' + filename)
 835             # Working around MAX_PATH limitation on Windows (see
 836             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 837             if compat_os_name == 'nt':
 838                 absfilepath = os.path.abspath(filename)
 839                 if len(absfilepath) > 259:
 840                     filename = '\\\\?\\' + absfilepath
 841             with open(filename, 'wb') as outf:
 842                 outf.write(webpage_bytes)
 843
 844         try:
 845             content = webpage_bytes.decode(encoding, 'replace')
 846         except LookupError:
 847             content = webpage_bytes.decode('utf-8', 'replace')
 848
 849         self.__check_blocked(content)
 850
 851         return content
 852
 853     def _download_webpage(
 854             self, url_or_request, video_id, note=None, errnote=None,
 855             fatal=True, tries=1, timeout=5, encoding=None, data=None,
 856             headers={}, query={}, expected_status=None):
 857         """
 858         Return the data of the page as a string.
 859
 860         Arguments:
 861         url_or_request -- plain text URL as a string or
 862             a compat_urllib_request.Requestobject
 863         video_id -- Video/playlist/item identifier (string)
 864
 865         Keyword arguments:
 866         note -- note printed before downloading (string)
 867         errnote -- note printed in case of an error (string)
 868         fatal -- flag denoting whether error should be considered fatal,
 869             i.e. whether it should cause ExtractionError to be raised,
 870             otherwise a warning will be reported and extraction continued
 871         tries -- number of tries
 872         timeout -- sleep interval between tries
 873         encoding -- encoding for a page content decoding, guessed automatically
 874             when not explicitly specified
 875         data -- POST data (bytes)
 876         headers -- HTTP headers (dict)
 877         query -- URL query (dict)
 878         expected_status -- allows to accept failed HTTP requests (non 2xx
 879             status code) by explicitly specifying a set of accepted status
 880             codes. Can be any of the following entities:
 881                 - an integer type specifying an exact failed status code to
 882                   accept
 883                 - a list or a tuple of integer types specifying a list of
 884                   failed status codes to accept
 885                 - a callable accepting an actual failed status code and
 886                   returning True if it should be accepted
 887             Note that this argument does not affect success status codes (2xx)
 888             which are always accepted.
 889         """
 890
 891         success = False
 892         try_count = 0
 893         while success is False:
 894             try:
 895                 res = self._download_webpage_handle(
 896                     url_or_request, video_id, note, errnote, fatal,
 897                     encoding=encoding, data=data, headers=headers, query=query,
 898                     expected_status=expected_status)
 899                 success = True
 900             except compat_http_client.IncompleteRead as e:
 901                 try_count += 1
 902                 if try_count >= tries:
 903                     raise e
 904                 self._sleep(timeout, video_id)
 905         if res is False:
 906             return res
 907         else:
 908             content, _ = res
 909             return content
 910
 911     def _download_xml_handle(
 912             self, url_or_request, video_id, note='Downloading XML',
 913             errnote='Unable to download XML', transform_source=None,
 914             fatal=True, encoding=None, data=None, headers={}, query={},
 915             expected_status=None):
 916         """
 917         Return a tuple (xml as an compat_etree_Element, URL handle).
 918
 919         See _download_webpage docstring for arguments specification.
 920         """
 921         res = self._download_webpage_handle(
 922             url_or_request, video_id, note, errnote, fatal=fatal,
 923             encoding=encoding, data=data, headers=headers, query=query,
 924             expected_status=expected_status)
 925         if res is False:
 926             return res
 927         xml_string, urlh = res
 928         return self._parse_xml(
 929             xml_string, video_id, transform_source=transform_source,
 930             fatal=fatal), urlh
 931
 932     def _download_xml(
 933             self, url_or_request, video_id,
 934             note='Downloading XML', errnote='Unable to download XML',
 935             transform_source=None, fatal=True, encoding=None,
 936             data=None, headers={}, query={}, expected_status=None):
 937         """
 938         Return the xml as an compat_etree_Element.
 939
 940         See _download_webpage docstring for arguments specification.
 941         """
 942         res = self._download_xml_handle(
 943             url_or_request, video_id, note=note, errnote=errnote,
 944             transform_source=transform_source, fatal=fatal, encoding=encoding,
 945             data=data, headers=headers, query=query,
 946             expected_status=expected_status)
 947         return res if res is False else res[0]
 948
 949     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 950         if transform_source:
 951             xml_string = transform_source(xml_string)
 952         try:
 953             return compat_etree_fromstring(xml_string.encode('utf-8'))
 954         except compat_xml_parse_error as ve:
 955             errmsg = '%s: Failed to parse XML ' % video_id
 956             if fatal:
 957                 raise ExtractorError(errmsg, cause=ve)
 958             else:
 959                 self.report_warning(errmsg + str(ve))
 960
 961     def _download_json_handle(
 962             self, url_or_request, video_id, note='Downloading JSON metadata',
 963             errnote='Unable to download JSON metadata', transform_source=None,
 964             fatal=True, encoding=None, data=None, headers={}, query={},
 965             expected_status=None):
 966         """
 967         Return a tuple (JSON object, URL handle).
 968
 969         See _download_webpage docstring for arguments specification.
 970         """
 971         res = self._download_webpage_handle(
 972             url_or_request, video_id, note, errnote, fatal=fatal,
 973             encoding=encoding, data=data, headers=headers, query=query,
 974             expected_status=expected_status)
 975         if res is False:
 976             return res
 977         json_string, urlh = res
 978         return self._parse_json(
 979             json_string, video_id, transform_source=transform_source,
 980             fatal=fatal), urlh
 981
 982     def _download_json(
 983             self, url_or_request, video_id, note='Downloading JSON metadata',
 984             errnote='Unable to download JSON metadata', transform_source=None,
 985             fatal=True, encoding=None, data=None, headers={}, query={},
 986             expected_status=None):
 987         """
 988         Return the JSON object as a dict.
 989
 990         See _download_webpage docstring for arguments specification.
 991         """
 992         res = self._download_json_handle(
 993             url_or_request, video_id, note=note, errnote=errnote,
 994             transform_source=transform_source, fatal=fatal, encoding=encoding,
 995             data=data, headers=headers, query=query,
 996             expected_status=expected_status)
 997         return res if res is False else res[0]
 998
 999     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
1000         if transform_source:
1001             json_string = transform_source(json_string)
1002         try:
1003             return json.loads(json_string)
1004         except ValueError as ve:
1005             errmsg = '%s: Failed to parse JSON ' % video_id
1006             if fatal:
1007                 raise ExtractorError(errmsg, cause=ve)
1008             else:
1009                 self.report_warning(errmsg + str(ve))
1010
1011     def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
1012         return self._parse_json(
1013             data[data.find('{'):data.rfind('}') + 1],
1014             video_id, transform_source, fatal)
1015
1016     def _download_socket_json_handle(
1017             self, url_or_request, video_id, note='Polling socket',
1018             errnote='Unable to poll socket', transform_source=None,
1019             fatal=True, encoding=None, data=None, headers={}, query={},
1020             expected_status=None):
1021         """
1022         Return a tuple (JSON object, URL handle).
1023
1024         See _download_webpage docstring for arguments specification.
1025         """
1026         res = self._download_webpage_handle(
1027             url_or_request, video_id, note, errnote, fatal=fatal,
1028             encoding=encoding, data=data, headers=headers, query=query,
1029             expected_status=expected_status)
1030         if res is False:
1031             return res
1032         webpage, urlh = res
1033         return self._parse_socket_response_as_json(
1034             webpage, video_id, transform_source=transform_source,
1035             fatal=fatal), urlh
1036
1037     def _download_socket_json(
1038             self, url_or_request, video_id, note='Polling socket',
1039             errnote='Unable to poll socket', transform_source=None,
1040             fatal=True, encoding=None, data=None, headers={}, query={},
1041             expected_status=None):
1042         """
1043         Return the JSON object as a dict.
1044
1045         See _download_webpage docstring for arguments specification.
1046         """
1047         res = self._download_socket_json_handle(
1048             url_or_request, video_id, note=note, errnote=errnote,
1049             transform_source=transform_source, fatal=fatal, encoding=encoding,
1050             data=data, headers=headers, query=query,
1051             expected_status=expected_status)
1052         return res if res is False else res[0]
1053
1054     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1055         idstr = format_field(video_id, template='%s: ')
1056         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1057         if only_once:
1058             if f'WARNING: {msg}' in self._printed_messages:
1059                 return
1060             self._printed_messages.add(f'WARNING: {msg}')
1061         self._downloader.report_warning(msg, *args, **kwargs)
1062
1063     def to_screen(self, msg, *args, **kwargs):
1064         """Print msg to screen, prefixing it with '[ie_name]'"""
1065         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1066
1067     def write_debug(self, msg, *args, **kwargs):
1068         self._downloader.write_debug('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1069
1070     def get_param(self, name, default=None, *args, **kwargs):
1071         if self._downloader:
1072             return self._downloader.params.get(name, default, *args, **kwargs)
1073         return default
1074
1075     def report_drm(self, video_id, partial=False):
1076         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1077
1078     def report_extraction(self, id_or_name):
1079         """Report information extraction."""
1080         self.to_screen('%s: Extracting information' % id_or_name)
1081
1082     def report_download_webpage(self, video_id):
1083         """Report webpage download."""
1084         self.to_screen('%s: Downloading webpage' % video_id)
1085
1086     def report_age_confirmation(self):
1087         """Report attempt to confirm age."""
1088         self.to_screen('Confirming age')
1089
1090     def report_login(self):
1091         """Report attempt to log in."""
1092         self.to_screen('Logging in')
1093
1094     def raise_login_required(
1095             self, msg='This video is only available for registered users',
1096             metadata_available=False, method='any'):
1097         if metadata_available and (
1098                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1099             self.report_warning(msg)
1100         if method is not None:
1101             msg = '%s. %s' % (msg, self._LOGIN_HINTS[method])
1102         raise ExtractorError(msg, expected=True)
1103
1104     def raise_geo_restricted(
1105             self, msg='This video is not available from your location due to geo restriction',
1106             countries=None, metadata_available=False):
1107         if metadata_available and (
1108                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1109             self.report_warning(msg)
1110         else:
1111             raise GeoRestrictedError(msg, countries=countries)
1112
1113     def raise_no_formats(self, msg, expected=False, video_id=None):
1114         if expected and (
1115                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1116             self.report_warning(msg, video_id)
1117         elif isinstance(msg, ExtractorError):
1118             raise msg
1119         else:
1120             raise ExtractorError(msg, expected=expected, video_id=video_id)
1121
1122     # Methods for following #608
1123     @staticmethod
1124     def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1125         """Returns a URL that points to a page that should be processed"""
1126         if ie is not None:
1127             kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1128         if video_id is not None:
1129             kwargs['id'] = video_id
1130         if video_title is not None:
1131             kwargs['title'] = video_title
1132         return {
1133             **kwargs,
1134             '_type': 'url_transparent' if url_transparent else 'url',
1135             'url': url,
1136         }
1137
1138     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None, **kwargs):
1139         urls = (self.url_result(self._proto_relative_url(m), ie)
1140                 for m in orderedSet(map(getter, matches) if getter else matches))
1141         return self.playlist_result(urls, playlist_id, playlist_title, **kwargs)
1142
1143     @staticmethod
1144     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1145         """Returns a playlist"""
1146         if playlist_id:
1147             kwargs['id'] = playlist_id
1148         if playlist_title:
1149             kwargs['title'] = playlist_title
1150         if playlist_description is not None:
1151             kwargs['description'] = playlist_description
1152         return {
1153             **kwargs,
1154             '_type': 'multi_video' if multi_video else 'playlist',
1155             'entries': entries,
1156         }
1157
1158     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1159         """
1160         Perform a regex search on the given string, using a single or a list of
1161         patterns returning the first matching group.
1162         In case of failure return a default value or raise a WARNING or a
1163         RegexNotFoundError, depending on fatal, specifying the field name.
1164         """
1165         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
1166             mobj = re.search(pattern, string, flags)
1167         else:
1168             for p in pattern:
1169                 mobj = re.search(p, string, flags)
1170                 if mobj:
1171                     break
1172
1173         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1174
1175         if mobj:
1176             if group is None:
1177                 # return the first matching group
1178                 return next(g for g in mobj.groups() if g is not None)
1179             elif isinstance(group, (list, tuple)):
1180                 return tuple(mobj.group(g) for g in group)
1181             else:
1182                 return mobj.group(group)
1183         elif default is not NO_DEFAULT:
1184             return default
1185         elif fatal:
1186             raise RegexNotFoundError('Unable to extract %s' % _name)
1187         else:
1188             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1189             return None
1190
1191     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1192         """
1193         Like _search_regex, but strips HTML tags and unescapes entities.
1194         """
1195         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1196         if res:
1197             return clean_html(res).strip()
1198         else:
1199             return res
1200
1201     def _get_netrc_login_info(self, netrc_machine=None):
1202         username = None
1203         password = None
1204         netrc_machine = netrc_machine or self._NETRC_MACHINE
1205
1206         if self.get_param('usenetrc', False):
1207             try:
1208                 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1209                 if os.path.isdir(netrc_file):
1210                     netrc_file = os.path.join(netrc_file, '.netrc')
1211                 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1212                 if info is not None:
1213                     username = info[0]
1214                     password = info[2]
1215                 else:
1216                     raise netrc.NetrcParseError(
1217                         'No authenticators for %s' % netrc_machine)
1218             except (IOError, netrc.NetrcParseError) as err:
1219                 self.report_warning(
1220                     'parsing .netrc: %s' % error_to_compat_str(err))
1221
1222         return username, password
1223
1224     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1225         """
1226         Get the login info as (username, password)
1227         First look for the manually specified credentials using username_option
1228         and password_option as keys in params dictionary. If no such credentials
1229         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1230         value.
1231         If there's no info available, return (None, None)
1232         """
1233
1234         # Attempt to use provided username and password or .netrc data
1235         username = self.get_param(username_option)
1236         if username is not None:
1237             password = self.get_param(password_option)
1238         else:
1239             username, password = self._get_netrc_login_info(netrc_machine)
1240
1241         return username, password
1242
1243     def _get_tfa_info(self, note='two-factor verification code'):
1244         """
1245         Get the two-factor authentication info
1246         TODO - asking the user will be required for sms/phone verify
1247         currently just uses the command line option
1248         If there's no info available, return None
1249         """
1250
1251         tfa = self.get_param('twofactor')
1252         if tfa is not None:
1253             return tfa
1254
1255         return compat_getpass('Type %s and press [Return]: ' % note)
1256
1257     # Helper functions for extracting OpenGraph info
1258     @staticmethod
1259     def _og_regexes(prop):
1260         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1261         property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
1262                        % {'prop': re.escape(prop)})
1263         template = r'<meta[^>]+?%s[^>]+?%s'
1264         return [
1265             template % (property_re, content_re),
1266             template % (content_re, property_re),
1267         ]
1268
1269     @staticmethod
1270     def _meta_regex(prop):
1271         return r'''(?isx)<meta
1272                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1273                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1274
1275     def _og_search_property(self, prop, html, name=None, **kargs):
1276         prop = variadic(prop)
1277         if name is None:
1278             name = 'OpenGraph %s' % prop[0]
1279         og_regexes = []
1280         for p in prop:
1281             og_regexes.extend(self._og_regexes(p))
1282         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1283         if escaped is None:
1284             return None
1285         return unescapeHTML(escaped)
1286
1287     def _og_search_thumbnail(self, html, **kargs):
1288         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1289
1290     def _og_search_description(self, html, **kargs):
1291         return self._og_search_property('description', html, fatal=False, **kargs)
1292
1293     def _og_search_title(self, html, **kargs):
1294         kargs.setdefault('fatal', False)
1295         return self._og_search_property('title', html, **kargs)
1296
1297     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1298         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1299         if secure:
1300             regexes = self._og_regexes('video:secure_url') + regexes
1301         return self._html_search_regex(regexes, html, name, **kargs)
1302
1303     def _og_search_url(self, html, **kargs):
1304         return self._og_search_property('url', html, **kargs)
1305
1306     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1307         name = variadic(name)
1308         if display_name is None:
1309             display_name = name[0]
1310         return self._html_search_regex(
1311             [self._meta_regex(n) for n in name],
1312             html, display_name, fatal=fatal, group='content', **kwargs)
1313
1314     def _dc_search_uploader(self, html):
1315         return self._html_search_meta('dc.creator', html, 'uploader')
1316
1317     def _rta_search(self, html):
1318         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1319         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1320                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1321                      html):
1322             return 18
1323         return 0
1324
1325     def _media_rating_search(self, html):
1326         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1327         rating = self._html_search_meta('rating', html)
1328
1329         if not rating:
1330             return None
1331
1332         RATING_TABLE = {
1333             'safe for kids': 0,
1334             'general': 8,
1335             '14 years': 14,
1336             'mature': 17,
1337             'restricted': 19,
1338         }
1339         return RATING_TABLE.get(rating.lower())
1340
1341     def _family_friendly_search(self, html):
1342         # See http://schema.org/VideoObject
1343         family_friendly = self._html_search_meta(
1344             'isFamilyFriendly', html, default=None)
1345
1346         if not family_friendly:
1347             return None
1348
1349         RATING_TABLE = {
1350             '1': 0,
1351             'true': 0,
1352             '0': 18,
1353             'false': 18,
1354         }
1355         return RATING_TABLE.get(family_friendly.lower())
1356
1357     def _twitter_search_player(self, html):
1358         return self._html_search_meta('twitter:player', html,
1359                                       'twitter card player')
1360
1361     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1362         json_ld_list = list(re.finditer(JSON_LD_RE, html))
1363         default = kwargs.get('default', NO_DEFAULT)
1364         # JSON-LD may be malformed and thus `fatal` should be respected.
1365         # At the same time `default` may be passed that assumes `fatal=False`
1366         # for _search_regex. Let's simulate the same behavior here as well.
1367         fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
1368         json_ld = []
1369         for mobj in json_ld_list:
1370             json_ld_item = self._parse_json(
1371                 mobj.group('json_ld'), video_id, fatal=fatal)
1372             if not json_ld_item:
1373                 continue
1374             if isinstance(json_ld_item, dict):
1375                 json_ld.append(json_ld_item)
1376             elif isinstance(json_ld_item, (list, tuple)):
1377                 json_ld.extend(json_ld_item)
1378         if json_ld:
1379             json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1380         if json_ld:
1381             return json_ld
1382         if default is not NO_DEFAULT:
1383             return default
1384         elif fatal:
1385             raise RegexNotFoundError('Unable to extract JSON-LD')
1386         else:
1387             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1388             return {}
1389
1390     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1391         if isinstance(json_ld, compat_str):
1392             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1393         if not json_ld:
1394             return {}
1395         info = {}
1396         if not isinstance(json_ld, (list, tuple, dict)):
1397             return info
1398         if isinstance(json_ld, dict):
1399             json_ld = [json_ld]
1400
1401         INTERACTION_TYPE_MAP = {
1402             'CommentAction': 'comment',
1403             'AgreeAction': 'like',
1404             'DisagreeAction': 'dislike',
1405             'LikeAction': 'like',
1406             'DislikeAction': 'dislike',
1407             'ListenAction': 'view',
1408             'WatchAction': 'view',
1409             'ViewAction': 'view',
1410         }
1411
1412         def extract_interaction_type(e):
1413             interaction_type = e.get('interactionType')
1414             if isinstance(interaction_type, dict):
1415                 interaction_type = interaction_type.get('@type')
1416             return str_or_none(interaction_type)
1417
1418         def extract_interaction_statistic(e):
1419             interaction_statistic = e.get('interactionStatistic')
1420             if isinstance(interaction_statistic, dict):
1421                 interaction_statistic = [interaction_statistic]
1422             if not isinstance(interaction_statistic, list):
1423                 return
1424             for is_e in interaction_statistic:
1425                 if not isinstance(is_e, dict):
1426                     continue
1427                 if is_e.get('@type') != 'InteractionCounter':
1428                     continue
1429                 interaction_type = extract_interaction_type(is_e)
1430                 if not interaction_type:
1431                     continue
1432                 # For interaction count some sites provide string instead of
1433                 # an integer (as per spec) with non digit characters (e.g. ",")
1434                 # so extracting count with more relaxed str_to_int
1435                 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1436                 if interaction_count is None:
1437                     continue
1438                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1439                 if not count_kind:
1440                     continue
1441                 count_key = '%s_count' % count_kind
1442                 if info.get(count_key) is not None:
1443                     continue
1444                 info[count_key] = interaction_count
1445
1446         def extract_chapter_information(e):
1447             chapters = [{
1448                 'title': part.get('name'),
1449                 'start_time': part.get('startOffset'),
1450                 'end_time': part.get('endOffset'),
1451             } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
1452             for idx, (last_c, current_c, next_c) in enumerate(zip(
1453                     [{'end_time': 0}] + chapters, chapters, chapters[1:])):
1454                 current_c['end_time'] = current_c['end_time'] or next_c['start_time']
1455                 current_c['start_time'] = current_c['start_time'] or last_c['end_time']
1456                 if None in current_c.values():
1457                     self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
1458                     return
1459             if chapters:
1460                 chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
1461                 info['chapters'] = chapters
1462
1463         def extract_video_object(e):
1464             assert e['@type'] == 'VideoObject'
1465             author = e.get('author')
1466             info.update({
1467                 'url': url_or_none(e.get('contentUrl')),
1468                 'title': unescapeHTML(e.get('name')),
1469                 'description': unescapeHTML(e.get('description')),
1470                 'thumbnails': [{'url': url_or_none(url)}
1471                                for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))],
1472                 'duration': parse_duration(e.get('duration')),
1473                 'timestamp': unified_timestamp(e.get('uploadDate')),
1474                 # author can be an instance of 'Organization' or 'Person' types.
1475                 # both types can have 'name' property(inherited from 'Thing' type). [1]
1476                 # however some websites are using 'Text' type instead.
1477                 # 1. https://schema.org/VideoObject
1478                 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
1479                 'filesize': float_or_none(e.get('contentSize')),
1480                 'tbr': int_or_none(e.get('bitrate')),
1481                 'width': int_or_none(e.get('width')),
1482                 'height': int_or_none(e.get('height')),
1483                 'view_count': int_or_none(e.get('interactionCount')),
1484             })
1485             extract_interaction_statistic(e)
1486             extract_chapter_information(e)
1487
1488         def traverse_json_ld(json_ld, at_top_level=True):
1489             for e in json_ld:
1490                 if at_top_level and '@context' not in e:
1491                     continue
1492                 if at_top_level and set(e.keys()) == {'@context', '@graph'}:
1493                     traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
1494                     break
1495                 item_type = e.get('@type')
1496                 if expected_type is not None and expected_type != item_type:
1497                     continue
1498                 rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
1499                 if rating is not None:
1500                     info['average_rating'] = rating
1501                 if item_type in ('TVEpisode', 'Episode'):
1502                     episode_name = unescapeHTML(e.get('name'))
1503                     info.update({
1504                         'episode': episode_name,
1505                         'episode_number': int_or_none(e.get('episodeNumber')),
1506                         'description': unescapeHTML(e.get('description')),
1507                     })
1508                     if not info.get('title') and episode_name:
1509                         info['title'] = episode_name
1510                     part_of_season = e.get('partOfSeason')
1511                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1512                         info.update({
1513                             'season': unescapeHTML(part_of_season.get('name')),
1514                             'season_number': int_or_none(part_of_season.get('seasonNumber')),
1515                         })
1516                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1517                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1518                         info['series'] = unescapeHTML(part_of_series.get('name'))
1519                 elif item_type == 'Movie':
1520                     info.update({
1521                         'title': unescapeHTML(e.get('name')),
1522                         'description': unescapeHTML(e.get('description')),
1523                         'duration': parse_duration(e.get('duration')),
1524                         'timestamp': unified_timestamp(e.get('dateCreated')),
1525                     })
1526                 elif item_type in ('Article', 'NewsArticle'):
1527                     info.update({
1528                         'timestamp': parse_iso8601(e.get('datePublished')),
1529                         'title': unescapeHTML(e.get('headline')),
1530                         'description': unescapeHTML(e.get('articleBody') or e.get('description')),
1531                     })
1532                     if traverse_obj(e, ('video', 0, '@type')) == 'VideoObject':
1533                         extract_video_object(e['video'][0])
1534                 elif item_type == 'VideoObject':
1535                     extract_video_object(e)
1536                     if expected_type is None:
1537                         continue
1538                     else:
1539                         break
1540                 video = e.get('video')
1541                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1542                     extract_video_object(video)
1543                 if expected_type is None:
1544                     continue
1545                 else:
1546                     break
1547         traverse_json_ld(json_ld)
1548
1549         return dict((k, v) for k, v in info.items() if v is not None)
1550
1551     def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1552         return self._parse_json(
1553             self._search_regex(
1554                 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1555                 webpage, 'next.js data', fatal=fatal, **kw),
1556             video_id, transform_source=transform_source, fatal=fatal)
1557
1558     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__'):
1559         ''' Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function. '''
1560         # not all website do this, but it can be changed
1561         # https://stackoverflow.com/questions/67463109/how-to-change-or-hide-nuxt-and-nuxt-keyword-in-page-source
1562         rectx = re.escape(context_name)
1563         js, arg_keys, arg_vals = self._search_regex(
1564             (r'<script>window\.%s=\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.+?)\)\);?</script>' % rectx,
1565              r'%s\(.*?\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)' % rectx),
1566             webpage, context_name, group=['js', 'arg_keys', 'arg_vals'])
1567
1568         args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
1569
1570         for key, val in args.items():
1571             if val in ('undefined', 'void 0'):
1572                 args[key] = 'null'
1573
1574         return self._parse_json(js_to_json(js, args), video_id)['data'][0]
1575
1576     @staticmethod
1577     def _hidden_inputs(html):
1578         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1579         hidden_inputs = {}
1580         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1581             attrs = extract_attributes(input)
1582             if not input:
1583                 continue
1584             if attrs.get('type') not in ('hidden', 'submit'):
1585                 continue
1586             name = attrs.get('name') or attrs.get('id')
1587             value = attrs.get('value')
1588             if name and value is not None:
1589                 hidden_inputs[name] = value
1590         return hidden_inputs
1591
1592     def _form_hidden_inputs(self, form_id, html):
1593         form = self._search_regex(
1594             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1595             html, '%s form' % form_id, group='form')
1596         return self._hidden_inputs(form)
1597
1598     class FormatSort:
1599         regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
1600
1601         default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
1602                    'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
1603                    'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
1604         ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
1605                         'height', 'width', 'proto', 'vext', 'abr', 'aext',
1606                         'fps', 'fs_approx', 'source', 'id')
1607
1608         settings = {
1609             'vcodec': {'type': 'ordered', 'regex': True,
1610                        'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1611             'acodec': {'type': 'ordered', 'regex': True,
1612                        'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
1613             'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
1614                     'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
1615             'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1616                       'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
1617             'vext': {'type': 'ordered', 'field': 'video_ext',
1618                      'order': ('mp4', 'webm', 'flv', '', 'none'),
1619                      'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1620             'aext': {'type': 'ordered', 'field': 'audio_ext',
1621                      'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1622                      'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1623             'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1624             'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
1625                            'field': ('vcodec', 'acodec'),
1626                            'function': lambda it: int(any(v != 'none' for v in it))},
1627             'ie_pref': {'priority': True, 'type': 'extractor'},
1628             'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1629             'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1630             'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
1631             'quality': {'convert': 'float', 'default': -1},
1632             'filesize': {'convert': 'bytes'},
1633             'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1634             'id': {'convert': 'string', 'field': 'format_id'},
1635             'height': {'convert': 'float_none'},
1636             'width': {'convert': 'float_none'},
1637             'fps': {'convert': 'float_none'},
1638             'tbr': {'convert': 'float_none'},
1639             'vbr': {'convert': 'float_none'},
1640             'abr': {'convert': 'float_none'},
1641             'asr': {'convert': 'float_none'},
1642             'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
1643
1644             'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1645             'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1646             'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1647             'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1648             'res': {'type': 'multiple', 'field': ('height', 'width'),
1649                     'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
1650
1651             # For compatibility with youtube-dl
1652             'format_id': {'type': 'alias', 'field': 'id'},
1653             'preference': {'type': 'alias', 'field': 'ie_pref'},
1654             'language_preference': {'type': 'alias', 'field': 'lang'},
1655
1656             # Deprecated
1657             'dimension': {'type': 'alias', 'field': 'res'},
1658             'resolution': {'type': 'alias', 'field': 'res'},
1659             'extension': {'type': 'alias', 'field': 'ext'},
1660             'bitrate': {'type': 'alias', 'field': 'br'},
1661             'total_bitrate': {'type': 'alias', 'field': 'tbr'},
1662             'video_bitrate': {'type': 'alias', 'field': 'vbr'},
1663             'audio_bitrate': {'type': 'alias', 'field': 'abr'},
1664             'framerate': {'type': 'alias', 'field': 'fps'},
1665             'protocol': {'type': 'alias', 'field': 'proto'},
1666             'source_preference': {'type': 'alias', 'field': 'source'},
1667             'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1668             'filesize_estimate': {'type': 'alias', 'field': 'size'},
1669             'samplerate': {'type': 'alias', 'field': 'asr'},
1670             'video_ext': {'type': 'alias', 'field': 'vext'},
1671             'audio_ext': {'type': 'alias', 'field': 'aext'},
1672             'video_codec': {'type': 'alias', 'field': 'vcodec'},
1673             'audio_codec': {'type': 'alias', 'field': 'acodec'},
1674             'video': {'type': 'alias', 'field': 'hasvid'},
1675             'has_video': {'type': 'alias', 'field': 'hasvid'},
1676             'audio': {'type': 'alias', 'field': 'hasaud'},
1677             'has_audio': {'type': 'alias', 'field': 'hasaud'},
1678             'extractor': {'type': 'alias', 'field': 'ie_pref'},
1679             'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
1680         }
1681
1682         def __init__(self, ie, field_preference):
1683             self._order = []
1684             self.ydl = ie._downloader
1685             self.evaluate_params(self.ydl.params, field_preference)
1686             if ie.get_param('verbose'):
1687                 self.print_verbose_info(self.ydl.write_debug)
1688
1689         def _get_field_setting(self, field, key):
1690             if field not in self.settings:
1691                 if key in ('forced', 'priority'):
1692                     return False
1693                 self.ydl.deprecation_warning(
1694                     f'Using arbitrary fields ({field}) for format sorting is deprecated '
1695                     'and may be removed in a future version')
1696                 self.settings[field] = {}
1697             propObj = self.settings[field]
1698             if key not in propObj:
1699                 type = propObj.get('type')
1700                 if key == 'field':
1701                     default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1702                 elif key == 'convert':
1703                     default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1704                 else:
1705                     default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1706                 propObj[key] = default
1707             return propObj[key]
1708
1709         def _resolve_field_value(self, field, value, convertNone=False):
1710             if value is None:
1711                 if not convertNone:
1712                     return None
1713             else:
1714                 value = value.lower()
1715             conversion = self._get_field_setting(field, 'convert')
1716             if conversion == 'ignore':
1717                 return None
1718             if conversion == 'string':
1719                 return value
1720             elif conversion == 'float_none':
1721                 return float_or_none(value)
1722             elif conversion == 'bytes':
1723                 return FileDownloader.parse_bytes(value)
1724             elif conversion == 'order':
1725                 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1726                 use_regex = self._get_field_setting(field, 'regex')
1727                 list_length = len(order_list)
1728                 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1729                 if use_regex and value is not None:
1730                     for i, regex in enumerate(order_list):
1731                         if regex and re.match(regex, value):
1732                             return list_length - i
1733                     return list_length - empty_pos  # not in list
1734                 else:  # not regex or  value = None
1735                     return list_length - (order_list.index(value) if value in order_list else empty_pos)
1736             else:
1737                 if value.isnumeric():
1738                     return float(value)
1739                 else:
1740                     self.settings[field]['convert'] = 'string'
1741                     return value
1742
1743         def evaluate_params(self, params, sort_extractor):
1744             self._use_free_order = params.get('prefer_free_formats', False)
1745             self._sort_user = params.get('format_sort', [])
1746             self._sort_extractor = sort_extractor
1747
1748             def add_item(field, reverse, closest, limit_text):
1749                 field = field.lower()
1750                 if field in self._order:
1751                     return
1752                 self._order.append(field)
1753                 limit = self._resolve_field_value(field, limit_text)
1754                 data = {
1755                     'reverse': reverse,
1756                     'closest': False if limit is None else closest,
1757                     'limit_text': limit_text,
1758                     'limit': limit}
1759                 if field in self.settings:
1760                     self.settings[field].update(data)
1761                 else:
1762                     self.settings[field] = data
1763
1764             sort_list = (
1765                 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1766                 + (tuple() if params.get('format_sort_force', False)
1767                    else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1768                 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1769
1770             for item in sort_list:
1771                 match = re.match(self.regex, item)
1772                 if match is None:
1773                     raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1774                 field = match.group('field')
1775                 if field is None:
1776                     continue
1777                 if self._get_field_setting(field, 'type') == 'alias':
1778                     alias, field = field, self._get_field_setting(field, 'field')
1779                     if alias not in ('format_id', 'preference', 'language_preference'):
1780                         self.ydl.deprecation_warning(
1781                             f'Format sorting alias {alias} is deprecated '
1782                             f'and may be removed in a future version. Please use {field} instead')
1783                 reverse = match.group('reverse') is not None
1784                 closest = match.group('separator') == '~'
1785                 limit_text = match.group('limit')
1786
1787                 has_limit = limit_text is not None
1788                 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1789                 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1790
1791                 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1792                 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
1793                 limit_count = len(limits)
1794                 for (i, f) in enumerate(fields):
1795                     add_item(f, reverse, closest,
1796                              limits[i] if i < limit_count
1797                              else limits[0] if has_limit and not has_multiple_limits
1798                              else None)
1799
1800         def print_verbose_info(self, write_debug):
1801             if self._sort_user:
1802                 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1803             if self._sort_extractor:
1804                 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1805             write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1806                 '+' if self._get_field_setting(field, 'reverse') else '', field,
1807                 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1808                               self._get_field_setting(field, 'limit_text'),
1809                               self._get_field_setting(field, 'limit'))
1810                 if self._get_field_setting(field, 'limit_text') is not None else '')
1811                 for field in self._order if self._get_field_setting(field, 'visible')]))
1812
1813         def _calculate_field_preference_from_value(self, format, field, type, value):
1814             reverse = self._get_field_setting(field, 'reverse')
1815             closest = self._get_field_setting(field, 'closest')
1816             limit = self._get_field_setting(field, 'limit')
1817
1818             if type == 'extractor':
1819                 maximum = self._get_field_setting(field, 'max')
1820                 if value is None or (maximum is not None and value >= maximum):
1821                     value = -1
1822             elif type == 'boolean':
1823                 in_list = self._get_field_setting(field, 'in_list')
1824                 not_in_list = self._get_field_setting(field, 'not_in_list')
1825                 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1826             elif type == 'ordered':
1827                 value = self._resolve_field_value(field, value, True)
1828
1829             # try to convert to number
1830             val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1831             is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1832             if is_num:
1833                 value = val_num
1834
1835             return ((-10, 0) if value is None
1836                     else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
1837                     else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1838                     else (0, value, 0) if not reverse and (limit is None or value <= limit)
1839                     else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1840                     else (-1, value, 0))
1841
1842         def _calculate_field_preference(self, format, field):
1843             type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
1844             get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1845             if type == 'multiple':
1846                 type = 'field'  # Only 'field' is allowed in multiple for now
1847                 actual_fields = self._get_field_setting(field, 'field')
1848
1849                 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1850             else:
1851                 value = get_value(field)
1852             return self._calculate_field_preference_from_value(format, field, type, value)
1853
1854         def calculate_preference(self, format):
1855             # Determine missing protocol
1856             if not format.get('protocol'):
1857                 format['protocol'] = determine_protocol(format)
1858
1859             # Determine missing ext
1860             if not format.get('ext') and 'url' in format:
1861                 format['ext'] = determine_ext(format['url'])
1862             if format.get('vcodec') == 'none':
1863                 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1864                 format['video_ext'] = 'none'
1865             else:
1866                 format['video_ext'] = format['ext']
1867                 format['audio_ext'] = 'none'
1868             # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
1869             #    format['preference'] = -1000
1870
1871             # Determine missing bitrates
1872             if format.get('tbr') is None:
1873                 if format.get('vbr') is not None and format.get('abr') is not None:
1874                     format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1875             else:
1876                 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1877                     format['vbr'] = format.get('tbr') - format.get('abr', 0)
1878                 if format.get('acodec') != 'none' and format.get('abr') is None:
1879                     format['abr'] = format.get('tbr') - format.get('vbr', 0)
1880
1881             return tuple(self._calculate_field_preference(format, field) for field in self._order)
1882
1883     def _sort_formats(self, formats, field_preference=[]):
1884         if not formats:
1885             return
1886         format_sort = self.FormatSort(self, field_preference)
1887         formats.sort(key=lambda f: format_sort.calculate_preference(f))
1888
1889     def _check_formats(self, formats, video_id):
1890         if formats:
1891             formats[:] = filter(
1892                 lambda f: self._is_valid_url(
1893                     f['url'], video_id,
1894                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1895                 formats)
1896
1897     @staticmethod
1898     def _remove_duplicate_formats(formats):
1899         format_urls = set()
1900         unique_formats = []
1901         for f in formats:
1902             if f['url'] not in format_urls:
1903                 format_urls.add(f['url'])
1904                 unique_formats.append(f)
1905         formats[:] = unique_formats
1906
1907     def _is_valid_url(self, url, video_id, item='video', headers={}):
1908         url = self._proto_relative_url(url, scheme='http:')
1909         # For now assume non HTTP(S) URLs always valid
1910         if not (url.startswith('http://') or url.startswith('https://')):
1911             return True
1912         try:
1913             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1914             return True
1915         except ExtractorError as e:
1916             self.to_screen(
1917                 '%s: %s URL is invalid, skipping: %s'
1918                 % (video_id, item, error_to_compat_str(e.cause)))
1919             return False
1920
1921     def http_scheme(self):
1922         """ Either "http:" or "https:", depending on the user's preferences """
1923         return (
1924             'http:'
1925             if self.get_param('prefer_insecure', False)
1926             else 'https:')
1927
1928     def _proto_relative_url(self, url, scheme=None):
1929         if url is None:
1930             return url
1931         if url.startswith('//'):
1932             if scheme is None:
1933                 scheme = self.http_scheme()
1934             return scheme + url
1935         else:
1936             return url
1937
1938     def _sleep(self, timeout, video_id, msg_template=None):
1939         if msg_template is None:
1940             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1941         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1942         self.to_screen(msg)
1943         time.sleep(timeout)
1944
1945     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1946                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1947                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1948         manifest = self._download_xml(
1949             manifest_url, video_id, 'Downloading f4m manifest',
1950             'Unable to download f4m manifest',
1951             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1952             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1953             transform_source=transform_source,
1954             fatal=fatal, data=data, headers=headers, query=query)
1955
1956         if manifest is False:
1957             return []
1958
1959         return self._parse_f4m_formats(
1960             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1961             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1962
1963     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1964                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1965                            fatal=True, m3u8_id=None):
1966         if not isinstance(manifest, compat_etree_Element) and not fatal:
1967             return []
1968
1969         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1970         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1971         if akamai_pv is not None and ';' in akamai_pv.text:
1972             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1973             if playerVerificationChallenge.strip() != '':
1974                 return []
1975
1976         formats = []
1977         manifest_version = '1.0'
1978         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1979         if not media_nodes:
1980             manifest_version = '2.0'
1981             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1982         # Remove unsupported DRM protected media from final formats
1983         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1984         media_nodes = remove_encrypted_media(media_nodes)
1985         if not media_nodes:
1986             return formats
1987
1988         manifest_base_url = get_base_url(manifest)
1989
1990         bootstrap_info = xpath_element(
1991             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1992             'bootstrap info', default=None)
1993
1994         vcodec = None
1995         mime_type = xpath_text(
1996             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1997             'base URL', default=None)
1998         if mime_type and mime_type.startswith('audio/'):
1999             vcodec = 'none'
2000
2001         for i, media_el in enumerate(media_nodes):
2002             tbr = int_or_none(media_el.attrib.get('bitrate'))
2003             width = int_or_none(media_el.attrib.get('width'))
2004             height = int_or_none(media_el.attrib.get('height'))
2005             format_id = join_nonempty(f4m_id, tbr or i)
2006             # If <bootstrapInfo> is present, the specified f4m is a
2007             # stream-level manifest, and only set-level manifests may refer to
2008             # external resources.  See section 11.4 and section 4 of F4M spec
2009             if bootstrap_info is None:
2010                 media_url = None
2011                 # @href is introduced in 2.0, see section 11.6 of F4M spec
2012                 if manifest_version == '2.0':
2013                     media_url = media_el.attrib.get('href')
2014                 if media_url is None:
2015                     media_url = media_el.attrib.get('url')
2016                 if not media_url:
2017                     continue
2018                 manifest_url = (
2019                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
2020                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
2021                 # If media_url is itself a f4m manifest do the recursive extraction
2022                 # since bitrates in parent manifest (this one) and media_url manifest
2023                 # may differ leading to inability to resolve the format by requested
2024                 # bitrate in f4m downloader
2025                 ext = determine_ext(manifest_url)
2026                 if ext == 'f4m':
2027                     f4m_formats = self._extract_f4m_formats(
2028                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2029                         transform_source=transform_source, fatal=fatal)
2030                     # Sometimes stream-level manifest contains single media entry that
2031                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
2032                     # At the same time parent's media entry in set-level manifest may
2033                     # contain it. We will copy it from parent in such cases.
2034                     if len(f4m_formats) == 1:
2035                         f = f4m_formats[0]
2036                         f.update({
2037                             'tbr': f.get('tbr') or tbr,
2038                             'width': f.get('width') or width,
2039                             'height': f.get('height') or height,
2040                             'format_id': f.get('format_id') if not tbr else format_id,
2041                             'vcodec': vcodec,
2042                         })
2043                     formats.extend(f4m_formats)
2044                     continue
2045                 elif ext == 'm3u8':
2046                     formats.extend(self._extract_m3u8_formats(
2047                         manifest_url, video_id, 'mp4', preference=preference,
2048                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
2049                     continue
2050             formats.append({
2051                 'format_id': format_id,
2052                 'url': manifest_url,
2053                 'manifest_url': manifest_url,
2054                 'ext': 'flv' if bootstrap_info is not None else None,
2055                 'protocol': 'f4m',
2056                 'tbr': tbr,
2057                 'width': width,
2058                 'height': height,
2059                 'vcodec': vcodec,
2060                 'preference': preference,
2061                 'quality': quality,
2062             })
2063         return formats
2064
2065     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
2066         return {
2067             'format_id': join_nonempty(m3u8_id, 'meta'),
2068             'url': m3u8_url,
2069             'ext': ext,
2070             'protocol': 'm3u8',
2071             'preference': preference - 100 if preference else -100,
2072             'quality': quality,
2073             'resolution': 'multiple',
2074             'format_note': 'Quality selection URL',
2075         }
2076
2077     def _report_ignoring_subs(self, name):
2078         self.report_warning(bug_reports_message(
2079             f'Ignoring subtitle tracks found in the {name} manifest; '
2080             'if any subtitle tracks are missing,'
2081         ), only_once=True)
2082
2083     def _extract_m3u8_formats(self, *args, **kwargs):
2084         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2085         if subs:
2086             self._report_ignoring_subs('HLS')
2087         return fmts
2088
2089     def _extract_m3u8_formats_and_subtitles(
2090             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2091             preference=None, quality=None, m3u8_id=None, note=None,
2092             errnote=None, fatal=True, live=False, data=None, headers={},
2093             query={}):
2094
2095         res = self._download_webpage_handle(
2096             m3u8_url, video_id,
2097             note='Downloading m3u8 information' if note is None else note,
2098             errnote='Failed to download m3u8 information' if errnote is None else errnote,
2099             fatal=fatal, data=data, headers=headers, query=query)
2100
2101         if res is False:
2102             return [], {}
2103
2104         m3u8_doc, urlh = res
2105         m3u8_url = urlh.geturl()
2106
2107         return self._parse_m3u8_formats_and_subtitles(
2108             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2109             preference=preference, quality=quality, m3u8_id=m3u8_id,
2110             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2111             headers=headers, query=query, video_id=video_id)
2112
2113     def _parse_m3u8_formats_and_subtitles(
2114             self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
2115             preference=None, quality=None, m3u8_id=None, live=False, note=None,
2116             errnote=None, fatal=True, data=None, headers={}, query={},
2117             video_id=None):
2118         formats, subtitles = [], {}
2119
2120         has_drm = re.search('|'.join([
2121             r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
2122             r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
2123         ]), m3u8_doc)
2124
2125         def format_url(url):
2126             return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
2127
2128         if self.get_param('hls_split_discontinuity', False):
2129             def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2130                 if not m3u8_doc:
2131                     if not manifest_url:
2132                         return []
2133                     m3u8_doc = self._download_webpage(
2134                         manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2135                         note=False, errnote='Failed to download m3u8 playlist information')
2136                     if m3u8_doc is False:
2137                         return []
2138                 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2139
2140         else:
2141             def _extract_m3u8_playlist_indices(*args, **kwargs):
2142                 return [None]
2143
2144         # References:
2145         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2146         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2147         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2148
2149         # We should try extracting formats only from master playlists [1, 4.3.4],
2150         # i.e. playlists that describe available qualities. On the other hand
2151         # media playlists [1, 4.3.3] should be returned as is since they contain
2152         # just the media without qualities renditions.
2153         # Fortunately, master playlist can be easily distinguished from media
2154         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
2155         # master playlist tags MUST NOT appear in a media playlist and vice versa.
2156         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2157         # media playlist and MUST NOT appear in master playlist thus we can
2158         # clearly detect media playlist with this criterion.
2159
2160         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
2161             formats = [{
2162                 'format_id': join_nonempty(m3u8_id, idx),
2163                 'format_index': idx,
2164                 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
2165                 'ext': ext,
2166                 'protocol': entry_protocol,
2167                 'preference': preference,
2168                 'quality': quality,
2169                 'has_drm': has_drm,
2170             } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2171
2172             return formats, subtitles
2173
2174         groups = {}
2175         last_stream_inf = {}
2176
2177         def extract_media(x_media_line):
2178             media = parse_m3u8_attributes(x_media_line)
2179             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2180             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2181             if not (media_type and group_id and name):
2182                 return
2183             groups.setdefault(group_id, []).append(media)
2184             # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2185             if media_type == 'SUBTITLES':
2186                 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2187                 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2188                 # However, lack of URI has been spotted in the wild.
2189                 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2190                 if not media.get('URI'):
2191                     return
2192                 url = format_url(media['URI'])
2193                 sub_info = {
2194                     'url': url,
2195                     'ext': determine_ext(url),
2196                 }
2197                 if sub_info['ext'] == 'm3u8':
2198                     # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2199                     # files may contain is WebVTT:
2200                     # <https://tools.ietf.org/html/rfc8216#section-3.1>
2201                     sub_info['ext'] = 'vtt'
2202                     sub_info['protocol'] = 'm3u8_native'
2203                 lang = media.get('LANGUAGE') or 'und'
2204                 subtitles.setdefault(lang, []).append(sub_info)
2205             if media_type not in ('VIDEO', 'AUDIO'):
2206                 return
2207             media_url = media.get('URI')
2208             if media_url:
2209                 manifest_url = format_url(media_url)
2210                 formats.extend({
2211                     'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2212                     'format_note': name,
2213                     'format_index': idx,
2214                     'url': manifest_url,
2215                     'manifest_url': m3u8_url,
2216                     'language': media.get('LANGUAGE'),
2217                     'ext': ext,
2218                     'protocol': entry_protocol,
2219                     'preference': preference,
2220                     'quality': quality,
2221                     'vcodec': 'none' if media_type == 'AUDIO' else None,
2222                 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2223
2224         def build_stream_name():
2225             # Despite specification does not mention NAME attribute for
2226             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2227             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2228             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2229             stream_name = last_stream_inf.get('NAME')
2230             if stream_name:
2231                 return stream_name
2232             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2233             # from corresponding rendition group
2234             stream_group_id = last_stream_inf.get('VIDEO')
2235             if not stream_group_id:
2236                 return
2237             stream_group = groups.get(stream_group_id)
2238             if not stream_group:
2239                 return stream_group_id
2240             rendition = stream_group[0]
2241             return rendition.get('NAME') or stream_group_id
2242
2243         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2244         # chance to detect video only formats when EXT-X-STREAM-INF tags
2245         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2246         for line in m3u8_doc.splitlines():
2247             if line.startswith('#EXT-X-MEDIA:'):
2248                 extract_media(line)
2249
2250         for line in m3u8_doc.splitlines():
2251             if line.startswith('#EXT-X-STREAM-INF:'):
2252                 last_stream_inf = parse_m3u8_attributes(line)
2253             elif line.startswith('#') or not line.strip():
2254                 continue
2255             else:
2256                 tbr = float_or_none(
2257                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2258                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2259                 manifest_url = format_url(line.strip())
2260
2261                 for idx in _extract_m3u8_playlist_indices(manifest_url):
2262                     format_id = [m3u8_id, None, idx]
2263                     # Bandwidth of live streams may differ over time thus making
2264                     # format_id unpredictable. So it's better to keep provided
2265                     # format_id intact.
2266                     if not live:
2267                         stream_name = build_stream_name()
2268                         format_id[1] = stream_name or '%d' % (tbr or len(formats))
2269                     f = {
2270                         'format_id': join_nonempty(*format_id),
2271                         'format_index': idx,
2272                         'url': manifest_url,
2273                         'manifest_url': m3u8_url,
2274                         'tbr': tbr,
2275                         'ext': ext,
2276                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2277                         'protocol': entry_protocol,
2278                         'preference': preference,
2279                         'quality': quality,
2280                     }
2281                     resolution = last_stream_inf.get('RESOLUTION')
2282                     if resolution:
2283                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2284                         if mobj:
2285                             f['width'] = int(mobj.group('width'))
2286                             f['height'] = int(mobj.group('height'))
2287                     # Unified Streaming Platform
2288                     mobj = re.search(
2289                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2290                     if mobj:
2291                         abr, vbr = mobj.groups()
2292                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2293                         f.update({
2294                             'vbr': vbr,
2295                             'abr': abr,
2296                         })
2297                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2298                     f.update(codecs)
2299                     audio_group_id = last_stream_inf.get('AUDIO')
2300                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2301                     # references a rendition group MUST have a CODECS attribute.
2302                     # However, this is not always respected, for example, [2]
2303                     # contains EXT-X-STREAM-INF tag which references AUDIO
2304                     # rendition group but does not have CODECS and despite
2305                     # referencing an audio group it represents a complete
2306                     # (with audio and video) format. So, for such cases we will
2307                     # ignore references to rendition groups and treat them
2308                     # as complete formats.
2309                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2310                         audio_group = groups.get(audio_group_id)
2311                         if audio_group and audio_group[0].get('URI'):
2312                             # TODO: update acodec for audio only formats with
2313                             # the same GROUP-ID
2314                             f['acodec'] = 'none'
2315                     if not f.get('ext'):
2316                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2317                     formats.append(f)
2318
2319                     # for DailyMotion
2320                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2321                     if progressive_uri:
2322                         http_f = f.copy()
2323                         del http_f['manifest_url']
2324                         http_f.update({
2325                             'format_id': f['format_id'].replace('hls-', 'http-'),
2326                             'protocol': 'http',
2327                             'url': progressive_uri,
2328                         })
2329                         formats.append(http_f)
2330
2331                 last_stream_inf = {}
2332         return formats, subtitles
2333
2334     def _extract_m3u8_vod_duration(
2335             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2336
2337         m3u8_vod = self._download_webpage(
2338             m3u8_vod_url, video_id,
2339             note='Downloading m3u8 VOD manifest' if note is None else note,
2340             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2341             fatal=False, data=data, headers=headers, query=query)
2342
2343         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2344
2345     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2346         if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2347             return None
2348
2349         return int(sum(
2350             float(line[len('#EXTINF:'):].split(',')[0])
2351             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2352
2353     @staticmethod
2354     def _xpath_ns(path, namespace=None):
2355         if not namespace:
2356             return path
2357         out = []
2358         for c in path.split('/'):
2359             if not c or c == '.':
2360                 out.append(c)
2361             else:
2362                 out.append('{%s}%s' % (namespace, c))
2363         return '/'.join(out)
2364
2365     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2366         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2367
2368         if smil is False:
2369             assert not fatal
2370             return [], {}
2371
2372         namespace = self._parse_smil_namespace(smil)
2373
2374         fmts = self._parse_smil_formats(
2375             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2376         subs = self._parse_smil_subtitles(
2377             smil, namespace=namespace)
2378
2379         return fmts, subs
2380
2381     def _extract_smil_formats(self, *args, **kwargs):
2382         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2383         if subs:
2384             self._report_ignoring_subs('SMIL')
2385         return fmts
2386
2387     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2388         smil = self._download_smil(smil_url, video_id, fatal=fatal)
2389         if smil is False:
2390             return {}
2391         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2392
2393     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2394         return self._download_xml(
2395             smil_url, video_id, 'Downloading SMIL file',
2396             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2397
2398     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2399         namespace = self._parse_smil_namespace(smil)
2400
2401         formats = self._parse_smil_formats(
2402             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2403         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2404
2405         video_id = os.path.splitext(url_basename(smil_url))[0]
2406         title = None
2407         description = None
2408         upload_date = None
2409         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2410             name = meta.attrib.get('name')
2411             content = meta.attrib.get('content')
2412             if not name or not content:
2413                 continue
2414             if not title and name == 'title':
2415                 title = content
2416             elif not description and name in ('description', 'abstract'):
2417                 description = content
2418             elif not upload_date and name == 'date':
2419                 upload_date = unified_strdate(content)
2420
2421         thumbnails = [{
2422             'id': image.get('type'),
2423             'url': image.get('src'),
2424             'width': int_or_none(image.get('width')),
2425             'height': int_or_none(image.get('height')),
2426         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2427
2428         return {
2429             'id': video_id,
2430             'title': title or video_id,
2431             'description': description,
2432             'upload_date': upload_date,
2433             'thumbnails': thumbnails,
2434             'formats': formats,
2435             'subtitles': subtitles,
2436         }
2437
2438     def _parse_smil_namespace(self, smil):
2439         return self._search_regex(
2440             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2441
2442     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2443         base = smil_url
2444         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2445             b = meta.get('base') or meta.get('httpBase')
2446             if b:
2447                 base = b
2448                 break
2449
2450         formats = []
2451         rtmp_count = 0
2452         http_count = 0
2453         m3u8_count = 0
2454         imgs_count = 0
2455
2456         srcs = set()
2457         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2458         for medium in media:
2459             src = medium.get('src')
2460             if not src or src in srcs:
2461                 continue
2462             srcs.add(src)
2463
2464             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2465             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2466             width = int_or_none(medium.get('width'))
2467             height = int_or_none(medium.get('height'))
2468             proto = medium.get('proto')
2469             ext = medium.get('ext')
2470             src_ext = determine_ext(src)
2471             streamer = medium.get('streamer') or base
2472
2473             if proto == 'rtmp' or streamer.startswith('rtmp'):
2474                 rtmp_count += 1
2475                 formats.append({
2476                     'url': streamer,
2477                     'play_path': src,
2478                     'ext': 'flv',
2479                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2480                     'tbr': bitrate,
2481                     'filesize': filesize,
2482                     'width': width,
2483                     'height': height,
2484                 })
2485                 if transform_rtmp_url:
2486                     streamer, src = transform_rtmp_url(streamer, src)
2487                     formats[-1].update({
2488                         'url': streamer,
2489                         'play_path': src,
2490                     })
2491                 continue
2492
2493             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2494             src_url = src_url.strip()
2495
2496             if proto == 'm3u8' or src_ext == 'm3u8':
2497                 m3u8_formats = self._extract_m3u8_formats(
2498                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2499                 if len(m3u8_formats) == 1:
2500                     m3u8_count += 1
2501                     m3u8_formats[0].update({
2502                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2503                         'tbr': bitrate,
2504                         'width': width,
2505                         'height': height,
2506                     })
2507                 formats.extend(m3u8_formats)
2508             elif src_ext == 'f4m':
2509                 f4m_url = src_url
2510                 if not f4m_params:
2511                     f4m_params = {
2512                         'hdcore': '3.2.0',
2513                         'plugin': 'flowplayer-3.2.0.1',
2514                     }
2515                 f4m_url += '&' if '?' in f4m_url else '?'
2516                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2517                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2518             elif src_ext == 'mpd':
2519                 formats.extend(self._extract_mpd_formats(
2520                     src_url, video_id, mpd_id='dash', fatal=False))
2521             elif re.search(r'\.ism/[Mm]anifest', src_url):
2522                 formats.extend(self._extract_ism_formats(
2523                     src_url, video_id, ism_id='mss', fatal=False))
2524             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2525                 http_count += 1
2526                 formats.append({
2527                     'url': src_url,
2528                     'ext': ext or src_ext or 'flv',
2529                     'format_id': 'http-%d' % (bitrate or http_count),
2530                     'tbr': bitrate,
2531                     'filesize': filesize,
2532                     'width': width,
2533                     'height': height,
2534                 })
2535
2536         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2537             src = medium.get('src')
2538             if not src or src in srcs:
2539                 continue
2540             srcs.add(src)
2541
2542             imgs_count += 1
2543             formats.append({
2544                 'format_id': 'imagestream-%d' % (imgs_count),
2545                 'url': src,
2546                 'ext': mimetype2ext(medium.get('type')),
2547                 'acodec': 'none',
2548                 'vcodec': 'none',
2549                 'width': int_or_none(medium.get('width')),
2550                 'height': int_or_none(medium.get('height')),
2551                 'format_note': 'SMIL storyboards',
2552             })
2553
2554         return formats
2555
2556     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2557         urls = []
2558         subtitles = {}
2559         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2560             src = textstream.get('src')
2561             if not src or src in urls:
2562                 continue
2563             urls.append(src)
2564             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2565             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2566             subtitles.setdefault(lang, []).append({
2567                 'url': src,
2568                 'ext': ext,
2569             })
2570         return subtitles
2571
2572     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2573         xspf = self._download_xml(
2574             xspf_url, playlist_id, 'Downloading xpsf playlist',
2575             'Unable to download xspf manifest', fatal=fatal)
2576         if xspf is False:
2577             return []
2578         return self._parse_xspf(
2579             xspf, playlist_id, xspf_url=xspf_url,
2580             xspf_base_url=base_url(xspf_url))
2581
2582     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2583         NS_MAP = {
2584             'xspf': 'http://xspf.org/ns/0/',
2585             's1': 'http://static.streamone.nl/player/ns/0',
2586         }
2587
2588         entries = []
2589         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2590             title = xpath_text(
2591                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2592             description = xpath_text(
2593                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2594             thumbnail = xpath_text(
2595                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2596             duration = float_or_none(
2597                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2598
2599             formats = []
2600             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2601                 format_url = urljoin(xspf_base_url, location.text)
2602                 if not format_url:
2603                     continue
2604                 formats.append({
2605                     'url': format_url,
2606                     'manifest_url': xspf_url,
2607                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2608                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2609                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2610                 })
2611             self._sort_formats(formats)
2612
2613             entries.append({
2614                 'id': playlist_id,
2615                 'title': title,
2616                 'description': description,
2617                 'thumbnail': thumbnail,
2618                 'duration': duration,
2619                 'formats': formats,
2620             })
2621         return entries
2622
2623     def _extract_mpd_formats(self, *args, **kwargs):
2624         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2625         if subs:
2626             self._report_ignoring_subs('DASH')
2627         return fmts
2628
2629     def _extract_mpd_formats_and_subtitles(
2630             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2631             fatal=True, data=None, headers={}, query={}):
2632         res = self._download_xml_handle(
2633             mpd_url, video_id,
2634             note='Downloading MPD manifest' if note is None else note,
2635             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2636             fatal=fatal, data=data, headers=headers, query=query)
2637         if res is False:
2638             return [], {}
2639         mpd_doc, urlh = res
2640         if mpd_doc is None:
2641             return [], {}
2642         mpd_base_url = base_url(urlh.geturl())
2643
2644         return self._parse_mpd_formats_and_subtitles(
2645             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2646
2647     def _parse_mpd_formats(self, *args, **kwargs):
2648         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2649         if subs:
2650             self._report_ignoring_subs('DASH')
2651         return fmts
2652
2653     def _parse_mpd_formats_and_subtitles(
2654             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2655         """
2656         Parse formats from MPD manifest.
2657         References:
2658          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2659             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2660          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2661         """
2662         if not self.get_param('dynamic_mpd', True):
2663             if mpd_doc.get('type') == 'dynamic':
2664                 return [], {}
2665
2666         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2667
2668         def _add_ns(path):
2669             return self._xpath_ns(path, namespace)
2670
2671         def is_drm_protected(element):
2672             return element.find(_add_ns('ContentProtection')) is not None
2673
2674         def extract_multisegment_info(element, ms_parent_info):
2675             ms_info = ms_parent_info.copy()
2676
2677             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2678             # common attributes and elements.  We will only extract relevant
2679             # for us.
2680             def extract_common(source):
2681                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2682                 if segment_timeline is not None:
2683                     s_e = segment_timeline.findall(_add_ns('S'))
2684                     if s_e:
2685                         ms_info['total_number'] = 0
2686                         ms_info['s'] = []
2687                         for s in s_e:
2688                             r = int(s.get('r', 0))
2689                             ms_info['total_number'] += 1 + r
2690                             ms_info['s'].append({
2691                                 't': int(s.get('t', 0)),
2692                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2693                                 'd': int(s.attrib['d']),
2694                                 'r': r,
2695                             })
2696                 start_number = source.get('startNumber')
2697                 if start_number:
2698                     ms_info['start_number'] = int(start_number)
2699                 timescale = source.get('timescale')
2700                 if timescale:
2701                     ms_info['timescale'] = int(timescale)
2702                 segment_duration = source.get('duration')
2703                 if segment_duration:
2704                     ms_info['segment_duration'] = float(segment_duration)
2705
2706             def extract_Initialization(source):
2707                 initialization = source.find(_add_ns('Initialization'))
2708                 if initialization is not None:
2709                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2710
2711             segment_list = element.find(_add_ns('SegmentList'))
2712             if segment_list is not None:
2713                 extract_common(segment_list)
2714                 extract_Initialization(segment_list)
2715                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2716                 if segment_urls_e:
2717                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2718             else:
2719                 segment_template = element.find(_add_ns('SegmentTemplate'))
2720                 if segment_template is not None:
2721                     extract_common(segment_template)
2722                     media = segment_template.get('media')
2723                     if media:
2724                         ms_info['media'] = media
2725                     initialization = segment_template.get('initialization')
2726                     if initialization:
2727                         ms_info['initialization'] = initialization
2728                     else:
2729                         extract_Initialization(segment_template)
2730             return ms_info
2731
2732         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2733         formats, subtitles = [], {}
2734         stream_numbers = collections.defaultdict(int)
2735         for period in mpd_doc.findall(_add_ns('Period')):
2736             period_duration = parse_duration(period.get('duration')) or mpd_duration
2737             period_ms_info = extract_multisegment_info(period, {
2738                 'start_number': 1,
2739                 'timescale': 1,
2740             })
2741             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2742                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2743                 for representation in adaptation_set.findall(_add_ns('Representation')):
2744                     representation_attrib = adaptation_set.attrib.copy()
2745                     representation_attrib.update(representation.attrib)
2746                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2747                     mime_type = representation_attrib['mimeType']
2748                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2749
2750                     codecs = parse_codecs(representation_attrib.get('codecs', ''))
2751                     if content_type not in ('video', 'audio', 'text'):
2752                         if mime_type == 'image/jpeg':
2753                             content_type = mime_type
2754                         elif codecs['vcodec'] != 'none':
2755                             content_type = 'video'
2756                         elif codecs['acodec'] != 'none':
2757                             content_type = 'audio'
2758                         elif codecs.get('tcodec', 'none') != 'none':
2759                             content_type = 'text'
2760                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2761                             content_type = 'text'
2762                         else:
2763                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2764                             continue
2765
2766                     base_url = ''
2767                     for element in (representation, adaptation_set, period, mpd_doc):
2768                         base_url_e = element.find(_add_ns('BaseURL'))
2769                         if base_url_e is not None:
2770                             base_url = base_url_e.text + base_url
2771                             if re.match(r'^https?://', base_url):
2772                                 break
2773                     if mpd_base_url and base_url.startswith('/'):
2774                         base_url = compat_urlparse.urljoin(mpd_base_url, base_url)
2775                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2776                         if not mpd_base_url.endswith('/'):
2777                             mpd_base_url += '/'
2778                         base_url = mpd_base_url + base_url
2779                     representation_id = representation_attrib.get('id')
2780                     lang = representation_attrib.get('lang')
2781                     url_el = representation.find(_add_ns('BaseURL'))
2782                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2783                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2784                     if representation_id is not None:
2785                         format_id = representation_id
2786                     else:
2787                         format_id = content_type
2788                     if mpd_id:
2789                         format_id = mpd_id + '-' + format_id
2790                     if content_type in ('video', 'audio'):
2791                         f = {
2792                             'format_id': format_id,
2793                             'manifest_url': mpd_url,
2794                             'ext': mimetype2ext(mime_type),
2795                             'width': int_or_none(representation_attrib.get('width')),
2796                             'height': int_or_none(representation_attrib.get('height')),
2797                             'tbr': float_or_none(bandwidth, 1000),
2798                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2799                             'fps': int_or_none(representation_attrib.get('frameRate')),
2800                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2801                             'format_note': 'DASH %s' % content_type,
2802                             'filesize': filesize,
2803                             'container': mimetype2ext(mime_type) + '_dash',
2804                             **codecs
2805                         }
2806                     elif content_type == 'text':
2807                         f = {
2808                             'ext': mimetype2ext(mime_type),
2809                             'manifest_url': mpd_url,
2810                             'filesize': filesize,
2811                         }
2812                     elif content_type == 'image/jpeg':
2813                         # See test case in VikiIE
2814                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2815                         f = {
2816                             'format_id': format_id,
2817                             'ext': 'mhtml',
2818                             'manifest_url': mpd_url,
2819                             'format_note': 'DASH storyboards (jpeg)',
2820                             'acodec': 'none',
2821                             'vcodec': 'none',
2822                         }
2823                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2824                         f['has_drm'] = True
2825                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2826
2827                     def prepare_template(template_name, identifiers):
2828                         tmpl = representation_ms_info[template_name]
2829                         # First of, % characters outside $...$ templates
2830                         # must be escaped by doubling for proper processing
2831                         # by % operator string formatting used further (see
2832                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2833                         t = ''
2834                         in_template = False
2835                         for c in tmpl:
2836                             t += c
2837                             if c == '$':
2838                                 in_template = not in_template
2839                             elif c == '%' and not in_template:
2840                                 t += c
2841                         # Next, $...$ templates are translated to their
2842                         # %(...) counterparts to be used with % operator
2843                         if representation_id is not None:
2844                             t = t.replace('$RepresentationID$', representation_id)
2845                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2846                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2847                         t.replace('$$', '$')
2848                         return t
2849
2850                     # @initialization is a regular template like @media one
2851                     # so it should be handled just the same way (see
2852                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2853                     if 'initialization' in representation_ms_info:
2854                         initialization_template = prepare_template(
2855                             'initialization',
2856                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2857                             # $Time$ shall not be included for @initialization thus
2858                             # only $Bandwidth$ remains
2859                             ('Bandwidth', ))
2860                         representation_ms_info['initialization_url'] = initialization_template % {
2861                             'Bandwidth': bandwidth,
2862                         }
2863
2864                     def location_key(location):
2865                         return 'url' if re.match(r'^https?://', location) else 'path'
2866
2867                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2868
2869                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2870                         media_location_key = location_key(media_template)
2871
2872                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2873                         # can't be used at the same time
2874                         if '%(Number' in media_template and 's' not in representation_ms_info:
2875                             segment_duration = None
2876                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2877                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2878                                 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2879                             representation_ms_info['fragments'] = [{
2880                                 media_location_key: media_template % {
2881                                     'Number': segment_number,
2882                                     'Bandwidth': bandwidth,
2883                                 },
2884                                 'duration': segment_duration,
2885                             } for segment_number in range(
2886                                 representation_ms_info['start_number'],
2887                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2888                         else:
2889                             # $Number*$ or $Time$ in media template with S list available
2890                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2891                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2892                             representation_ms_info['fragments'] = []
2893                             segment_time = 0
2894                             segment_d = None
2895                             segment_number = representation_ms_info['start_number']
2896
2897                             def add_segment_url():
2898                                 segment_url = media_template % {
2899                                     'Time': segment_time,
2900                                     'Bandwidth': bandwidth,
2901                                     'Number': segment_number,
2902                                 }
2903                                 representation_ms_info['fragments'].append({
2904                                     media_location_key: segment_url,
2905                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2906                                 })
2907
2908                             for num, s in enumerate(representation_ms_info['s']):
2909                                 segment_time = s.get('t') or segment_time
2910                                 segment_d = s['d']
2911                                 add_segment_url()
2912                                 segment_number += 1
2913                                 for r in range(s.get('r', 0)):
2914                                     segment_time += segment_d
2915                                     add_segment_url()
2916                                     segment_number += 1
2917                                 segment_time += segment_d
2918                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2919                         # No media template
2920                         # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2921                         # or any YouTube dashsegments video
2922                         fragments = []
2923                         segment_index = 0
2924                         timescale = representation_ms_info['timescale']
2925                         for s in representation_ms_info['s']:
2926                             duration = float_or_none(s['d'], timescale)
2927                             for r in range(s.get('r', 0) + 1):
2928                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
2929                                 fragments.append({
2930                                     location_key(segment_uri): segment_uri,
2931                                     'duration': duration,
2932                                 })
2933                                 segment_index += 1
2934                         representation_ms_info['fragments'] = fragments
2935                     elif 'segment_urls' in representation_ms_info:
2936                         # Segment URLs with no SegmentTimeline
2937                         # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2938                         # https://github.com/ytdl-org/youtube-dl/pull/14844
2939                         fragments = []
2940                         segment_duration = float_or_none(
2941                             representation_ms_info['segment_duration'],
2942                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2943                         for segment_url in representation_ms_info['segment_urls']:
2944                             fragment = {
2945                                 location_key(segment_url): segment_url,
2946                             }
2947                             if segment_duration:
2948                                 fragment['duration'] = segment_duration
2949                             fragments.append(fragment)
2950                         representation_ms_info['fragments'] = fragments
2951                     # If there is a fragments key available then we correctly recognized fragmented media.
2952                     # Otherwise we will assume unfragmented media with direct access. Technically, such
2953                     # assumption is not necessarily correct since we may simply have no support for
2954                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2955                     if 'fragments' in representation_ms_info:
2956                         f.update({
2957                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
2958                             'url': mpd_url or base_url,
2959                             'fragment_base_url': base_url,
2960                             'fragments': [],
2961                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2962                         })
2963                         if 'initialization_url' in representation_ms_info:
2964                             initialization_url = representation_ms_info['initialization_url']
2965                             if not f.get('url'):
2966                                 f['url'] = initialization_url
2967                             f['fragments'].append({location_key(initialization_url): initialization_url})
2968                         f['fragments'].extend(representation_ms_info['fragments'])
2969                     else:
2970                         # Assuming direct URL to unfragmented media.
2971                         f['url'] = base_url
2972                     if content_type in ('video', 'audio', 'image/jpeg'):
2973                         f['manifest_stream_number'] = stream_numbers[f['url']]
2974                         stream_numbers[f['url']] += 1
2975                         formats.append(f)
2976                     elif content_type == 'text':
2977                         subtitles.setdefault(lang or 'und', []).append(f)
2978
2979         return formats, subtitles
2980
2981     def _extract_ism_formats(self, *args, **kwargs):
2982         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2983         if subs:
2984             self._report_ignoring_subs('ISM')
2985         return fmts
2986
2987     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2988         res = self._download_xml_handle(
2989             ism_url, video_id,
2990             note='Downloading ISM manifest' if note is None else note,
2991             errnote='Failed to download ISM manifest' if errnote is None else errnote,
2992             fatal=fatal, data=data, headers=headers, query=query)
2993         if res is False:
2994             return [], {}
2995         ism_doc, urlh = res
2996         if ism_doc is None:
2997             return [], {}
2998
2999         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
3000
3001     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
3002         """
3003         Parse formats from ISM manifest.
3004         References:
3005          1. [MS-SSTR]: Smooth Streaming Protocol,
3006             https://msdn.microsoft.com/en-us/library/ff469518.aspx
3007         """
3008         if ism_doc.get('IsLive') == 'TRUE':
3009             return [], {}
3010
3011         duration = int(ism_doc.attrib['Duration'])
3012         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
3013
3014         formats = []
3015         subtitles = {}
3016         for stream in ism_doc.findall('StreamIndex'):
3017             stream_type = stream.get('Type')
3018             if stream_type not in ('video', 'audio', 'text'):
3019                 continue
3020             url_pattern = stream.attrib['Url']
3021             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
3022             stream_name = stream.get('Name')
3023             stream_language = stream.get('Language', 'und')
3024             for track in stream.findall('QualityLevel'):
3025                 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
3026                 # TODO: add support for WVC1 and WMAP
3027                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
3028                     self.report_warning('%s is not a supported codec' % fourcc)
3029                     continue
3030                 tbr = int(track.attrib['Bitrate']) // 1000
3031                 # [1] does not mention Width and Height attributes. However,
3032                 # they're often present while MaxWidth and MaxHeight are
3033                 # missing, so should be used as fallbacks
3034                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3035                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3036                 sampling_rate = int_or_none(track.get('SamplingRate'))
3037
3038                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3039                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
3040
3041                 fragments = []
3042                 fragment_ctx = {
3043                     'time': 0,
3044                 }
3045                 stream_fragments = stream.findall('c')
3046                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3047                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3048                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3049                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3050                     if not fragment_ctx['duration']:
3051                         try:
3052                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
3053                         except IndexError:
3054                             next_fragment_time = duration
3055                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3056                     for _ in range(fragment_repeat):
3057                         fragments.append({
3058                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
3059                             'duration': fragment_ctx['duration'] / stream_timescale,
3060                         })
3061                         fragment_ctx['time'] += fragment_ctx['duration']
3062
3063                 if stream_type == 'text':
3064                     subtitles.setdefault(stream_language, []).append({
3065                         'ext': 'ismt',
3066                         'protocol': 'ism',
3067                         'url': ism_url,
3068                         'manifest_url': ism_url,
3069                         'fragments': fragments,
3070                         '_download_params': {
3071                             'stream_type': stream_type,
3072                             'duration': duration,
3073                             'timescale': stream_timescale,
3074                             'fourcc': fourcc,
3075                             'language': stream_language,
3076                             'codec_private_data': track.get('CodecPrivateData'),
3077                         }
3078                     })
3079                 elif stream_type in ('video', 'audio'):
3080                     formats.append({
3081                         'format_id': join_nonempty(ism_id, stream_name, tbr),
3082                         'url': ism_url,
3083                         'manifest_url': ism_url,
3084                         'ext': 'ismv' if stream_type == 'video' else 'isma',
3085                         'width': width,
3086                         'height': height,
3087                         'tbr': tbr,
3088                         'asr': sampling_rate,
3089                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
3090                         'acodec': 'none' if stream_type == 'video' else fourcc,
3091                         'protocol': 'ism',
3092                         'fragments': fragments,
3093                         'has_drm': ism_doc.find('Protection') is not None,
3094                         '_download_params': {
3095                             'stream_type': stream_type,
3096                             'duration': duration,
3097                             'timescale': stream_timescale,
3098                             'width': width or 0,
3099                             'height': height or 0,
3100                             'fourcc': fourcc,
3101                             'language': stream_language,
3102                             'codec_private_data': track.get('CodecPrivateData'),
3103                             'sampling_rate': sampling_rate,
3104                             'channels': int_or_none(track.get('Channels', 2)),
3105                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3106                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3107                         },
3108                     })
3109         return formats, subtitles
3110
3111     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
3112         def absolute_url(item_url):
3113             return urljoin(base_url, item_url)
3114
3115         def parse_content_type(content_type):
3116             if not content_type:
3117                 return {}
3118             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3119             if ctr:
3120                 mimetype, codecs = ctr.groups()
3121                 f = parse_codecs(codecs)
3122                 f['ext'] = mimetype2ext(mimetype)
3123                 return f
3124             return {}
3125
3126         def _media_formats(src, cur_media_type, type_info={}):
3127             full_url = absolute_url(src)
3128             ext = type_info.get('ext') or determine_ext(full_url)
3129             if ext == 'm3u8':
3130                 is_plain_url = False
3131                 formats = self._extract_m3u8_formats(
3132                     full_url, video_id, ext='mp4',
3133                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3134                     preference=preference, quality=quality, fatal=False)
3135             elif ext == 'mpd':
3136                 is_plain_url = False
3137                 formats = self._extract_mpd_formats(
3138                     full_url, video_id, mpd_id=mpd_id, fatal=False)
3139             else:
3140                 is_plain_url = True
3141                 formats = [{
3142                     'url': full_url,
3143                     'vcodec': 'none' if cur_media_type == 'audio' else None,
3144                 }]
3145             return is_plain_url, formats
3146
3147         entries = []
3148         # amp-video and amp-audio are very similar to their HTML5 counterparts
3149         # so we wll include them right here (see
3150         # https://www.ampproject.org/docs/reference/components/amp-video)
3151         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3152         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3153         media_tags = [(media_tag, media_tag_name, media_type, '')
3154                       for media_tag, media_tag_name, media_type
3155                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3156         media_tags.extend(re.findall(
3157             # We only allow video|audio followed by a whitespace or '>'.
3158             # Allowing more characters may end up in significant slow down (see
3159             # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
3160             # http://www.porntrex.com/maps/videositemap.xml).
3161             r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3162         for media_tag, _, media_type, media_content in media_tags:
3163             media_info = {
3164                 'formats': [],
3165                 'subtitles': {},
3166             }
3167             media_attributes = extract_attributes(media_tag)
3168             src = strip_or_none(media_attributes.get('src'))
3169             if src:
3170                 _, formats = _media_formats(src, media_type)
3171                 media_info['formats'].extend(formats)
3172             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3173             if media_content:
3174                 for source_tag in re.findall(r'<source[^>]+>', media_content):
3175                     s_attr = extract_attributes(source_tag)
3176                     # data-video-src and data-src are non standard but seen
3177                     # several times in the wild
3178                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
3179                     if not src:
3180                         continue
3181                     f = parse_content_type(s_attr.get('type'))
3182                     is_plain_url, formats = _media_formats(src, media_type, f)
3183                     if is_plain_url:
3184                         # width, height, res, label and title attributes are
3185                         # all not standard but seen several times in the wild
3186                         labels = [
3187                             s_attr.get(lbl)
3188                             for lbl in ('label', 'title')
3189                             if str_or_none(s_attr.get(lbl))
3190                         ]
3191                         width = int_or_none(s_attr.get('width'))
3192                         height = (int_or_none(s_attr.get('height'))
3193                                   or int_or_none(s_attr.get('res')))
3194                         if not width or not height:
3195                             for lbl in labels:
3196                                 resolution = parse_resolution(lbl)
3197                                 if not resolution:
3198                                     continue
3199                                 width = width or resolution.get('width')
3200                                 height = height or resolution.get('height')
3201                         for lbl in labels:
3202                             tbr = parse_bitrate(lbl)
3203                             if tbr:
3204                                 break
3205                         else:
3206                             tbr = None
3207                         f.update({
3208                             'width': width,
3209                             'height': height,
3210                             'tbr': tbr,
3211                             'format_id': s_attr.get('label') or s_attr.get('title'),
3212                         })
3213                         f.update(formats[0])
3214                         media_info['formats'].append(f)
3215                     else:
3216                         media_info['formats'].extend(formats)
3217                 for track_tag in re.findall(r'<track[^>]+>', media_content):
3218                     track_attributes = extract_attributes(track_tag)
3219                     kind = track_attributes.get('kind')
3220                     if not kind or kind in ('subtitles', 'captions'):
3221                         src = strip_or_none(track_attributes.get('src'))
3222                         if not src:
3223                             continue
3224                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3225                         media_info['subtitles'].setdefault(lang, []).append({
3226                             'url': absolute_url(src),
3227                         })
3228             for f in media_info['formats']:
3229                 f.setdefault('http_headers', {})['Referer'] = base_url
3230             if media_info['formats'] or media_info['subtitles']:
3231                 entries.append(media_info)
3232         return entries
3233
3234     def _extract_akamai_formats(self, *args, **kwargs):
3235         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3236         if subs:
3237             self._report_ignoring_subs('akamai')
3238         return fmts
3239
3240     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3241         signed = 'hdnea=' in manifest_url
3242         if not signed:
3243             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3244             manifest_url = re.sub(
3245                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3246                 '', manifest_url).strip('?')
3247
3248         formats = []
3249         subtitles = {}
3250
3251         hdcore_sign = 'hdcore=3.7.0'
3252         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3253         hds_host = hosts.get('hds')
3254         if hds_host:
3255             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3256         if 'hdcore=' not in f4m_url:
3257             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3258         f4m_formats = self._extract_f4m_formats(
3259             f4m_url, video_id, f4m_id='hds', fatal=False)
3260         for entry in f4m_formats:
3261             entry.update({'extra_param_to_segment_url': hdcore_sign})
3262         formats.extend(f4m_formats)
3263
3264         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3265         hls_host = hosts.get('hls')
3266         if hls_host:
3267             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3268         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3269             m3u8_url, video_id, 'mp4', 'm3u8_native',
3270             m3u8_id='hls', fatal=False)
3271         formats.extend(m3u8_formats)
3272         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3273
3274         http_host = hosts.get('http')
3275         if http_host and m3u8_formats and not signed:
3276             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3277             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3278             qualities_length = len(qualities)
3279             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3280                 i = 0
3281                 for f in m3u8_formats:
3282                     if f['vcodec'] != 'none':
3283                         for protocol in ('http', 'https'):
3284                             http_f = f.copy()
3285                             del http_f['manifest_url']
3286                             http_url = re.sub(
3287                                 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
3288                             http_f.update({
3289                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3290                                 'url': http_url,
3291                                 'protocol': protocol,
3292                             })
3293                             formats.append(http_f)
3294                         i += 1
3295
3296         return formats, subtitles
3297
3298     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3299         query = compat_urlparse.urlparse(url).query
3300         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3301         mobj = re.search(
3302             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3303         url_base = mobj.group('url')
3304         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3305         formats = []
3306
3307         def manifest_url(manifest):
3308             m_url = '%s/%s' % (http_base_url, manifest)
3309             if query:
3310                 m_url += '?%s' % query
3311             return m_url
3312
3313         if 'm3u8' not in skip_protocols:
3314             formats.extend(self._extract_m3u8_formats(
3315                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3316                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3317         if 'f4m' not in skip_protocols:
3318             formats.extend(self._extract_f4m_formats(
3319                 manifest_url('manifest.f4m'),
3320                 video_id, f4m_id='hds', fatal=False))
3321         if 'dash' not in skip_protocols:
3322             formats.extend(self._extract_mpd_formats(
3323                 manifest_url('manifest.mpd'),
3324                 video_id, mpd_id='dash', fatal=False))
3325         if re.search(r'(?:/smil:|\.smil)', url_base):
3326             if 'smil' not in skip_protocols:
3327                 rtmp_formats = self._extract_smil_formats(
3328                     manifest_url('jwplayer.smil'),
3329                     video_id, fatal=False)
3330                 for rtmp_format in rtmp_formats:
3331                     rtsp_format = rtmp_format.copy()
3332                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3333                     del rtsp_format['play_path']
3334                     del rtsp_format['ext']
3335                     rtsp_format.update({
3336                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3337                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3338                         'protocol': 'rtsp',
3339                     })
3340                     formats.extend([rtmp_format, rtsp_format])
3341         else:
3342             for protocol in ('rtmp', 'rtsp'):
3343                 if protocol not in skip_protocols:
3344                     formats.append({
3345                         'url': '%s:%s' % (protocol, url_base),
3346                         'format_id': protocol,
3347                         'protocol': protocol,
3348                     })
3349         return formats
3350
3351     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3352         mobj = re.search(
3353             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3354             webpage)
3355         if mobj:
3356             try:
3357                 jwplayer_data = self._parse_json(mobj.group('options'),
3358                                                  video_id=video_id,
3359                                                  transform_source=transform_source)
3360             except ExtractorError:
3361                 pass
3362             else:
3363                 if isinstance(jwplayer_data, dict):
3364                     return jwplayer_data
3365
3366     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3367         jwplayer_data = self._find_jwplayer_data(
3368             webpage, video_id, transform_source=js_to_json)
3369         return self._parse_jwplayer_data(
3370             jwplayer_data, video_id, *args, **kwargs)
3371
3372     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3373                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3374         # JWPlayer backward compatibility: flattened playlists
3375         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3376         if 'playlist' not in jwplayer_data:
3377             jwplayer_data = {'playlist': [jwplayer_data]}
3378
3379         entries = []
3380
3381         # JWPlayer backward compatibility: single playlist item
3382         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3383         if not isinstance(jwplayer_data['playlist'], list):
3384             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3385
3386         for video_data in jwplayer_data['playlist']:
3387             # JWPlayer backward compatibility: flattened sources
3388             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3389             if 'sources' not in video_data:
3390                 video_data['sources'] = [video_data]
3391
3392             this_video_id = video_id or video_data['mediaid']
3393
3394             formats = self._parse_jwplayer_formats(
3395                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3396                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3397
3398             subtitles = {}
3399             tracks = video_data.get('tracks')
3400             if tracks and isinstance(tracks, list):
3401                 for track in tracks:
3402                     if not isinstance(track, dict):
3403                         continue
3404                     track_kind = track.get('kind')
3405                     if not track_kind or not isinstance(track_kind, compat_str):
3406                         continue
3407                     if track_kind.lower() not in ('captions', 'subtitles'):
3408                         continue
3409                     track_url = urljoin(base_url, track.get('file'))
3410                     if not track_url:
3411                         continue
3412                     subtitles.setdefault(track.get('label') or 'en', []).append({
3413                         'url': self._proto_relative_url(track_url)
3414                     })
3415
3416             entry = {
3417                 'id': this_video_id,
3418                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3419                 'description': clean_html(video_data.get('description')),
3420                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3421                 'timestamp': int_or_none(video_data.get('pubdate')),
3422                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3423                 'subtitles': subtitles,
3424             }
3425             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3426             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3427                 entry.update({
3428                     '_type': 'url_transparent',
3429                     'url': formats[0]['url'],
3430                 })
3431             else:
3432                 self._sort_formats(formats)
3433                 entry['formats'] = formats
3434             entries.append(entry)
3435         if len(entries) == 1:
3436             return entries[0]
3437         else:
3438             return self.playlist_result(entries)
3439
3440     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3441                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3442         urls = []
3443         formats = []
3444         for source in jwplayer_sources_data:
3445             if not isinstance(source, dict):
3446                 continue
3447             source_url = urljoin(
3448                 base_url, self._proto_relative_url(source.get('file')))
3449             if not source_url or source_url in urls:
3450                 continue
3451             urls.append(source_url)
3452             source_type = source.get('type') or ''
3453             ext = mimetype2ext(source_type) or determine_ext(source_url)
3454             if source_type == 'hls' or ext == 'm3u8':
3455                 formats.extend(self._extract_m3u8_formats(
3456                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3457                     m3u8_id=m3u8_id, fatal=False))
3458             elif source_type == 'dash' or ext == 'mpd':
3459                 formats.extend(self._extract_mpd_formats(
3460                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3461             elif ext == 'smil':
3462                 formats.extend(self._extract_smil_formats(
3463                     source_url, video_id, fatal=False))
3464             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3465             elif source_type.startswith('audio') or ext in (
3466                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3467                 formats.append({
3468                     'url': source_url,
3469                     'vcodec': 'none',
3470                     'ext': ext,
3471                 })
3472             else:
3473                 height = int_or_none(source.get('height'))
3474                 if height is None:
3475                     # Often no height is provided but there is a label in
3476                     # format like "1080p", "720p SD", or 1080.
3477                     height = int_or_none(self._search_regex(
3478                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3479                         'height', default=None))
3480                 a_format = {
3481                     'url': source_url,
3482                     'width': int_or_none(source.get('width')),
3483                     'height': height,
3484                     'tbr': int_or_none(source.get('bitrate')),
3485                     'ext': ext,
3486                 }
3487                 if source_url.startswith('rtmp'):
3488                     a_format['ext'] = 'flv'
3489                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3490                     # of jwplayer.flash.swf
3491                     rtmp_url_parts = re.split(
3492                         r'((?:mp4|mp3|flv):)', source_url, 1)
3493                     if len(rtmp_url_parts) == 3:
3494                         rtmp_url, prefix, play_path = rtmp_url_parts
3495                         a_format.update({
3496                             'url': rtmp_url,
3497                             'play_path': prefix + play_path,
3498                         })
3499                     if rtmp_params:
3500                         a_format.update(rtmp_params)
3501                 formats.append(a_format)
3502         return formats
3503
3504     def _live_title(self, name):
3505         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3506         return name
3507
3508     def _int(self, v, name, fatal=False, **kwargs):
3509         res = int_or_none(v, **kwargs)
3510         if res is None:
3511             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3512             if fatal:
3513                 raise ExtractorError(msg)
3514             else:
3515                 self.report_warning(msg)
3516         return res
3517
3518     def _float(self, v, name, fatal=False, **kwargs):
3519         res = float_or_none(v, **kwargs)
3520         if res is None:
3521             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3522             if fatal:
3523                 raise ExtractorError(msg)
3524             else:
3525                 self.report_warning(msg)
3526         return res
3527
3528     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3529                     path='/', secure=False, discard=False, rest={}, **kwargs):
3530         cookie = compat_cookiejar_Cookie(
3531             0, name, value, port, port is not None, domain, True,
3532             domain.startswith('.'), path, True, secure, expire_time,
3533             discard, None, None, rest)
3534         self._downloader.cookiejar.set_cookie(cookie)
3535
3536     def _get_cookies(self, url):
3537         """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3538         req = sanitized_Request(url)
3539         self._downloader.cookiejar.add_cookie_header(req)
3540         return compat_cookies_SimpleCookie(req.get_header('Cookie'))
3541
3542     def _apply_first_set_cookie_header(self, url_handle, cookie):
3543         """
3544         Apply first Set-Cookie header instead of the last. Experimental.
3545
3546         Some sites (e.g. [1-3]) may serve two cookies under the same name
3547         in Set-Cookie header and expect the first (old) one to be set rather
3548         than second (new). However, as of RFC6265 the newer one cookie
3549         should be set into cookie store what actually happens.
3550         We will workaround this issue by resetting the cookie to
3551         the first one manually.
3552         1. https://new.vk.com/
3553         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3554         3. https://learning.oreilly.com/
3555         """
3556         for header, cookies in url_handle.headers.items():
3557             if header.lower() != 'set-cookie':
3558                 continue
3559             if sys.version_info[0] >= 3:
3560                 cookies = cookies.encode('iso-8859-1')
3561             cookies = cookies.decode('utf-8')
3562             cookie_value = re.search(
3563                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3564             if cookie_value:
3565                 value, domain = cookie_value.groups()
3566                 self._set_cookie(domain, cookie, value)
3567                 break
3568
3569     def get_testcases(self, include_onlymatching=False):
3570         t = getattr(self, '_TEST', None)
3571         if t:
3572             assert not hasattr(self, '_TESTS'), \
3573                 '%s has _TEST and _TESTS' % type(self).__name__
3574             tests = [t]
3575         else:
3576             tests = getattr(self, '_TESTS', [])
3577         for t in tests:
3578             if not include_onlymatching and t.get('only_matching', False):
3579                 continue
3580             t['name'] = type(self).__name__[:-len('IE')]
3581             yield t
3582
3583     def is_suitable(self, age_limit):
3584         """ Test whether the extractor is generally suitable for the given
3585         age limit (i.e. pornographic sites are not, all others usually are) """
3586
3587         any_restricted = False
3588         for tc in self.get_testcases(include_onlymatching=False):
3589             if tc.get('playlist', []):
3590                 tc = tc['playlist'][0]
3591             is_restricted = age_restricted(
3592                 tc.get('info_dict', {}).get('age_limit'), age_limit)
3593             if not is_restricted:
3594                 return True
3595             any_restricted = any_restricted or is_restricted
3596         return not any_restricted
3597
3598     def extract_subtitles(self, *args, **kwargs):
3599         if (self.get_param('writesubtitles', False)
3600                 or self.get_param('listsubtitles')):
3601             return self._get_subtitles(*args, **kwargs)
3602         return {}
3603
3604     def _get_subtitles(self, *args, **kwargs):
3605         raise NotImplementedError('This method must be implemented by subclasses')
3606
3607     def extract_comments(self, *args, **kwargs):
3608         if not self.get_param('getcomments'):
3609             return None
3610         generator = self._get_comments(*args, **kwargs)
3611
3612         def extractor():
3613             comments = []
3614             interrupted = True
3615             try:
3616                 while True:
3617                     comments.append(next(generator))
3618             except StopIteration:
3619                 interrupted = False
3620             except KeyboardInterrupt:
3621                 self.to_screen('Interrupted by user')
3622             except Exception as e:
3623                 if self.get_param('ignoreerrors') is not True:
3624                     raise
3625                 self._downloader.report_error(e)
3626             comment_count = len(comments)
3627             self.to_screen(f'Extracted {comment_count} comments')
3628             return {
3629                 'comments': comments,
3630                 'comment_count': None if interrupted else comment_count
3631             }
3632         return extractor
3633
3634     def _get_comments(self, *args, **kwargs):
3635         raise NotImplementedError('This method must be implemented by subclasses')
3636
3637     @staticmethod
3638     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3639         """ Merge subtitle items for one language. Items with duplicated URLs
3640         will be dropped. """
3641         list1_urls = set([item['url'] for item in subtitle_list1])
3642         ret = list(subtitle_list1)
3643         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
3644         return ret
3645
3646     @classmethod
3647     def _merge_subtitles(cls, *dicts, target=None):
3648         """ Merge subtitle dictionaries, language by language. """
3649         if target is None:
3650             target = {}
3651         for d in dicts:
3652             for lang, subs in d.items():
3653                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3654         return target
3655
3656     def extract_automatic_captions(self, *args, **kwargs):
3657         if (self.get_param('writeautomaticsub', False)
3658                 or self.get_param('listsubtitles')):
3659             return self._get_automatic_captions(*args, **kwargs)
3660         return {}
3661
3662     def _get_automatic_captions(self, *args, **kwargs):
3663         raise NotImplementedError('This method must be implemented by subclasses')
3664
3665     def mark_watched(self, *args, **kwargs):
3666         if not self.get_param('mark_watched', False):
3667             return
3668         if (self._get_login_info()[0] is not None
3669                 or self.get_param('cookiefile')
3670                 or self.get_param('cookiesfrombrowser')):
3671             self._mark_watched(*args, **kwargs)
3672
3673     def _mark_watched(self, *args, **kwargs):
3674         raise NotImplementedError('This method must be implemented by subclasses')
3675
3676     def geo_verification_headers(self):
3677         headers = {}
3678         geo_verification_proxy = self.get_param('geo_verification_proxy')
3679         if geo_verification_proxy:
3680             headers['Ytdl-request-proxy'] = geo_verification_proxy
3681         return headers
3682
3683     def _generic_id(self, url):
3684         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3685
3686     def _generic_title(self, url):
3687         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3688
3689     @staticmethod
3690     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3691         all_known = all(map(
3692             lambda x: x is not None,
3693             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3694         return (
3695             'private' if is_private
3696             else 'premium_only' if needs_premium
3697             else 'subscriber_only' if needs_subscription
3698             else 'needs_auth' if needs_auth
3699             else 'unlisted' if is_unlisted
3700             else 'public' if all_known
3701             else None)
3702
3703     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3704         '''
3705         @returns            A list of values for the extractor argument given by "key"
3706                             or "default" if no such key is present
3707         @param default      The default value to return when the key is not present (default: [])
3708         @param casesense    When false, the values are converted to lower case
3709         '''
3710         val = traverse_obj(
3711             self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key))
3712         if val is None:
3713             return [] if default is NO_DEFAULT else default
3714         return list(val) if casesense else [x.lower() for x in val]
3715
3716     def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3717         if not playlist_id or not video_id:
3718             return not video_id
3719
3720         no_playlist = (smuggled_data or {}).get('force_noplaylist')
3721         if no_playlist is not None:
3722             return not no_playlist
3723
3724         video_id = '' if video_id is True else f' {video_id}'
3725         playlist_id = '' if playlist_id is True else f' {playlist_id}'
3726         if self.get_param('noplaylist'):
3727             self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3728             return False
3729         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3730         return True
3731
3732
3733 class SearchInfoExtractor(InfoExtractor):
3734     """
3735     Base class for paged search queries extractors.
3736     They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
3737     Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
3738     """
3739
3740     _MAX_RESULTS = float('inf')
3741
3742     @classmethod
3743     def _make_valid_url(cls):
3744         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3745
3746     def _real_extract(self, query):
3747         prefix, query = self._match_valid_url(query).group('prefix', 'query')
3748         if prefix == '':
3749             return self._get_n_results(query, 1)
3750         elif prefix == 'all':
3751             return self._get_n_results(query, self._MAX_RESULTS)
3752         else:
3753             n = int(prefix)
3754             if n <= 0:
3755                 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
3756             elif n > self._MAX_RESULTS:
3757                 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3758                 n = self._MAX_RESULTS
3759             return self._get_n_results(query, n)
3760
3761     def _get_n_results(self, query, n):
3762         """Get a specified number of results for a query.
3763         Either this function or _search_results must be overridden by subclasses """
3764         return self.playlist_result(
3765             itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
3766             query, query)
3767
3768     def _search_results(self, query):
3769         """Returns an iterator of search results"""
3770         raise NotImplementedError('This method must be implemented by subclasses')
3771
3772     @property
3773     def SEARCH_KEY(self):
3774         return self._SEARCH_KEY