yt_dlp/extractor/common.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import base64
   5 import collections
   6 import xml.etree.ElementTree
   7 import hashlib
   8 import itertools
   9 import json
  10 import netrc
  11 import os
  12 import random
  13 import re
  14 import sys
  15 import time
  16 import math
  17
  18 from ..compat import (
  19     compat_cookiejar_Cookie,
  20     compat_cookies_SimpleCookie,
  21     compat_etree_fromstring,
  22     compat_expanduser,
  23     compat_getpass,
  24     compat_http_client,
  25     compat_os_name,
  26     compat_Pattern,
  27     compat_str,
  28     compat_urllib_error,
  29     compat_urllib_parse_unquote,
  30     compat_urllib_parse_urlencode,
  31     compat_urllib_request,
  32     compat_urlparse,
  33 )
  34 from ..downloader import FileDownloader
  35 from ..downloader.f4m import (
  36     get_base_url,
  37     remove_encrypted_media,
  38 )
  39 from ..utils import (
  40     age_restricted,
  41     base_url,
  42     bug_reports_message,
  43     clean_html,
  44     determine_ext,
  45     determine_protocol,
  46     dict_get,
  47     encode_data_uri,
  48     error_to_compat_str,
  49     extract_attributes,
  50     ExtractorError,
  51     filter_dict,
  52     fix_xml_ampersands,
  53     float_or_none,
  54     format_field,
  55     GeoRestrictedError,
  56     GeoUtils,
  57     int_or_none,
  58     join_nonempty,
  59     js_to_json,
  60     JSON_LD_RE,
  61     mimetype2ext,
  62     network_exceptions,
  63     NO_DEFAULT,
  64     orderedSet,
  65     parse_bitrate,
  66     parse_codecs,
  67     parse_duration,
  68     parse_iso8601,
  69     parse_m3u8_attributes,
  70     parse_resolution,
  71     RegexNotFoundError,
  72     sanitize_filename,
  73     sanitized_Request,
  74     str_or_none,
  75     str_to_int,
  76     strip_or_none,
  77     traverse_obj,
  78     try_get,
  79     unescapeHTML,
  80     UnsupportedError,
  81     unified_strdate,
  82     unified_timestamp,
  83     update_Request,
  84     update_url_query,
  85     url_basename,
  86     url_or_none,
  87     urljoin,
  88     variadic,
  89     xpath_element,
  90     xpath_text,
  91     xpath_with_ns,
  92 )
  93
  94
  95 class InfoExtractor(object):
  96     """Information Extractor class.
  97
  98     Information extractors are the classes that, given a URL, extract
  99     information about the video (or videos) the URL refers to. This
 100     information includes the real video URL, the video title, author and
 101     others. The information is stored in a dictionary which is then
 102     passed to the YoutubeDL. The YoutubeDL processes this
 103     information possibly downloading the video to the file system, among
 104     other possible outcomes.
 105
 106     The type field determines the type of the result.
 107     By far the most common value (and the default if _type is missing) is
 108     "video", which indicates a single video.
 109
 110     For a video, the dictionaries must include the following fields:
 111
 112     id:             Video identifier.
 113     title:          Video title, unescaped.
 114
 115     Additionally, it must contain either a formats entry or a url one:
 116
 117     formats:        A list of dictionaries for each format available, ordered
 118                     from worst to best quality.
 119
 120                     Potential fields:
 121                     * url        The mandatory URL representing the media:
 122                                    for plain file media - HTTP URL of this file,
 123                                    for RTMP - RTMP URL,
 124                                    for HLS - URL of the M3U8 media playlist,
 125                                    for HDS - URL of the F4M manifest,
 126                                    for DASH
 127                                      - HTTP URL to plain file media (in case of
 128                                        unfragmented media)
 129                                      - URL of the MPD manifest or base URL
 130                                        representing the media if MPD manifest
 131                                        is parsed from a string (in case of
 132                                        fragmented media)
 133                                    for MSS - URL of the ISM manifest.
 134                     * manifest_url
 135                                  The URL of the manifest file in case of
 136                                  fragmented media:
 137                                    for HLS - URL of the M3U8 master playlist,
 138                                    for HDS - URL of the F4M manifest,
 139                                    for DASH - URL of the MPD manifest,
 140                                    for MSS - URL of the ISM manifest.
 141                     * manifest_stream_number  (For internal use only)
 142                                  The index of the stream in the manifest file
 143                     * ext        Will be calculated from URL if missing
 144                     * format     A human-readable description of the format
 145                                  ("mp4 container with h264/opus").
 146                                  Calculated from the format_id, width, height.
 147                                  and format_note fields if missing.
 148                     * format_id  A short description of the format
 149                                  ("mp4_h264_opus" or "19").
 150                                 Technically optional, but strongly recommended.
 151                     * format_note Additional info about the format
 152                                  ("3D" or "DASH video")
 153                     * width      Width of the video, if known
 154                     * height     Height of the video, if known
 155                     * resolution Textual description of width and height
 156                     * dynamic_range The dynamic range of the video. One of:
 157                                  "SDR" (None), "HDR10", "HDR10+, "HDR12", "HLG, "DV"
 158                     * tbr        Average bitrate of audio and video in KBit/s
 159                     * abr        Average audio bitrate in KBit/s
 160                     * acodec     Name of the audio codec in use
 161                     * asr        Audio sampling rate in Hertz
 162                     * vbr        Average video bitrate in KBit/s
 163                     * fps        Frame rate
 164                     * vcodec     Name of the video codec in use
 165                     * container  Name of the container format
 166                     * filesize   The number of bytes, if known in advance
 167                     * filesize_approx  An estimate for the number of bytes
 168                     * player_url SWF Player URL (used for rtmpdump).
 169                     * protocol   The protocol that will be used for the actual
 170                                  download, lower-case. One of "http", "https" or
 171                                  one of the protocols defined in downloader.PROTOCOL_MAP
 172                     * fragment_base_url
 173                                  Base URL for fragments. Each fragment's path
 174                                  value (if present) will be relative to
 175                                  this URL.
 176                     * fragments  A list of fragments of a fragmented media.
 177                                  Each fragment entry must contain either an url
 178                                  or a path. If an url is present it should be
 179                                  considered by a client. Otherwise both path and
 180                                  fragment_base_url must be present. Here is
 181                                  the list of all potential fields:
 182                                  * "url" - fragment's URL
 183                                  * "path" - fragment's path relative to
 184                                             fragment_base_url
 185                                  * "duration" (optional, int or float)
 186                                  * "filesize" (optional, int)
 187                     * is_from_start  Is a live format that can be downloaded
 188                                 from the start. Boolean
 189                     * preference Order number of this format. If this field is
 190                                  present and not None, the formats get sorted
 191                                  by this field, regardless of all other values.
 192                                  -1 for default (order by other properties),
 193                                  -2 or smaller for less than default.
 194                                  < -1000 to hide the format (if there is
 195                                     another one which is strictly better)
 196                     * language   Language code, e.g. "de" or "en-US".
 197                     * language_preference  Is this in the language mentioned in
 198                                  the URL?
 199                                  10 if it's what the URL is about,
 200                                  -1 for default (don't know),
 201                                  -10 otherwise, other values reserved for now.
 202                     * quality    Order number of the video quality of this
 203                                  format, irrespective of the file format.
 204                                  -1 for default (order by other properties),
 205                                  -2 or smaller for less than default.
 206                     * source_preference  Order number for this video source
 207                                   (quality takes higher priority)
 208                                  -1 for default (order by other properties),
 209                                  -2 or smaller for less than default.
 210                     * http_headers  A dictionary of additional HTTP headers
 211                                  to add to the request.
 212                     * stretched_ratio  If given and not 1, indicates that the
 213                                  video's pixels are not square.
 214                                  width : height ratio as float.
 215                     * no_resume  The server does not support resuming the
 216                                  (HTTP or RTMP) download. Boolean.
 217                     * has_drm    The format has DRM and cannot be downloaded. Boolean
 218                     * downloader_options  A dictionary of downloader options as
 219                                  described in FileDownloader (For internal use only)
 220                     RTMP formats can also have the additional fields: page_url,
 221                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 222                     rtmp_protocol, rtmp_real_time
 223
 224     url:            Final video URL.
 225     ext:            Video filename extension.
 226     format:         The video format, defaults to ext (used for --get-format)
 227     player_url:     SWF Player URL (used for rtmpdump).
 228
 229     The following fields are optional:
 230
 231     direct:         True if a direct video file was given (must only be set by GenericIE)
 232     alt_title:      A secondary title of the video.
 233     display_id      An alternative identifier for the video, not necessarily
 234                     unique, but available before title. Typically, id is
 235                     something like "4234987", title "Dancing naked mole rats",
 236                     and display_id "dancing-naked-mole-rats"
 237     thumbnails:     A list of dictionaries, with the following entries:
 238                         * "id" (optional, string) - Thumbnail format ID
 239                         * "url"
 240                         * "preference" (optional, int) - quality of the image
 241                         * "width" (optional, int)
 242                         * "height" (optional, int)
 243                         * "resolution" (optional, string "{width}x{height}",
 244                                         deprecated)
 245                         * "filesize" (optional, int)
 246                         * "http_headers" (dict) - HTTP headers for the request
 247     thumbnail:      Full URL to a video thumbnail image.
 248     description:    Full video description.
 249     uploader:       Full name of the video uploader.
 250     license:        License name the video is licensed under.
 251     creator:        The creator of the video.
 252     timestamp:      UNIX timestamp of the moment the video was uploaded
 253     upload_date:    Video upload date in UTC (YYYYMMDD).
 254                     If not explicitly set, calculated from timestamp
 255     release_timestamp: UNIX timestamp of the moment the video was released.
 256                     If it is not clear whether to use timestamp or this, use the former
 257     release_date:   The date (YYYYMMDD) when the video was released in UTC.
 258                     If not explicitly set, calculated from release_timestamp
 259     modified_timestamp: UNIX timestamp of the moment the video was last modified.
 260     modified_date:   The date (YYYYMMDD) when the video was last modified in UTC.
 261                     If not explicitly set, calculated from modified_timestamp
 262     uploader_id:    Nickname or id of the video uploader.
 263     uploader_url:   Full URL to a personal webpage of the video uploader.
 264     channel:        Full name of the channel the video is uploaded on.
 265                     Note that channel fields may or may not repeat uploader
 266                     fields. This depends on a particular extractor.
 267     channel_id:     Id of the channel.
 268     channel_url:    Full URL to a channel webpage.
 269     channel_follower_count: Number of followers of the channel.
 270     location:       Physical location where the video was filmed.
 271     subtitles:      The available subtitles as a dictionary in the format
 272                     {tag: subformats}. "tag" is usually a language code, and
 273                     "subformats" is a list sorted from lower to higher
 274                     preference, each element is a dictionary with the "ext"
 275                     entry and one of:
 276                         * "data": The subtitles file contents
 277                         * "url": A URL pointing to the subtitles file
 278                     It can optionally also have:
 279                         * "name": Name or description of the subtitles
 280                         * "http_headers": A dictionary of additional HTTP headers
 281                                   to add to the request.
 282                     "ext" will be calculated from URL if missing
 283     automatic_captions: Like 'subtitles'; contains automatically generated
 284                     captions instead of normal subtitles
 285     duration:       Length of the video in seconds, as an integer or float.
 286     view_count:     How many users have watched the video on the platform.
 287     like_count:     Number of positive ratings of the video
 288     dislike_count:  Number of negative ratings of the video
 289     repost_count:   Number of reposts of the video
 290     average_rating: Average rating give by users, the scale used depends on the webpage
 291     comment_count:  Number of comments on the video
 292     comments:       A list of comments, each with one or more of the following
 293                     properties (all but one of text or html optional):
 294                         * "author" - human-readable name of the comment author
 295                         * "author_id" - user ID of the comment author
 296                         * "author_thumbnail" - The thumbnail of the comment author
 297                         * "id" - Comment ID
 298                         * "html" - Comment as HTML
 299                         * "text" - Plain text of the comment
 300                         * "timestamp" - UNIX timestamp of comment
 301                         * "parent" - ID of the comment this one is replying to.
 302                                      Set to "root" to indicate that this is a
 303                                      comment to the original video.
 304                         * "like_count" - Number of positive ratings of the comment
 305                         * "dislike_count" - Number of negative ratings of the comment
 306                         * "is_favorited" - Whether the comment is marked as
 307                                            favorite by the video uploader
 308                         * "author_is_uploader" - Whether the comment is made by
 309                                                  the video uploader
 310     age_limit:      Age restriction for the video, as an integer (years)
 311     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 312                     should allow to get the same result again. (It will be set
 313                     by YoutubeDL if it's missing)
 314     categories:     A list of categories that the video falls in, for example
 315                     ["Sports", "Berlin"]
 316     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 317     cast:           A list of the video cast
 318     is_live:        True, False, or None (=unknown). Whether this video is a
 319                     live stream that goes on instead of a fixed-length video.
 320     was_live:       True, False, or None (=unknown). Whether this video was
 321                     originally a live stream.
 322     live_status:    'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
 323                     If absent, automatically set from is_live, was_live
 324     start_time:     Time in seconds where the reproduction should start, as
 325                     specified in the URL.
 326     end_time:       Time in seconds where the reproduction should end, as
 327                     specified in the URL.
 328     chapters:       A list of dictionaries, with the following entries:
 329                         * "start_time" - The start time of the chapter in seconds
 330                         * "end_time" - The end time of the chapter in seconds
 331                         * "title" (optional, string)
 332     playable_in_embed: Whether this video is allowed to play in embedded
 333                     players on other sites. Can be True (=always allowed),
 334                     False (=never allowed), None (=unknown), or a string
 335                     specifying the criteria for embedability (Eg: 'whitelist')
 336     availability:   Under what condition the video is available. One of
 337                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 338                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 339                     to set it
 340     __post_extractor: A function to be called just before the metadata is
 341                     written to either disk, logger or console. The function
 342                     must return a dict which will be added to the info_dict.
 343                     This is usefull for additional information that is
 344                     time-consuming to extract. Note that the fields thus
 345                     extracted will not be available to output template and
 346                     match_filter. So, only "comments" and "comment_count" are
 347                     currently allowed to be extracted via this method.
 348
 349     The following fields should only be used when the video belongs to some logical
 350     chapter or section:
 351
 352     chapter:        Name or title of the chapter the video belongs to.
 353     chapter_number: Number of the chapter the video belongs to, as an integer.
 354     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 355
 356     The following fields should only be used when the video is an episode of some
 357     series, programme or podcast:
 358
 359     series:         Title of the series or programme the video episode belongs to.
 360     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 361     season:         Title of the season the video episode belongs to.
 362     season_number:  Number of the season the video episode belongs to, as an integer.
 363     season_id:      Id of the season the video episode belongs to, as a unicode string.
 364     episode:        Title of the video episode. Unlike mandatory video title field,
 365                     this field should denote the exact title of the video episode
 366                     without any kind of decoration.
 367     episode_number: Number of the video episode within a season, as an integer.
 368     episode_id:     Id of the video episode, as a unicode string.
 369
 370     The following fields should only be used when the media is a track or a part of
 371     a music album:
 372
 373     track:          Title of the track.
 374     track_number:   Number of the track within an album or a disc, as an integer.
 375     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 376                     as a unicode string.
 377     artist:         Artist(s) of the track.
 378     genre:          Genre(s) of the track.
 379     album:          Title of the album the track belongs to.
 380     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 381     album_artist:   List of all artists appeared on the album (e.g.
 382                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 383                     and compilations).
 384     disc_number:    Number of the disc or other physical medium the track belongs to,
 385                     as an integer.
 386     release_year:   Year (YYYY) when the album was released.
 387     composer:       Composer of the piece
 388
 389     Unless mentioned otherwise, the fields should be Unicode strings.
 390
 391     Unless mentioned otherwise, None is equivalent to absence of information.
 392
 393
 394     _type "playlist" indicates multiple videos.
 395     There must be a key "entries", which is a list, an iterable, or a PagedList
 396     object, each element of which is a valid dictionary by this specification.
 397
 398     Additionally, playlists can have "id", "title", and any other relevent
 399     attributes with the same semantics as videos (see above).
 400
 401     It can also have the following optional fields:
 402
 403     playlist_count: The total number of videos in a playlist. If not given,
 404                     YoutubeDL tries to calculate it from "entries"
 405
 406
 407     _type "multi_video" indicates that there are multiple videos that
 408     form a single show, for examples multiple acts of an opera or TV episode.
 409     It must have an entries key like a playlist and contain all the keys
 410     required for a video at the same time.
 411
 412
 413     _type "url" indicates that the video must be extracted from another
 414     location, possibly by a different extractor. Its only required key is:
 415     "url" - the next URL to extract.
 416     The key "ie_key" can be set to the class name (minus the trailing "IE",
 417     e.g. "Youtube") if the extractor class is known in advance.
 418     Additionally, the dictionary may have any properties of the resolved entity
 419     known in advance, for example "title" if the title of the referred video is
 420     known ahead of time.
 421
 422
 423     _type "url_transparent" entities have the same specification as "url", but
 424     indicate that the given additional information is more precise than the one
 425     associated with the resolved URL.
 426     This is useful when a site employs a video service that hosts the video and
 427     its technical metadata, but that video service does not embed a useful
 428     title, description etc.
 429
 430
 431     Subclasses of this should define a _VALID_URL regexp and, re-define the
 432     _real_extract() and (optionally) _real_initialize() methods.
 433     Probably, they should also be added to the list of extractors.
 434
 435     Subclasses may also override suitable() if necessary, but ensure the function
 436     signature is preserved and that this function imports everything it needs
 437     (except other extractors), so that lazy_extractors works correctly.
 438
 439     To support username + password (or netrc) login, the extractor must define a
 440     _NETRC_MACHINE and re-define _perform_login(username, password) and
 441     (optionally) _initialize_pre_login() methods. The _perform_login method will
 442     be called between _initialize_pre_login and _real_initialize if credentials
 443     are passed by the user. In cases where it is necessary to have the login
 444     process as part of the extraction rather than initialization, _perform_login
 445     can be left undefined.
 446
 447     _GEO_BYPASS attribute may be set to False in order to disable
 448     geo restriction bypass mechanisms for a particular extractor.
 449     Though it won't disable explicit geo restriction bypass based on
 450     country code provided with geo_bypass_country.
 451
 452     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 453     countries for this extractor. One of these countries will be used by
 454     geo restriction bypass mechanism right away in order to bypass
 455     geo restriction, of course, if the mechanism is not disabled.
 456
 457     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 458     IP blocks in CIDR notation for this extractor. One of these IP blocks
 459     will be used by geo restriction bypass mechanism similarly
 460     to _GEO_COUNTRIES.
 461
 462     The _WORKING attribute should be set to False for broken IEs
 463     in order to warn the users and skip the tests.
 464     """
 465
 466     _ready = False
 467     _downloader = None
 468     _x_forwarded_for_ip = None
 469     _GEO_BYPASS = True
 470     _GEO_COUNTRIES = None
 471     _GEO_IP_BLOCKS = None
 472     _WORKING = True
 473     _NETRC_MACHINE = None
 474     IE_DESC = None
 475
 476     _LOGIN_HINTS = {
 477         'any': 'Use --cookies, --cookies-from-browser, --username and --password, or --netrc to provide account credentials',
 478         'cookies': (
 479             'Use --cookies-from-browser or --cookies for the authentication. '
 480             'See  https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl  for how to manually pass cookies'),
 481         'password': 'Use --username and --password, or --netrc to provide account credentials',
 482     }
 483
 484     def __init__(self, downloader=None):
 485         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 486         If a downloader is not passed during initialization,
 487         it must be set using "set_downloader()" before "extract()" is called"""
 488         self._ready = False
 489         self._x_forwarded_for_ip = None
 490         self._printed_messages = set()
 491         self.set_downloader(downloader)
 492
 493     @classmethod
 494     def _match_valid_url(cls, url):
 495         # This does not use has/getattr intentionally - we want to know whether
 496         # we have cached the regexp for *this* class, whereas getattr would also
 497         # match the superclass
 498         if '_VALID_URL_RE' not in cls.__dict__:
 499             if '_VALID_URL' not in cls.__dict__:
 500                 cls._VALID_URL = cls._make_valid_url()
 501             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 502         return cls._VALID_URL_RE.match(url)
 503
 504     @classmethod
 505     def suitable(cls, url):
 506         """Receives a URL and returns True if suitable for this IE."""
 507         # This function must import everything it needs (except other extractors),
 508         # so that lazy_extractors works correctly
 509         return cls._match_valid_url(url) is not None
 510
 511     @classmethod
 512     def _match_id(cls, url):
 513         return cls._match_valid_url(url).group('id')
 514
 515     @classmethod
 516     def get_temp_id(cls, url):
 517         try:
 518             return cls._match_id(url)
 519         except (IndexError, AttributeError):
 520             return None
 521
 522     @classmethod
 523     def working(cls):
 524         """Getter method for _WORKING."""
 525         return cls._WORKING
 526
 527     @classmethod
 528     def supports_login(cls):
 529         return bool(cls._NETRC_MACHINE)
 530
 531     def initialize(self):
 532         """Initializes an instance (authentication, etc)."""
 533         self._printed_messages = set()
 534         self._initialize_geo_bypass({
 535             'countries': self._GEO_COUNTRIES,
 536             'ip_blocks': self._GEO_IP_BLOCKS,
 537         })
 538         if not self._ready:
 539             self._initialize_pre_login()
 540             if self.supports_login():
 541                 username, password = self._get_login_info()
 542                 if username:
 543                     self._perform_login(username, password)
 544             elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
 545                 self.report_warning(f'Login with password is not supported for this website. {self._LOGIN_HINTS["cookies"]}')
 546             self._real_initialize()
 547             self._ready = True
 548
 549     def _initialize_geo_bypass(self, geo_bypass_context):
 550         """
 551         Initialize geo restriction bypass mechanism.
 552
 553         This method is used to initialize geo bypass mechanism based on faking
 554         X-Forwarded-For HTTP header. A random country from provided country list
 555         is selected and a random IP belonging to this country is generated. This
 556         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 557         HTTP requests.
 558
 559         This method will be used for initial geo bypass mechanism initialization
 560         during the instance initialization with _GEO_COUNTRIES and
 561         _GEO_IP_BLOCKS.
 562
 563         You may also manually call it from extractor's code if geo bypass
 564         information is not available beforehand (e.g. obtained during
 565         extraction) or due to some other reason. In this case you should pass
 566         this information in geo bypass context passed as first argument. It may
 567         contain following fields:
 568
 569         countries:  List of geo unrestricted countries (similar
 570                     to _GEO_COUNTRIES)
 571         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 572                     (similar to _GEO_IP_BLOCKS)
 573
 574         """
 575         if not self._x_forwarded_for_ip:
 576
 577             # Geo bypass mechanism is explicitly disabled by user
 578             if not self.get_param('geo_bypass', True):
 579                 return
 580
 581             if not geo_bypass_context:
 582                 geo_bypass_context = {}
 583
 584             # Backward compatibility: previously _initialize_geo_bypass
 585             # expected a list of countries, some 3rd party code may still use
 586             # it this way
 587             if isinstance(geo_bypass_context, (list, tuple)):
 588                 geo_bypass_context = {
 589                     'countries': geo_bypass_context,
 590                 }
 591
 592             # The whole point of geo bypass mechanism is to fake IP
 593             # as X-Forwarded-For HTTP header based on some IP block or
 594             # country code.
 595
 596             # Path 1: bypassing based on IP block in CIDR notation
 597
 598             # Explicit IP block specified by user, use it right away
 599             # regardless of whether extractor is geo bypassable or not
 600             ip_block = self.get_param('geo_bypass_ip_block', None)
 601
 602             # Otherwise use random IP block from geo bypass context but only
 603             # if extractor is known as geo bypassable
 604             if not ip_block:
 605                 ip_blocks = geo_bypass_context.get('ip_blocks')
 606                 if self._GEO_BYPASS and ip_blocks:
 607                     ip_block = random.choice(ip_blocks)
 608
 609             if ip_block:
 610                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 611                 self._downloader.write_debug(
 612                     '[debug] Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
 613                 return
 614
 615             # Path 2: bypassing based on country code
 616
 617             # Explicit country code specified by user, use it right away
 618             # regardless of whether extractor is geo bypassable or not
 619             country = self.get_param('geo_bypass_country', None)
 620
 621             # Otherwise use random country code from geo bypass context but
 622             # only if extractor is known as geo bypassable
 623             if not country:
 624                 countries = geo_bypass_context.get('countries')
 625                 if self._GEO_BYPASS and countries:
 626                     country = random.choice(countries)
 627
 628             if country:
 629                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 630                 self._downloader.write_debug(
 631                     'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))
 632
 633     def extract(self, url):
 634         """Extracts URL information and returns it in list of dicts."""
 635         try:
 636             for _ in range(2):
 637                 try:
 638                     self.initialize()
 639                     self.write_debug('Extracting URL: %s' % url)
 640                     ie_result = self._real_extract(url)
 641                     if ie_result is None:
 642                         return None
 643                     if self._x_forwarded_for_ip:
 644                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 645                     subtitles = ie_result.get('subtitles')
 646                     if (subtitles and 'live_chat' in subtitles
 647                             and 'no-live-chat' in self.get_param('compat_opts', [])):
 648                         del subtitles['live_chat']
 649                     return ie_result
 650                 except GeoRestrictedError as e:
 651                     if self.__maybe_fake_ip_and_retry(e.countries):
 652                         continue
 653                     raise
 654         except UnsupportedError:
 655             raise
 656         except ExtractorError as e:
 657             kwargs = {
 658                 'video_id': e.video_id or self.get_temp_id(url),
 659                 'ie': self.IE_NAME,
 660                 'tb': e.traceback or sys.exc_info()[2],
 661                 'expected': e.expected,
 662                 'cause': e.cause
 663             }
 664             if hasattr(e, 'countries'):
 665                 kwargs['countries'] = e.countries
 666             raise type(e)(e.orig_msg, **kwargs)
 667         except compat_http_client.IncompleteRead as e:
 668             raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
 669         except (KeyError, StopIteration) as e:
 670             raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 671
 672     def __maybe_fake_ip_and_retry(self, countries):
 673         if (not self.get_param('geo_bypass_country', None)
 674                 and self._GEO_BYPASS
 675                 and self.get_param('geo_bypass', True)
 676                 and not self._x_forwarded_for_ip
 677                 and countries):
 678             country_code = random.choice(countries)
 679             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 680             if self._x_forwarded_for_ip:
 681                 self.report_warning(
 682                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 683                     % (self._x_forwarded_for_ip, country_code.upper()))
 684                 return True
 685         return False
 686
 687     def set_downloader(self, downloader):
 688         """Sets a YoutubeDL instance as the downloader for this IE."""
 689         self._downloader = downloader
 690
 691     def _initialize_pre_login(self):
 692         """ Intialization before login. Redefine in subclasses."""
 693         pass
 694
 695     def _perform_login(self, username, password):
 696         """ Login with username and password. Redefine in subclasses."""
 697         pass
 698
 699     def _real_initialize(self):
 700         """Real initialization process. Redefine in subclasses."""
 701         pass
 702
 703     def _real_extract(self, url):
 704         """Real extraction process. Redefine in subclasses."""
 705         raise NotImplementedError('This method must be implemented by subclasses')
 706
 707     @classmethod
 708     def ie_key(cls):
 709         """A string for getting the InfoExtractor with get_info_extractor"""
 710         return cls.__name__[:-2]
 711
 712     @property
 713     def IE_NAME(self):
 714         return compat_str(type(self).__name__[:-2])
 715
 716     @staticmethod
 717     def __can_accept_status_code(err, expected_status):
 718         assert isinstance(err, compat_urllib_error.HTTPError)
 719         if expected_status is None:
 720             return False
 721         elif callable(expected_status):
 722             return expected_status(err.code) is True
 723         else:
 724             return err.code in variadic(expected_status)
 725
 726     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
 727         """
 728         Return the response handle.
 729
 730         See _download_webpage docstring for arguments specification.
 731         """
 732         if not self._downloader._first_webpage_request:
 733             sleep_interval = self.get_param('sleep_interval_requests') or 0
 734             if sleep_interval > 0:
 735                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 736                 time.sleep(sleep_interval)
 737         else:
 738             self._downloader._first_webpage_request = False
 739
 740         if note is None:
 741             self.report_download_webpage(video_id)
 742         elif note is not False:
 743             if video_id is None:
 744                 self.to_screen('%s' % (note,))
 745             else:
 746                 self.to_screen('%s: %s' % (video_id, note))
 747
 748         # Some sites check X-Forwarded-For HTTP header in order to figure out
 749         # the origin of the client behind proxy. This allows bypassing geo
 750         # restriction by faking this header's value to IP that belongs to some
 751         # geo unrestricted country. We will do so once we encounter any
 752         # geo restriction error.
 753         if self._x_forwarded_for_ip:
 754             if 'X-Forwarded-For' not in headers:
 755                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 756
 757         if isinstance(url_or_request, compat_urllib_request.Request):
 758             url_or_request = update_Request(
 759                 url_or_request, data=data, headers=headers, query=query)
 760         else:
 761             if query:
 762                 url_or_request = update_url_query(url_or_request, query)
 763             if data is not None or headers:
 764                 url_or_request = sanitized_Request(url_or_request, data, headers)
 765         try:
 766             return self._downloader.urlopen(url_or_request)
 767         except network_exceptions as err:
 768             if isinstance(err, compat_urllib_error.HTTPError):
 769                 if self.__can_accept_status_code(err, expected_status):
 770                     # Retain reference to error to prevent file object from
 771                     # being closed before it can be read. Works around the
 772                     # effects of <https://bugs.python.org/issue15002>
 773                     # introduced in Python 3.4.1.
 774                     err.fp._error = err
 775                     return err.fp
 776
 777             if errnote is False:
 778                 return False
 779             if errnote is None:
 780                 errnote = 'Unable to download webpage'
 781
 782             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 783             if fatal:
 784                 raise ExtractorError(errmsg, cause=err)
 785             else:
 786                 self.report_warning(errmsg)
 787                 return False
 788
 789     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
 790         """
 791         Return a tuple (page content as string, URL handle).
 792
 793         See _download_webpage docstring for arguments specification.
 794         """
 795         # Strip hashes from the URL (#1038)
 796         if isinstance(url_or_request, (compat_str, str)):
 797             url_or_request = url_or_request.partition('#')[0]
 798
 799         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 800         if urlh is False:
 801             assert not fatal
 802             return False
 803         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 804         return (content, urlh)
 805
 806     @staticmethod
 807     def _guess_encoding_from_content(content_type, webpage_bytes):
 808         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 809         if m:
 810             encoding = m.group(1)
 811         else:
 812             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 813                           webpage_bytes[:1024])
 814             if m:
 815                 encoding = m.group(1).decode('ascii')
 816             elif webpage_bytes.startswith(b'\xff\xfe'):
 817                 encoding = 'utf-16'
 818             else:
 819                 encoding = 'utf-8'
 820
 821         return encoding
 822
 823     def __check_blocked(self, content):
 824         first_block = content[:512]
 825         if ('<title>Access to this site is blocked</title>' in content
 826                 and 'Websense' in first_block):
 827             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 828             blocked_iframe = self._html_search_regex(
 829                 r'<iframe src="([^"]+)"', content,
 830                 'Websense information URL', default=None)
 831             if blocked_iframe:
 832                 msg += ' Visit %s for more details' % blocked_iframe
 833             raise ExtractorError(msg, expected=True)
 834         if '<title>The URL you requested has been blocked</title>' in first_block:
 835             msg = (
 836                 'Access to this webpage has been blocked by Indian censorship. '
 837                 'Use a VPN or proxy server (with --proxy) to route around it.')
 838             block_msg = self._html_search_regex(
 839                 r'</h1><p>(.*?)</p>',
 840                 content, 'block message', default=None)
 841             if block_msg:
 842                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 843             raise ExtractorError(msg, expected=True)
 844         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 845                 and 'blocklist.rkn.gov.ru' in content):
 846             raise ExtractorError(
 847                 'Access to this webpage has been blocked by decision of the Russian government. '
 848                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 849                 expected=True)
 850
 851     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 852         content_type = urlh.headers.get('Content-Type', '')
 853         webpage_bytes = urlh.read()
 854         if prefix is not None:
 855             webpage_bytes = prefix + webpage_bytes
 856         if not encoding:
 857             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 858         if self.get_param('dump_intermediate_pages', False):
 859             self.to_screen('Dumping request to ' + urlh.geturl())
 860             dump = base64.b64encode(webpage_bytes).decode('ascii')
 861             self._downloader.to_screen(dump)
 862         if self.get_param('write_pages', False):
 863             basen = '%s_%s' % (video_id, urlh.geturl())
 864             trim_length = self.get_param('trim_file_name') or 240
 865             if len(basen) > trim_length:
 866                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 867                 basen = basen[:trim_length - len(h)] + h
 868             raw_filename = basen + '.dump'
 869             filename = sanitize_filename(raw_filename, restricted=True)
 870             self.to_screen('Saving request to ' + filename)
 871             # Working around MAX_PATH limitation on Windows (see
 872             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 873             if compat_os_name == 'nt':
 874                 absfilepath = os.path.abspath(filename)
 875                 if len(absfilepath) > 259:
 876                     filename = '\\\\?\\' + absfilepath
 877             with open(filename, 'wb') as outf:
 878                 outf.write(webpage_bytes)
 879
 880         try:
 881             content = webpage_bytes.decode(encoding, 'replace')
 882         except LookupError:
 883             content = webpage_bytes.decode('utf-8', 'replace')
 884
 885         self.__check_blocked(content)
 886
 887         return content
 888
 889     def _download_webpage(
 890             self, url_or_request, video_id, note=None, errnote=None,
 891             fatal=True, tries=1, timeout=5, encoding=None, data=None,
 892             headers={}, query={}, expected_status=None):
 893         """
 894         Return the data of the page as a string.
 895
 896         Arguments:
 897         url_or_request -- plain text URL as a string or
 898             a compat_urllib_request.Requestobject
 899         video_id -- Video/playlist/item identifier (string)
 900
 901         Keyword arguments:
 902         note -- note printed before downloading (string)
 903         errnote -- note printed in case of an error (string)
 904         fatal -- flag denoting whether error should be considered fatal,
 905             i.e. whether it should cause ExtractionError to be raised,
 906             otherwise a warning will be reported and extraction continued
 907         tries -- number of tries
 908         timeout -- sleep interval between tries
 909         encoding -- encoding for a page content decoding, guessed automatically
 910             when not explicitly specified
 911         data -- POST data (bytes)
 912         headers -- HTTP headers (dict)
 913         query -- URL query (dict)
 914         expected_status -- allows to accept failed HTTP requests (non 2xx
 915             status code) by explicitly specifying a set of accepted status
 916             codes. Can be any of the following entities:
 917                 - an integer type specifying an exact failed status code to
 918                   accept
 919                 - a list or a tuple of integer types specifying a list of
 920                   failed status codes to accept
 921                 - a callable accepting an actual failed status code and
 922                   returning True if it should be accepted
 923             Note that this argument does not affect success status codes (2xx)
 924             which are always accepted.
 925         """
 926
 927         success = False
 928         try_count = 0
 929         while success is False:
 930             try:
 931                 res = self._download_webpage_handle(
 932                     url_or_request, video_id, note, errnote, fatal,
 933                     encoding=encoding, data=data, headers=headers, query=query,
 934                     expected_status=expected_status)
 935                 success = True
 936             except compat_http_client.IncompleteRead as e:
 937                 try_count += 1
 938                 if try_count >= tries:
 939                     raise e
 940                 self._sleep(timeout, video_id)
 941         if res is False:
 942             return res
 943         else:
 944             content, _ = res
 945             return content
 946
 947     def _download_xml_handle(
 948             self, url_or_request, video_id, note='Downloading XML',
 949             errnote='Unable to download XML', transform_source=None,
 950             fatal=True, encoding=None, data=None, headers={}, query={},
 951             expected_status=None):
 952         """
 953         Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle).
 954
 955         See _download_webpage docstring for arguments specification.
 956         """
 957         res = self._download_webpage_handle(
 958             url_or_request, video_id, note, errnote, fatal=fatal,
 959             encoding=encoding, data=data, headers=headers, query=query,
 960             expected_status=expected_status)
 961         if res is False:
 962             return res
 963         xml_string, urlh = res
 964         return self._parse_xml(
 965             xml_string, video_id, transform_source=transform_source,
 966             fatal=fatal), urlh
 967
 968     def _download_xml(
 969             self, url_or_request, video_id,
 970             note='Downloading XML', errnote='Unable to download XML',
 971             transform_source=None, fatal=True, encoding=None,
 972             data=None, headers={}, query={}, expected_status=None):
 973         """
 974         Return the xml as an xml.etree.ElementTree.Element.
 975
 976         See _download_webpage docstring for arguments specification.
 977         """
 978         res = self._download_xml_handle(
 979             url_or_request, video_id, note=note, errnote=errnote,
 980             transform_source=transform_source, fatal=fatal, encoding=encoding,
 981             data=data, headers=headers, query=query,
 982             expected_status=expected_status)
 983         return res if res is False else res[0]
 984
 985     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 986         if transform_source:
 987             xml_string = transform_source(xml_string)
 988         try:
 989             return compat_etree_fromstring(xml_string.encode('utf-8'))
 990         except xml.etree.ElementTree.ParseError as ve:
 991             errmsg = '%s: Failed to parse XML ' % video_id
 992             if fatal:
 993                 raise ExtractorError(errmsg, cause=ve)
 994             else:
 995                 self.report_warning(errmsg + str(ve))
 996
 997     def _download_json_handle(
 998             self, url_or_request, video_id, note='Downloading JSON metadata',
 999             errnote='Unable to download JSON metadata', transform_source=None,
1000             fatal=True, encoding=None, data=None, headers={}, query={},
1001             expected_status=None):
1002         """
1003         Return a tuple (JSON object, URL handle).
1004
1005         See _download_webpage docstring for arguments specification.
1006         """
1007         res = self._download_webpage_handle(
1008             url_or_request, video_id, note, errnote, fatal=fatal,
1009             encoding=encoding, data=data, headers=headers, query=query,
1010             expected_status=expected_status)
1011         if res is False:
1012             return res
1013         json_string, urlh = res
1014         return self._parse_json(
1015             json_string, video_id, transform_source=transform_source,
1016             fatal=fatal), urlh
1017
1018     def _download_json(
1019             self, url_or_request, video_id, note='Downloading JSON metadata',
1020             errnote='Unable to download JSON metadata', transform_source=None,
1021             fatal=True, encoding=None, data=None, headers={}, query={},
1022             expected_status=None):
1023         """
1024         Return the JSON object as a dict.
1025
1026         See _download_webpage docstring for arguments specification.
1027         """
1028         res = self._download_json_handle(
1029             url_or_request, video_id, note=note, errnote=errnote,
1030             transform_source=transform_source, fatal=fatal, encoding=encoding,
1031             data=data, headers=headers, query=query,
1032             expected_status=expected_status)
1033         return res if res is False else res[0]
1034
1035     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
1036         if transform_source:
1037             json_string = transform_source(json_string)
1038         try:
1039             return json.loads(json_string, strict=False)
1040         except ValueError as ve:
1041             errmsg = '%s: Failed to parse JSON ' % video_id
1042             if fatal:
1043                 raise ExtractorError(errmsg, cause=ve)
1044             else:
1045                 self.report_warning(errmsg + str(ve))
1046
1047     def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
1048         return self._parse_json(
1049             data[data.find('{'):data.rfind('}') + 1],
1050             video_id, transform_source, fatal)
1051
1052     def _download_socket_json_handle(
1053             self, url_or_request, video_id, note='Polling socket',
1054             errnote='Unable to poll socket', transform_source=None,
1055             fatal=True, encoding=None, data=None, headers={}, query={},
1056             expected_status=None):
1057         """
1058         Return a tuple (JSON object, URL handle).
1059
1060         See _download_webpage docstring for arguments specification.
1061         """
1062         res = self._download_webpage_handle(
1063             url_or_request, video_id, note, errnote, fatal=fatal,
1064             encoding=encoding, data=data, headers=headers, query=query,
1065             expected_status=expected_status)
1066         if res is False:
1067             return res
1068         webpage, urlh = res
1069         return self._parse_socket_response_as_json(
1070             webpage, video_id, transform_source=transform_source,
1071             fatal=fatal), urlh
1072
1073     def _download_socket_json(
1074             self, url_or_request, video_id, note='Polling socket',
1075             errnote='Unable to poll socket', transform_source=None,
1076             fatal=True, encoding=None, data=None, headers={}, query={},
1077             expected_status=None):
1078         """
1079         Return the JSON object as a dict.
1080
1081         See _download_webpage docstring for arguments specification.
1082         """
1083         res = self._download_socket_json_handle(
1084             url_or_request, video_id, note=note, errnote=errnote,
1085             transform_source=transform_source, fatal=fatal, encoding=encoding,
1086             data=data, headers=headers, query=query,
1087             expected_status=expected_status)
1088         return res if res is False else res[0]
1089
1090     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1091         idstr = format_field(video_id, template='%s: ')
1092         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1093         if only_once:
1094             if f'WARNING: {msg}' in self._printed_messages:
1095                 return
1096             self._printed_messages.add(f'WARNING: {msg}')
1097         self._downloader.report_warning(msg, *args, **kwargs)
1098
1099     def to_screen(self, msg, *args, **kwargs):
1100         """Print msg to screen, prefixing it with '[ie_name]'"""
1101         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1102
1103     def write_debug(self, msg, *args, **kwargs):
1104         self._downloader.write_debug('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1105
1106     def get_param(self, name, default=None, *args, **kwargs):
1107         if self._downloader:
1108             return self._downloader.params.get(name, default, *args, **kwargs)
1109         return default
1110
1111     def report_drm(self, video_id, partial=False):
1112         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1113
1114     def report_extraction(self, id_or_name):
1115         """Report information extraction."""
1116         self.to_screen('%s: Extracting information' % id_or_name)
1117
1118     def report_download_webpage(self, video_id):
1119         """Report webpage download."""
1120         self.to_screen('%s: Downloading webpage' % video_id)
1121
1122     def report_age_confirmation(self):
1123         """Report attempt to confirm age."""
1124         self.to_screen('Confirming age')
1125
1126     def report_login(self):
1127         """Report attempt to log in."""
1128         self.to_screen('Logging in')
1129
1130     def raise_login_required(
1131             self, msg='This video is only available for registered users',
1132             metadata_available=False, method=NO_DEFAULT):
1133         if metadata_available and (
1134                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1135             self.report_warning(msg)
1136             return
1137         if method is NO_DEFAULT:
1138             method = 'any' if self.supports_login() else 'cookies'
1139         if method is not None:
1140             assert method in self._LOGIN_HINTS, 'Invalid login method'
1141             msg = '%s. %s' % (msg, self._LOGIN_HINTS[method])
1142         raise ExtractorError(msg, expected=True)
1143
1144     def raise_geo_restricted(
1145             self, msg='This video is not available from your location due to geo restriction',
1146             countries=None, metadata_available=False):
1147         if metadata_available and (
1148                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1149             self.report_warning(msg)
1150         else:
1151             raise GeoRestrictedError(msg, countries=countries)
1152
1153     def raise_no_formats(self, msg, expected=False, video_id=None):
1154         if expected and (
1155                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1156             self.report_warning(msg, video_id)
1157         elif isinstance(msg, ExtractorError):
1158             raise msg
1159         else:
1160             raise ExtractorError(msg, expected=expected, video_id=video_id)
1161
1162     # Methods for following #608
1163     @staticmethod
1164     def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1165         """Returns a URL that points to a page that should be processed"""
1166         if ie is not None:
1167             kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1168         if video_id is not None:
1169             kwargs['id'] = video_id
1170         if video_title is not None:
1171             kwargs['title'] = video_title
1172         return {
1173             **kwargs,
1174             '_type': 'url_transparent' if url_transparent else 'url',
1175             'url': url,
1176         }
1177
1178     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None, video_kwargs=None, **kwargs):
1179         urls = (self.url_result(self._proto_relative_url(m), ie, **(video_kwargs or {}))
1180                 for m in orderedSet(map(getter, matches) if getter else matches))
1181         return self.playlist_result(urls, playlist_id, playlist_title, **kwargs)
1182
1183     @staticmethod
1184     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1185         """Returns a playlist"""
1186         if playlist_id:
1187             kwargs['id'] = playlist_id
1188         if playlist_title:
1189             kwargs['title'] = playlist_title
1190         if playlist_description is not None:
1191             kwargs['description'] = playlist_description
1192         return {
1193             **kwargs,
1194             '_type': 'multi_video' if multi_video else 'playlist',
1195             'entries': entries,
1196         }
1197
1198     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1199         """
1200         Perform a regex search on the given string, using a single or a list of
1201         patterns returning the first matching group.
1202         In case of failure return a default value or raise a WARNING or a
1203         RegexNotFoundError, depending on fatal, specifying the field name.
1204         """
1205         if string is None:
1206             mobj = None
1207         elif isinstance(pattern, (str, compat_Pattern)):
1208             mobj = re.search(pattern, string, flags)
1209         else:
1210             for p in pattern:
1211                 mobj = re.search(p, string, flags)
1212                 if mobj:
1213                     break
1214
1215         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1216
1217         if mobj:
1218             if group is None:
1219                 # return the first matching group
1220                 return next(g for g in mobj.groups() if g is not None)
1221             elif isinstance(group, (list, tuple)):
1222                 return tuple(mobj.group(g) for g in group)
1223             else:
1224                 return mobj.group(group)
1225         elif default is not NO_DEFAULT:
1226             return default
1227         elif fatal:
1228             raise RegexNotFoundError('Unable to extract %s' % _name)
1229         else:
1230             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1231             return None
1232
1233     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1234         """
1235         Like _search_regex, but strips HTML tags and unescapes entities.
1236         """
1237         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1238         if res:
1239             return clean_html(res).strip()
1240         else:
1241             return res
1242
1243     def _get_netrc_login_info(self, netrc_machine=None):
1244         username = None
1245         password = None
1246         netrc_machine = netrc_machine or self._NETRC_MACHINE
1247
1248         if self.get_param('usenetrc', False):
1249             try:
1250                 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1251                 if os.path.isdir(netrc_file):
1252                     netrc_file = os.path.join(netrc_file, '.netrc')
1253                 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1254                 if info is not None:
1255                     username = info[0]
1256                     password = info[2]
1257                 else:
1258                     raise netrc.NetrcParseError(
1259                         'No authenticators for %s' % netrc_machine)
1260             except (IOError, netrc.NetrcParseError) as err:
1261                 self.report_warning(
1262                     'parsing .netrc: %s' % error_to_compat_str(err))
1263
1264         return username, password
1265
1266     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1267         """
1268         Get the login info as (username, password)
1269         First look for the manually specified credentials using username_option
1270         and password_option as keys in params dictionary. If no such credentials
1271         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1272         value.
1273         If there's no info available, return (None, None)
1274         """
1275
1276         # Attempt to use provided username and password or .netrc data
1277         username = self.get_param(username_option)
1278         if username is not None:
1279             password = self.get_param(password_option)
1280         else:
1281             username, password = self._get_netrc_login_info(netrc_machine)
1282
1283         return username, password
1284
1285     def _get_tfa_info(self, note='two-factor verification code'):
1286         """
1287         Get the two-factor authentication info
1288         TODO - asking the user will be required for sms/phone verify
1289         currently just uses the command line option
1290         If there's no info available, return None
1291         """
1292
1293         tfa = self.get_param('twofactor')
1294         if tfa is not None:
1295             return tfa
1296
1297         return compat_getpass('Type %s and press [Return]: ' % note)
1298
1299     # Helper functions for extracting OpenGraph info
1300     @staticmethod
1301     def _og_regexes(prop):
1302         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1303         property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1304                        % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
1305         template = r'<meta[^>]+?%s[^>]+?%s'
1306         return [
1307             template % (property_re, content_re),
1308             template % (content_re, property_re),
1309         ]
1310
1311     @staticmethod
1312     def _meta_regex(prop):
1313         return r'''(?isx)<meta
1314                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1315                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1316
1317     def _og_search_property(self, prop, html, name=None, **kargs):
1318         prop = variadic(prop)
1319         if name is None:
1320             name = 'OpenGraph %s' % prop[0]
1321         og_regexes = []
1322         for p in prop:
1323             og_regexes.extend(self._og_regexes(p))
1324         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1325         if escaped is None:
1326             return None
1327         return unescapeHTML(escaped)
1328
1329     def _og_search_thumbnail(self, html, **kargs):
1330         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1331
1332     def _og_search_description(self, html, **kargs):
1333         return self._og_search_property('description', html, fatal=False, **kargs)
1334
1335     def _og_search_title(self, html, *, fatal=False, **kargs):
1336         return self._og_search_property('title', html, fatal=fatal, **kargs)
1337
1338     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1339         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1340         if secure:
1341             regexes = self._og_regexes('video:secure_url') + regexes
1342         return self._html_search_regex(regexes, html, name, **kargs)
1343
1344     def _og_search_url(self, html, **kargs):
1345         return self._og_search_property('url', html, **kargs)
1346
1347     def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1348         return self._html_search_regex(r'(?s)<title>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1349
1350     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1351         name = variadic(name)
1352         if display_name is None:
1353             display_name = name[0]
1354         return self._html_search_regex(
1355             [self._meta_regex(n) for n in name],
1356             html, display_name, fatal=fatal, group='content', **kwargs)
1357
1358     def _dc_search_uploader(self, html):
1359         return self._html_search_meta('dc.creator', html, 'uploader')
1360
1361     def _rta_search(self, html):
1362         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1363         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1364                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1365                      html):
1366             return 18
1367         return 0
1368
1369     def _media_rating_search(self, html):
1370         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1371         rating = self._html_search_meta('rating', html)
1372
1373         if not rating:
1374             return None
1375
1376         RATING_TABLE = {
1377             'safe for kids': 0,
1378             'general': 8,
1379             '14 years': 14,
1380             'mature': 17,
1381             'restricted': 19,
1382         }
1383         return RATING_TABLE.get(rating.lower())
1384
1385     def _family_friendly_search(self, html):
1386         # See http://schema.org/VideoObject
1387         family_friendly = self._html_search_meta(
1388             'isFamilyFriendly', html, default=None)
1389
1390         if not family_friendly:
1391             return None
1392
1393         RATING_TABLE = {
1394             '1': 0,
1395             'true': 0,
1396             '0': 18,
1397             'false': 18,
1398         }
1399         return RATING_TABLE.get(family_friendly.lower())
1400
1401     def _twitter_search_player(self, html):
1402         return self._html_search_meta('twitter:player', html,
1403                                       'twitter card player')
1404
1405     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1406         json_ld_list = list(re.finditer(JSON_LD_RE, html))
1407         default = kwargs.get('default', NO_DEFAULT)
1408         # JSON-LD may be malformed and thus `fatal` should be respected.
1409         # At the same time `default` may be passed that assumes `fatal=False`
1410         # for _search_regex. Let's simulate the same behavior here as well.
1411         fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
1412         json_ld = []
1413         for mobj in json_ld_list:
1414             json_ld_item = self._parse_json(
1415                 mobj.group('json_ld'), video_id, fatal=fatal)
1416             if not json_ld_item:
1417                 continue
1418             if isinstance(json_ld_item, dict):
1419                 json_ld.append(json_ld_item)
1420             elif isinstance(json_ld_item, (list, tuple)):
1421                 json_ld.extend(json_ld_item)
1422         if json_ld:
1423             json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1424         if json_ld:
1425             return json_ld
1426         if default is not NO_DEFAULT:
1427             return default
1428         elif fatal:
1429             raise RegexNotFoundError('Unable to extract JSON-LD')
1430         else:
1431             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1432             return {}
1433
1434     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1435         if isinstance(json_ld, compat_str):
1436             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1437         if not json_ld:
1438             return {}
1439         info = {}
1440         if not isinstance(json_ld, (list, tuple, dict)):
1441             return info
1442         if isinstance(json_ld, dict):
1443             json_ld = [json_ld]
1444
1445         INTERACTION_TYPE_MAP = {
1446             'CommentAction': 'comment',
1447             'AgreeAction': 'like',
1448             'DisagreeAction': 'dislike',
1449             'LikeAction': 'like',
1450             'DislikeAction': 'dislike',
1451             'ListenAction': 'view',
1452             'WatchAction': 'view',
1453             'ViewAction': 'view',
1454         }
1455
1456         def extract_interaction_type(e):
1457             interaction_type = e.get('interactionType')
1458             if isinstance(interaction_type, dict):
1459                 interaction_type = interaction_type.get('@type')
1460             return str_or_none(interaction_type)
1461
1462         def extract_interaction_statistic(e):
1463             interaction_statistic = e.get('interactionStatistic')
1464             if isinstance(interaction_statistic, dict):
1465                 interaction_statistic = [interaction_statistic]
1466             if not isinstance(interaction_statistic, list):
1467                 return
1468             for is_e in interaction_statistic:
1469                 if not isinstance(is_e, dict):
1470                     continue
1471                 if is_e.get('@type') != 'InteractionCounter':
1472                     continue
1473                 interaction_type = extract_interaction_type(is_e)
1474                 if not interaction_type:
1475                     continue
1476                 # For interaction count some sites provide string instead of
1477                 # an integer (as per spec) with non digit characters (e.g. ",")
1478                 # so extracting count with more relaxed str_to_int
1479                 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1480                 if interaction_count is None:
1481                     continue
1482                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1483                 if not count_kind:
1484                     continue
1485                 count_key = '%s_count' % count_kind
1486                 if info.get(count_key) is not None:
1487                     continue
1488                 info[count_key] = interaction_count
1489
1490         def extract_chapter_information(e):
1491             chapters = [{
1492                 'title': part.get('name'),
1493                 'start_time': part.get('startOffset'),
1494                 'end_time': part.get('endOffset'),
1495             } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
1496             for idx, (last_c, current_c, next_c) in enumerate(zip(
1497                     [{'end_time': 0}] + chapters, chapters, chapters[1:])):
1498                 current_c['end_time'] = current_c['end_time'] or next_c['start_time']
1499                 current_c['start_time'] = current_c['start_time'] or last_c['end_time']
1500                 if None in current_c.values():
1501                     self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
1502                     return
1503             if chapters:
1504                 chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
1505                 info['chapters'] = chapters
1506
1507         def extract_video_object(e):
1508             assert e['@type'] == 'VideoObject'
1509             author = e.get('author')
1510             info.update({
1511                 'url': url_or_none(e.get('contentUrl')),
1512                 'title': unescapeHTML(e.get('name')),
1513                 'description': unescapeHTML(e.get('description')),
1514                 'thumbnails': [{'url': url_or_none(url)}
1515                                for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))],
1516                 'duration': parse_duration(e.get('duration')),
1517                 'timestamp': unified_timestamp(e.get('uploadDate')),
1518                 # author can be an instance of 'Organization' or 'Person' types.
1519                 # both types can have 'name' property(inherited from 'Thing' type). [1]
1520                 # however some websites are using 'Text' type instead.
1521                 # 1. https://schema.org/VideoObject
1522                 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
1523                 'filesize': float_or_none(e.get('contentSize')),
1524                 'tbr': int_or_none(e.get('bitrate')),
1525                 'width': int_or_none(e.get('width')),
1526                 'height': int_or_none(e.get('height')),
1527                 'view_count': int_or_none(e.get('interactionCount')),
1528             })
1529             extract_interaction_statistic(e)
1530             extract_chapter_information(e)
1531
1532         def traverse_json_ld(json_ld, at_top_level=True):
1533             for e in json_ld:
1534                 if at_top_level and '@context' not in e:
1535                     continue
1536                 if at_top_level and set(e.keys()) == {'@context', '@graph'}:
1537                     traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
1538                     break
1539                 item_type = e.get('@type')
1540                 if expected_type is not None and expected_type != item_type:
1541                     continue
1542                 rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
1543                 if rating is not None:
1544                     info['average_rating'] = rating
1545                 if item_type in ('TVEpisode', 'Episode'):
1546                     episode_name = unescapeHTML(e.get('name'))
1547                     info.update({
1548                         'episode': episode_name,
1549                         'episode_number': int_or_none(e.get('episodeNumber')),
1550                         'description': unescapeHTML(e.get('description')),
1551                     })
1552                     if not info.get('title') and episode_name:
1553                         info['title'] = episode_name
1554                     part_of_season = e.get('partOfSeason')
1555                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1556                         info.update({
1557                             'season': unescapeHTML(part_of_season.get('name')),
1558                             'season_number': int_or_none(part_of_season.get('seasonNumber')),
1559                         })
1560                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1561                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1562                         info['series'] = unescapeHTML(part_of_series.get('name'))
1563                 elif item_type == 'Movie':
1564                     info.update({
1565                         'title': unescapeHTML(e.get('name')),
1566                         'description': unescapeHTML(e.get('description')),
1567                         'duration': parse_duration(e.get('duration')),
1568                         'timestamp': unified_timestamp(e.get('dateCreated')),
1569                     })
1570                 elif item_type in ('Article', 'NewsArticle'):
1571                     info.update({
1572                         'timestamp': parse_iso8601(e.get('datePublished')),
1573                         'title': unescapeHTML(e.get('headline')),
1574                         'description': unescapeHTML(e.get('articleBody') or e.get('description')),
1575                     })
1576                     if traverse_obj(e, ('video', 0, '@type')) == 'VideoObject':
1577                         extract_video_object(e['video'][0])
1578                 elif item_type == 'VideoObject':
1579                     extract_video_object(e)
1580                     if expected_type is None:
1581                         continue
1582                     else:
1583                         break
1584                 video = e.get('video')
1585                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1586                     extract_video_object(video)
1587                 if expected_type is None:
1588                     continue
1589                 else:
1590                     break
1591         traverse_json_ld(json_ld)
1592
1593         return filter_dict(info)
1594
1595     def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1596         return self._parse_json(
1597             self._search_regex(
1598                 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1599                 webpage, 'next.js data', fatal=fatal, **kw),
1600             video_id, transform_source=transform_source, fatal=fatal)
1601
1602     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__'):
1603         ''' Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function. '''
1604         # not all website do this, but it can be changed
1605         # https://stackoverflow.com/questions/67463109/how-to-change-or-hide-nuxt-and-nuxt-keyword-in-page-source
1606         rectx = re.escape(context_name)
1607         js, arg_keys, arg_vals = self._search_regex(
1608             (r'<script>window\.%s=\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.+?)\)\);?</script>' % rectx,
1609              r'%s\(.*?\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)' % rectx),
1610             webpage, context_name, group=['js', 'arg_keys', 'arg_vals'])
1611
1612         args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
1613
1614         for key, val in args.items():
1615             if val in ('undefined', 'void 0'):
1616                 args[key] = 'null'
1617
1618         return self._parse_json(js_to_json(js, args), video_id)['data'][0]
1619
1620     @staticmethod
1621     def _hidden_inputs(html):
1622         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1623         hidden_inputs = {}
1624         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1625             attrs = extract_attributes(input)
1626             if not input:
1627                 continue
1628             if attrs.get('type') not in ('hidden', 'submit'):
1629                 continue
1630             name = attrs.get('name') or attrs.get('id')
1631             value = attrs.get('value')
1632             if name and value is not None:
1633                 hidden_inputs[name] = value
1634         return hidden_inputs
1635
1636     def _form_hidden_inputs(self, form_id, html):
1637         form = self._search_regex(
1638             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1639             html, '%s form' % form_id, group='form')
1640         return self._hidden_inputs(form)
1641
1642     class FormatSort:
1643         regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
1644
1645         default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
1646                    'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
1647                    'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
1648         ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
1649                         'height', 'width', 'proto', 'vext', 'abr', 'aext',
1650                         'fps', 'fs_approx', 'source', 'id')
1651
1652         settings = {
1653             'vcodec': {'type': 'ordered', 'regex': True,
1654                        'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1655             'acodec': {'type': 'ordered', 'regex': True,
1656                        'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
1657             'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
1658                     'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
1659             'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1660                       'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
1661             'vext': {'type': 'ordered', 'field': 'video_ext',
1662                      'order': ('mp4', 'webm', 'flv', '', 'none'),
1663                      'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1664             'aext': {'type': 'ordered', 'field': 'audio_ext',
1665                      'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1666                      'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1667             'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1668             'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
1669                            'field': ('vcodec', 'acodec'),
1670                            'function': lambda it: int(any(v != 'none' for v in it))},
1671             'ie_pref': {'priority': True, 'type': 'extractor'},
1672             'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1673             'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1674             'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
1675             'quality': {'convert': 'float', 'default': -1},
1676             'filesize': {'convert': 'bytes'},
1677             'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1678             'id': {'convert': 'string', 'field': 'format_id'},
1679             'height': {'convert': 'float_none'},
1680             'width': {'convert': 'float_none'},
1681             'fps': {'convert': 'float_none'},
1682             'tbr': {'convert': 'float_none'},
1683             'vbr': {'convert': 'float_none'},
1684             'abr': {'convert': 'float_none'},
1685             'asr': {'convert': 'float_none'},
1686             'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
1687
1688             'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1689             'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1690             'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1691             'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1692             'res': {'type': 'multiple', 'field': ('height', 'width'),
1693                     'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
1694
1695             # For compatibility with youtube-dl
1696             'format_id': {'type': 'alias', 'field': 'id'},
1697             'preference': {'type': 'alias', 'field': 'ie_pref'},
1698             'language_preference': {'type': 'alias', 'field': 'lang'},
1699             'source_preference': {'type': 'alias', 'field': 'source'},
1700             'protocol': {'type': 'alias', 'field': 'proto'},
1701             'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1702
1703             # Deprecated
1704             'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
1705             'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
1706             'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
1707             'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
1708             'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
1709             'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
1710             'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
1711             'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
1712             'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
1713             'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
1714             'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
1715             'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
1716             'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
1717             'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
1718             'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
1719             'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
1720             'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
1721             'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
1722             'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
1723             'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
1724         }
1725
1726         def __init__(self, ie, field_preference):
1727             self._order = []
1728             self.ydl = ie._downloader
1729             self.evaluate_params(self.ydl.params, field_preference)
1730             if ie.get_param('verbose'):
1731                 self.print_verbose_info(self.ydl.write_debug)
1732
1733         def _get_field_setting(self, field, key):
1734             if field not in self.settings:
1735                 if key in ('forced', 'priority'):
1736                     return False
1737                 self.ydl.deprecation_warning(
1738                     f'Using arbitrary fields ({field}) for format sorting is deprecated '
1739                     'and may be removed in a future version')
1740                 self.settings[field] = {}
1741             propObj = self.settings[field]
1742             if key not in propObj:
1743                 type = propObj.get('type')
1744                 if key == 'field':
1745                     default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1746                 elif key == 'convert':
1747                     default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1748                 else:
1749                     default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1750                 propObj[key] = default
1751             return propObj[key]
1752
1753         def _resolve_field_value(self, field, value, convertNone=False):
1754             if value is None:
1755                 if not convertNone:
1756                     return None
1757             else:
1758                 value = value.lower()
1759             conversion = self._get_field_setting(field, 'convert')
1760             if conversion == 'ignore':
1761                 return None
1762             if conversion == 'string':
1763                 return value
1764             elif conversion == 'float_none':
1765                 return float_or_none(value)
1766             elif conversion == 'bytes':
1767                 return FileDownloader.parse_bytes(value)
1768             elif conversion == 'order':
1769                 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1770                 use_regex = self._get_field_setting(field, 'regex')
1771                 list_length = len(order_list)
1772                 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1773                 if use_regex and value is not None:
1774                     for i, regex in enumerate(order_list):
1775                         if regex and re.match(regex, value):
1776                             return list_length - i
1777                     return list_length - empty_pos  # not in list
1778                 else:  # not regex or  value = None
1779                     return list_length - (order_list.index(value) if value in order_list else empty_pos)
1780             else:
1781                 if value.isnumeric():
1782                     return float(value)
1783                 else:
1784                     self.settings[field]['convert'] = 'string'
1785                     return value
1786
1787         def evaluate_params(self, params, sort_extractor):
1788             self._use_free_order = params.get('prefer_free_formats', False)
1789             self._sort_user = params.get('format_sort', [])
1790             self._sort_extractor = sort_extractor
1791
1792             def add_item(field, reverse, closest, limit_text):
1793                 field = field.lower()
1794                 if field in self._order:
1795                     return
1796                 self._order.append(field)
1797                 limit = self._resolve_field_value(field, limit_text)
1798                 data = {
1799                     'reverse': reverse,
1800                     'closest': False if limit is None else closest,
1801                     'limit_text': limit_text,
1802                     'limit': limit}
1803                 if field in self.settings:
1804                     self.settings[field].update(data)
1805                 else:
1806                     self.settings[field] = data
1807
1808             sort_list = (
1809                 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1810                 + (tuple() if params.get('format_sort_force', False)
1811                    else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1812                 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1813
1814             for item in sort_list:
1815                 match = re.match(self.regex, item)
1816                 if match is None:
1817                     raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1818                 field = match.group('field')
1819                 if field is None:
1820                     continue
1821                 if self._get_field_setting(field, 'type') == 'alias':
1822                     alias, field = field, self._get_field_setting(field, 'field')
1823                     if self._get_field_setting(alias, 'deprecated'):
1824                         self.ydl.deprecation_warning(
1825                             f'Format sorting alias {alias} is deprecated '
1826                             f'and may be removed in a future version. Please use {field} instead')
1827                 reverse = match.group('reverse') is not None
1828                 closest = match.group('separator') == '~'
1829                 limit_text = match.group('limit')
1830
1831                 has_limit = limit_text is not None
1832                 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1833                 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1834
1835                 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1836                 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
1837                 limit_count = len(limits)
1838                 for (i, f) in enumerate(fields):
1839                     add_item(f, reverse, closest,
1840                              limits[i] if i < limit_count
1841                              else limits[0] if has_limit and not has_multiple_limits
1842                              else None)
1843
1844         def print_verbose_info(self, write_debug):
1845             if self._sort_user:
1846                 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1847             if self._sort_extractor:
1848                 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1849             write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1850                 '+' if self._get_field_setting(field, 'reverse') else '', field,
1851                 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1852                               self._get_field_setting(field, 'limit_text'),
1853                               self._get_field_setting(field, 'limit'))
1854                 if self._get_field_setting(field, 'limit_text') is not None else '')
1855                 for field in self._order if self._get_field_setting(field, 'visible')]))
1856
1857         def _calculate_field_preference_from_value(self, format, field, type, value):
1858             reverse = self._get_field_setting(field, 'reverse')
1859             closest = self._get_field_setting(field, 'closest')
1860             limit = self._get_field_setting(field, 'limit')
1861
1862             if type == 'extractor':
1863                 maximum = self._get_field_setting(field, 'max')
1864                 if value is None or (maximum is not None and value >= maximum):
1865                     value = -1
1866             elif type == 'boolean':
1867                 in_list = self._get_field_setting(field, 'in_list')
1868                 not_in_list = self._get_field_setting(field, 'not_in_list')
1869                 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1870             elif type == 'ordered':
1871                 value = self._resolve_field_value(field, value, True)
1872
1873             # try to convert to number
1874             val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1875             is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1876             if is_num:
1877                 value = val_num
1878
1879             return ((-10, 0) if value is None
1880                     else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
1881                     else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1882                     else (0, value, 0) if not reverse and (limit is None or value <= limit)
1883                     else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1884                     else (-1, value, 0))
1885
1886         def _calculate_field_preference(self, format, field):
1887             type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
1888             get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1889             if type == 'multiple':
1890                 type = 'field'  # Only 'field' is allowed in multiple for now
1891                 actual_fields = self._get_field_setting(field, 'field')
1892
1893                 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1894             else:
1895                 value = get_value(field)
1896             return self._calculate_field_preference_from_value(format, field, type, value)
1897
1898         def calculate_preference(self, format):
1899             # Determine missing protocol
1900             if not format.get('protocol'):
1901                 format['protocol'] = determine_protocol(format)
1902
1903             # Determine missing ext
1904             if not format.get('ext') and 'url' in format:
1905                 format['ext'] = determine_ext(format['url'])
1906             if format.get('vcodec') == 'none':
1907                 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1908                 format['video_ext'] = 'none'
1909             else:
1910                 format['video_ext'] = format['ext']
1911                 format['audio_ext'] = 'none'
1912             # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
1913             #    format['preference'] = -1000
1914
1915             # Determine missing bitrates
1916             if format.get('tbr') is None:
1917                 if format.get('vbr') is not None and format.get('abr') is not None:
1918                     format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1919             else:
1920                 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1921                     format['vbr'] = format.get('tbr') - format.get('abr', 0)
1922                 if format.get('acodec') != 'none' and format.get('abr') is None:
1923                     format['abr'] = format.get('tbr') - format.get('vbr', 0)
1924
1925             return tuple(self._calculate_field_preference(format, field) for field in self._order)
1926
1927     def _sort_formats(self, formats, field_preference=[]):
1928         if not formats:
1929             return
1930         format_sort = self.FormatSort(self, field_preference)
1931         formats.sort(key=lambda f: format_sort.calculate_preference(f))
1932
1933     def _check_formats(self, formats, video_id):
1934         if formats:
1935             formats[:] = filter(
1936                 lambda f: self._is_valid_url(
1937                     f['url'], video_id,
1938                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1939                 formats)
1940
1941     @staticmethod
1942     def _remove_duplicate_formats(formats):
1943         format_urls = set()
1944         unique_formats = []
1945         for f in formats:
1946             if f['url'] not in format_urls:
1947                 format_urls.add(f['url'])
1948                 unique_formats.append(f)
1949         formats[:] = unique_formats
1950
1951     def _is_valid_url(self, url, video_id, item='video', headers={}):
1952         url = self._proto_relative_url(url, scheme='http:')
1953         # For now assume non HTTP(S) URLs always valid
1954         if not (url.startswith('http://') or url.startswith('https://')):
1955             return True
1956         try:
1957             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1958             return True
1959         except ExtractorError as e:
1960             self.to_screen(
1961                 '%s: %s URL is invalid, skipping: %s'
1962                 % (video_id, item, error_to_compat_str(e.cause)))
1963             return False
1964
1965     def http_scheme(self):
1966         """ Either "http:" or "https:", depending on the user's preferences """
1967         return (
1968             'http:'
1969             if self.get_param('prefer_insecure', False)
1970             else 'https:')
1971
1972     def _proto_relative_url(self, url, scheme=None):
1973         if url is None:
1974             return url
1975         if url.startswith('//'):
1976             if scheme is None:
1977                 scheme = self.http_scheme()
1978             return scheme + url
1979         else:
1980             return url
1981
1982     def _sleep(self, timeout, video_id, msg_template=None):
1983         if msg_template is None:
1984             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1985         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1986         self.to_screen(msg)
1987         time.sleep(timeout)
1988
1989     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1990                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1991                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1992         manifest = self._download_xml(
1993             manifest_url, video_id, 'Downloading f4m manifest',
1994             'Unable to download f4m manifest',
1995             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1996             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1997             transform_source=transform_source,
1998             fatal=fatal, data=data, headers=headers, query=query)
1999
2000         if manifest is False:
2001             return []
2002
2003         return self._parse_f4m_formats(
2004             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2005             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
2006
2007     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
2008                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
2009                            fatal=True, m3u8_id=None):
2010         if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
2011             return []
2012
2013         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
2014         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
2015         if akamai_pv is not None and ';' in akamai_pv.text:
2016             playerVerificationChallenge = akamai_pv.text.split(';')[0]
2017             if playerVerificationChallenge.strip() != '':
2018                 return []
2019
2020         formats = []
2021         manifest_version = '1.0'
2022         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
2023         if not media_nodes:
2024             manifest_version = '2.0'
2025             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
2026         # Remove unsupported DRM protected media from final formats
2027         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
2028         media_nodes = remove_encrypted_media(media_nodes)
2029         if not media_nodes:
2030             return formats
2031
2032         manifest_base_url = get_base_url(manifest)
2033
2034         bootstrap_info = xpath_element(
2035             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
2036             'bootstrap info', default=None)
2037
2038         vcodec = None
2039         mime_type = xpath_text(
2040             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
2041             'base URL', default=None)
2042         if mime_type and mime_type.startswith('audio/'):
2043             vcodec = 'none'
2044
2045         for i, media_el in enumerate(media_nodes):
2046             tbr = int_or_none(media_el.attrib.get('bitrate'))
2047             width = int_or_none(media_el.attrib.get('width'))
2048             height = int_or_none(media_el.attrib.get('height'))
2049             format_id = join_nonempty(f4m_id, tbr or i)
2050             # If <bootstrapInfo> is present, the specified f4m is a
2051             # stream-level manifest, and only set-level manifests may refer to
2052             # external resources.  See section 11.4 and section 4 of F4M spec
2053             if bootstrap_info is None:
2054                 media_url = None
2055                 # @href is introduced in 2.0, see section 11.6 of F4M spec
2056                 if manifest_version == '2.0':
2057                     media_url = media_el.attrib.get('href')
2058                 if media_url is None:
2059                     media_url = media_el.attrib.get('url')
2060                 if not media_url:
2061                     continue
2062                 manifest_url = (
2063                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
2064                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
2065                 # If media_url is itself a f4m manifest do the recursive extraction
2066                 # since bitrates in parent manifest (this one) and media_url manifest
2067                 # may differ leading to inability to resolve the format by requested
2068                 # bitrate in f4m downloader
2069                 ext = determine_ext(manifest_url)
2070                 if ext == 'f4m':
2071                     f4m_formats = self._extract_f4m_formats(
2072                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2073                         transform_source=transform_source, fatal=fatal)
2074                     # Sometimes stream-level manifest contains single media entry that
2075                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
2076                     # At the same time parent's media entry in set-level manifest may
2077                     # contain it. We will copy it from parent in such cases.
2078                     if len(f4m_formats) == 1:
2079                         f = f4m_formats[0]
2080                         f.update({
2081                             'tbr': f.get('tbr') or tbr,
2082                             'width': f.get('width') or width,
2083                             'height': f.get('height') or height,
2084                             'format_id': f.get('format_id') if not tbr else format_id,
2085                             'vcodec': vcodec,
2086                         })
2087                     formats.extend(f4m_formats)
2088                     continue
2089                 elif ext == 'm3u8':
2090                     formats.extend(self._extract_m3u8_formats(
2091                         manifest_url, video_id, 'mp4', preference=preference,
2092                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
2093                     continue
2094             formats.append({
2095                 'format_id': format_id,
2096                 'url': manifest_url,
2097                 'manifest_url': manifest_url,
2098                 'ext': 'flv' if bootstrap_info is not None else None,
2099                 'protocol': 'f4m',
2100                 'tbr': tbr,
2101                 'width': width,
2102                 'height': height,
2103                 'vcodec': vcodec,
2104                 'preference': preference,
2105                 'quality': quality,
2106             })
2107         return formats
2108
2109     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
2110         return {
2111             'format_id': join_nonempty(m3u8_id, 'meta'),
2112             'url': m3u8_url,
2113             'ext': ext,
2114             'protocol': 'm3u8',
2115             'preference': preference - 100 if preference else -100,
2116             'quality': quality,
2117             'resolution': 'multiple',
2118             'format_note': 'Quality selection URL',
2119         }
2120
2121     def _report_ignoring_subs(self, name):
2122         self.report_warning(bug_reports_message(
2123             f'Ignoring subtitle tracks found in the {name} manifest; '
2124             'if any subtitle tracks are missing,'
2125         ), only_once=True)
2126
2127     def _extract_m3u8_formats(self, *args, **kwargs):
2128         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2129         if subs:
2130             self._report_ignoring_subs('HLS')
2131         return fmts
2132
2133     def _extract_m3u8_formats_and_subtitles(
2134             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2135             preference=None, quality=None, m3u8_id=None, note=None,
2136             errnote=None, fatal=True, live=False, data=None, headers={},
2137             query={}):
2138
2139         res = self._download_webpage_handle(
2140             m3u8_url, video_id,
2141             note='Downloading m3u8 information' if note is None else note,
2142             errnote='Failed to download m3u8 information' if errnote is None else errnote,
2143             fatal=fatal, data=data, headers=headers, query=query)
2144
2145         if res is False:
2146             return [], {}
2147
2148         m3u8_doc, urlh = res
2149         m3u8_url = urlh.geturl()
2150
2151         return self._parse_m3u8_formats_and_subtitles(
2152             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2153             preference=preference, quality=quality, m3u8_id=m3u8_id,
2154             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2155             headers=headers, query=query, video_id=video_id)
2156
2157     def _parse_m3u8_formats_and_subtitles(
2158             self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
2159             preference=None, quality=None, m3u8_id=None, live=False, note=None,
2160             errnote=None, fatal=True, data=None, headers={}, query={},
2161             video_id=None):
2162         formats, subtitles = [], {}
2163
2164         has_drm = re.search('|'.join([
2165             r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
2166             r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
2167         ]), m3u8_doc)
2168
2169         def format_url(url):
2170             return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
2171
2172         if self.get_param('hls_split_discontinuity', False):
2173             def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2174                 if not m3u8_doc:
2175                     if not manifest_url:
2176                         return []
2177                     m3u8_doc = self._download_webpage(
2178                         manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2179                         note=False, errnote='Failed to download m3u8 playlist information')
2180                     if m3u8_doc is False:
2181                         return []
2182                 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2183
2184         else:
2185             def _extract_m3u8_playlist_indices(*args, **kwargs):
2186                 return [None]
2187
2188         # References:
2189         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2190         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2191         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2192
2193         # We should try extracting formats only from master playlists [1, 4.3.4],
2194         # i.e. playlists that describe available qualities. On the other hand
2195         # media playlists [1, 4.3.3] should be returned as is since they contain
2196         # just the media without qualities renditions.
2197         # Fortunately, master playlist can be easily distinguished from media
2198         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
2199         # master playlist tags MUST NOT appear in a media playlist and vice versa.
2200         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2201         # media playlist and MUST NOT appear in master playlist thus we can
2202         # clearly detect media playlist with this criterion.
2203
2204         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
2205             formats = [{
2206                 'format_id': join_nonempty(m3u8_id, idx),
2207                 'format_index': idx,
2208                 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
2209                 'ext': ext,
2210                 'protocol': entry_protocol,
2211                 'preference': preference,
2212                 'quality': quality,
2213                 'has_drm': has_drm,
2214             } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2215
2216             return formats, subtitles
2217
2218         groups = {}
2219         last_stream_inf = {}
2220
2221         def extract_media(x_media_line):
2222             media = parse_m3u8_attributes(x_media_line)
2223             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2224             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2225             if not (media_type and group_id and name):
2226                 return
2227             groups.setdefault(group_id, []).append(media)
2228             # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2229             if media_type == 'SUBTITLES':
2230                 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2231                 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2232                 # However, lack of URI has been spotted in the wild.
2233                 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2234                 if not media.get('URI'):
2235                     return
2236                 url = format_url(media['URI'])
2237                 sub_info = {
2238                     'url': url,
2239                     'ext': determine_ext(url),
2240                 }
2241                 if sub_info['ext'] == 'm3u8':
2242                     # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2243                     # files may contain is WebVTT:
2244                     # <https://tools.ietf.org/html/rfc8216#section-3.1>
2245                     sub_info['ext'] = 'vtt'
2246                     sub_info['protocol'] = 'm3u8_native'
2247                 lang = media.get('LANGUAGE') or 'und'
2248                 subtitles.setdefault(lang, []).append(sub_info)
2249             if media_type not in ('VIDEO', 'AUDIO'):
2250                 return
2251             media_url = media.get('URI')
2252             if media_url:
2253                 manifest_url = format_url(media_url)
2254                 formats.extend({
2255                     'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2256                     'format_note': name,
2257                     'format_index': idx,
2258                     'url': manifest_url,
2259                     'manifest_url': m3u8_url,
2260                     'language': media.get('LANGUAGE'),
2261                     'ext': ext,
2262                     'protocol': entry_protocol,
2263                     'preference': preference,
2264                     'quality': quality,
2265                     'vcodec': 'none' if media_type == 'AUDIO' else None,
2266                 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2267
2268         def build_stream_name():
2269             # Despite specification does not mention NAME attribute for
2270             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2271             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2272             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2273             stream_name = last_stream_inf.get('NAME')
2274             if stream_name:
2275                 return stream_name
2276             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2277             # from corresponding rendition group
2278             stream_group_id = last_stream_inf.get('VIDEO')
2279             if not stream_group_id:
2280                 return
2281             stream_group = groups.get(stream_group_id)
2282             if not stream_group:
2283                 return stream_group_id
2284             rendition = stream_group[0]
2285             return rendition.get('NAME') or stream_group_id
2286
2287         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2288         # chance to detect video only formats when EXT-X-STREAM-INF tags
2289         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2290         for line in m3u8_doc.splitlines():
2291             if line.startswith('#EXT-X-MEDIA:'):
2292                 extract_media(line)
2293
2294         for line in m3u8_doc.splitlines():
2295             if line.startswith('#EXT-X-STREAM-INF:'):
2296                 last_stream_inf = parse_m3u8_attributes(line)
2297             elif line.startswith('#') or not line.strip():
2298                 continue
2299             else:
2300                 tbr = float_or_none(
2301                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2302                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2303                 manifest_url = format_url(line.strip())
2304
2305                 for idx in _extract_m3u8_playlist_indices(manifest_url):
2306                     format_id = [m3u8_id, None, idx]
2307                     # Bandwidth of live streams may differ over time thus making
2308                     # format_id unpredictable. So it's better to keep provided
2309                     # format_id intact.
2310                     if not live:
2311                         stream_name = build_stream_name()
2312                         format_id[1] = stream_name or '%d' % (tbr or len(formats))
2313                     f = {
2314                         'format_id': join_nonempty(*format_id),
2315                         'format_index': idx,
2316                         'url': manifest_url,
2317                         'manifest_url': m3u8_url,
2318                         'tbr': tbr,
2319                         'ext': ext,
2320                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2321                         'protocol': entry_protocol,
2322                         'preference': preference,
2323                         'quality': quality,
2324                     }
2325                     resolution = last_stream_inf.get('RESOLUTION')
2326                     if resolution:
2327                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2328                         if mobj:
2329                             f['width'] = int(mobj.group('width'))
2330                             f['height'] = int(mobj.group('height'))
2331                     # Unified Streaming Platform
2332                     mobj = re.search(
2333                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2334                     if mobj:
2335                         abr, vbr = mobj.groups()
2336                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2337                         f.update({
2338                             'vbr': vbr,
2339                             'abr': abr,
2340                         })
2341                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2342                     f.update(codecs)
2343                     audio_group_id = last_stream_inf.get('AUDIO')
2344                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2345                     # references a rendition group MUST have a CODECS attribute.
2346                     # However, this is not always respected, for example, [2]
2347                     # contains EXT-X-STREAM-INF tag which references AUDIO
2348                     # rendition group but does not have CODECS and despite
2349                     # referencing an audio group it represents a complete
2350                     # (with audio and video) format. So, for such cases we will
2351                     # ignore references to rendition groups and treat them
2352                     # as complete formats.
2353                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2354                         audio_group = groups.get(audio_group_id)
2355                         if audio_group and audio_group[0].get('URI'):
2356                             # TODO: update acodec for audio only formats with
2357                             # the same GROUP-ID
2358                             f['acodec'] = 'none'
2359                     if not f.get('ext'):
2360                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2361                     formats.append(f)
2362
2363                     # for DailyMotion
2364                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2365                     if progressive_uri:
2366                         http_f = f.copy()
2367                         del http_f['manifest_url']
2368                         http_f.update({
2369                             'format_id': f['format_id'].replace('hls-', 'http-'),
2370                             'protocol': 'http',
2371                             'url': progressive_uri,
2372                         })
2373                         formats.append(http_f)
2374
2375                 last_stream_inf = {}
2376         return formats, subtitles
2377
2378     def _extract_m3u8_vod_duration(
2379             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2380
2381         m3u8_vod = self._download_webpage(
2382             m3u8_vod_url, video_id,
2383             note='Downloading m3u8 VOD manifest' if note is None else note,
2384             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2385             fatal=False, data=data, headers=headers, query=query)
2386
2387         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2388
2389     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2390         if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2391             return None
2392
2393         return int(sum(
2394             float(line[len('#EXTINF:'):].split(',')[0])
2395             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2396
2397     @staticmethod
2398     def _xpath_ns(path, namespace=None):
2399         if not namespace:
2400             return path
2401         out = []
2402         for c in path.split('/'):
2403             if not c or c == '.':
2404                 out.append(c)
2405             else:
2406                 out.append('{%s}%s' % (namespace, c))
2407         return '/'.join(out)
2408
2409     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2410         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2411
2412         if smil is False:
2413             assert not fatal
2414             return [], {}
2415
2416         namespace = self._parse_smil_namespace(smil)
2417
2418         fmts = self._parse_smil_formats(
2419             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2420         subs = self._parse_smil_subtitles(
2421             smil, namespace=namespace)
2422
2423         return fmts, subs
2424
2425     def _extract_smil_formats(self, *args, **kwargs):
2426         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2427         if subs:
2428             self._report_ignoring_subs('SMIL')
2429         return fmts
2430
2431     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2432         smil = self._download_smil(smil_url, video_id, fatal=fatal)
2433         if smil is False:
2434             return {}
2435         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2436
2437     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2438         return self._download_xml(
2439             smil_url, video_id, 'Downloading SMIL file',
2440             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2441
2442     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2443         namespace = self._parse_smil_namespace(smil)
2444
2445         formats = self._parse_smil_formats(
2446             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2447         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2448
2449         video_id = os.path.splitext(url_basename(smil_url))[0]
2450         title = None
2451         description = None
2452         upload_date = None
2453         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2454             name = meta.attrib.get('name')
2455             content = meta.attrib.get('content')
2456             if not name or not content:
2457                 continue
2458             if not title and name == 'title':
2459                 title = content
2460             elif not description and name in ('description', 'abstract'):
2461                 description = content
2462             elif not upload_date and name == 'date':
2463                 upload_date = unified_strdate(content)
2464
2465         thumbnails = [{
2466             'id': image.get('type'),
2467             'url': image.get('src'),
2468             'width': int_or_none(image.get('width')),
2469             'height': int_or_none(image.get('height')),
2470         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2471
2472         return {
2473             'id': video_id,
2474             'title': title or video_id,
2475             'description': description,
2476             'upload_date': upload_date,
2477             'thumbnails': thumbnails,
2478             'formats': formats,
2479             'subtitles': subtitles,
2480         }
2481
2482     def _parse_smil_namespace(self, smil):
2483         return self._search_regex(
2484             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2485
2486     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2487         base = smil_url
2488         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2489             b = meta.get('base') or meta.get('httpBase')
2490             if b:
2491                 base = b
2492                 break
2493
2494         formats = []
2495         rtmp_count = 0
2496         http_count = 0
2497         m3u8_count = 0
2498         imgs_count = 0
2499
2500         srcs = set()
2501         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2502         for medium in media:
2503             src = medium.get('src')
2504             if not src or src in srcs:
2505                 continue
2506             srcs.add(src)
2507
2508             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2509             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2510             width = int_or_none(medium.get('width'))
2511             height = int_or_none(medium.get('height'))
2512             proto = medium.get('proto')
2513             ext = medium.get('ext')
2514             src_ext = determine_ext(src)
2515             streamer = medium.get('streamer') or base
2516
2517             if proto == 'rtmp' or streamer.startswith('rtmp'):
2518                 rtmp_count += 1
2519                 formats.append({
2520                     'url': streamer,
2521                     'play_path': src,
2522                     'ext': 'flv',
2523                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2524                     'tbr': bitrate,
2525                     'filesize': filesize,
2526                     'width': width,
2527                     'height': height,
2528                 })
2529                 if transform_rtmp_url:
2530                     streamer, src = transform_rtmp_url(streamer, src)
2531                     formats[-1].update({
2532                         'url': streamer,
2533                         'play_path': src,
2534                     })
2535                 continue
2536
2537             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2538             src_url = src_url.strip()
2539
2540             if proto == 'm3u8' or src_ext == 'm3u8':
2541                 m3u8_formats = self._extract_m3u8_formats(
2542                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2543                 if len(m3u8_formats) == 1:
2544                     m3u8_count += 1
2545                     m3u8_formats[0].update({
2546                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2547                         'tbr': bitrate,
2548                         'width': width,
2549                         'height': height,
2550                     })
2551                 formats.extend(m3u8_formats)
2552             elif src_ext == 'f4m':
2553                 f4m_url = src_url
2554                 if not f4m_params:
2555                     f4m_params = {
2556                         'hdcore': '3.2.0',
2557                         'plugin': 'flowplayer-3.2.0.1',
2558                     }
2559                 f4m_url += '&' if '?' in f4m_url else '?'
2560                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2561                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2562             elif src_ext == 'mpd':
2563                 formats.extend(self._extract_mpd_formats(
2564                     src_url, video_id, mpd_id='dash', fatal=False))
2565             elif re.search(r'\.ism/[Mm]anifest', src_url):
2566                 formats.extend(self._extract_ism_formats(
2567                     src_url, video_id, ism_id='mss', fatal=False))
2568             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2569                 http_count += 1
2570                 formats.append({
2571                     'url': src_url,
2572                     'ext': ext or src_ext or 'flv',
2573                     'format_id': 'http-%d' % (bitrate or http_count),
2574                     'tbr': bitrate,
2575                     'filesize': filesize,
2576                     'width': width,
2577                     'height': height,
2578                 })
2579
2580         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2581             src = medium.get('src')
2582             if not src or src in srcs:
2583                 continue
2584             srcs.add(src)
2585
2586             imgs_count += 1
2587             formats.append({
2588                 'format_id': 'imagestream-%d' % (imgs_count),
2589                 'url': src,
2590                 'ext': mimetype2ext(medium.get('type')),
2591                 'acodec': 'none',
2592                 'vcodec': 'none',
2593                 'width': int_or_none(medium.get('width')),
2594                 'height': int_or_none(medium.get('height')),
2595                 'format_note': 'SMIL storyboards',
2596             })
2597
2598         return formats
2599
2600     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2601         urls = []
2602         subtitles = {}
2603         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2604             src = textstream.get('src')
2605             if not src or src in urls:
2606                 continue
2607             urls.append(src)
2608             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2609             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2610             subtitles.setdefault(lang, []).append({
2611                 'url': src,
2612                 'ext': ext,
2613             })
2614         return subtitles
2615
2616     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2617         xspf = self._download_xml(
2618             xspf_url, playlist_id, 'Downloading xpsf playlist',
2619             'Unable to download xspf manifest', fatal=fatal)
2620         if xspf is False:
2621             return []
2622         return self._parse_xspf(
2623             xspf, playlist_id, xspf_url=xspf_url,
2624             xspf_base_url=base_url(xspf_url))
2625
2626     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2627         NS_MAP = {
2628             'xspf': 'http://xspf.org/ns/0/',
2629             's1': 'http://static.streamone.nl/player/ns/0',
2630         }
2631
2632         entries = []
2633         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2634             title = xpath_text(
2635                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2636             description = xpath_text(
2637                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2638             thumbnail = xpath_text(
2639                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2640             duration = float_or_none(
2641                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2642
2643             formats = []
2644             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2645                 format_url = urljoin(xspf_base_url, location.text)
2646                 if not format_url:
2647                     continue
2648                 formats.append({
2649                     'url': format_url,
2650                     'manifest_url': xspf_url,
2651                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2652                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2653                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2654                 })
2655             self._sort_formats(formats)
2656
2657             entries.append({
2658                 'id': playlist_id,
2659                 'title': title,
2660                 'description': description,
2661                 'thumbnail': thumbnail,
2662                 'duration': duration,
2663                 'formats': formats,
2664             })
2665         return entries
2666
2667     def _extract_mpd_formats(self, *args, **kwargs):
2668         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2669         if subs:
2670             self._report_ignoring_subs('DASH')
2671         return fmts
2672
2673     def _extract_mpd_formats_and_subtitles(
2674             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2675             fatal=True, data=None, headers={}, query={}):
2676         res = self._download_xml_handle(
2677             mpd_url, video_id,
2678             note='Downloading MPD manifest' if note is None else note,
2679             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2680             fatal=fatal, data=data, headers=headers, query=query)
2681         if res is False:
2682             return [], {}
2683         mpd_doc, urlh = res
2684         if mpd_doc is None:
2685             return [], {}
2686         mpd_base_url = base_url(urlh.geturl())
2687
2688         return self._parse_mpd_formats_and_subtitles(
2689             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2690
2691     def _parse_mpd_formats(self, *args, **kwargs):
2692         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2693         if subs:
2694             self._report_ignoring_subs('DASH')
2695         return fmts
2696
2697     def _parse_mpd_formats_and_subtitles(
2698             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2699         """
2700         Parse formats from MPD manifest.
2701         References:
2702          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2703             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2704          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2705         """
2706         if not self.get_param('dynamic_mpd', True):
2707             if mpd_doc.get('type') == 'dynamic':
2708                 return [], {}
2709
2710         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2711
2712         def _add_ns(path):
2713             return self._xpath_ns(path, namespace)
2714
2715         def is_drm_protected(element):
2716             return element.find(_add_ns('ContentProtection')) is not None
2717
2718         def extract_multisegment_info(element, ms_parent_info):
2719             ms_info = ms_parent_info.copy()
2720
2721             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2722             # common attributes and elements.  We will only extract relevant
2723             # for us.
2724             def extract_common(source):
2725                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2726                 if segment_timeline is not None:
2727                     s_e = segment_timeline.findall(_add_ns('S'))
2728                     if s_e:
2729                         ms_info['total_number'] = 0
2730                         ms_info['s'] = []
2731                         for s in s_e:
2732                             r = int(s.get('r', 0))
2733                             ms_info['total_number'] += 1 + r
2734                             ms_info['s'].append({
2735                                 't': int(s.get('t', 0)),
2736                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2737                                 'd': int(s.attrib['d']),
2738                                 'r': r,
2739                             })
2740                 start_number = source.get('startNumber')
2741                 if start_number:
2742                     ms_info['start_number'] = int(start_number)
2743                 timescale = source.get('timescale')
2744                 if timescale:
2745                     ms_info['timescale'] = int(timescale)
2746                 segment_duration = source.get('duration')
2747                 if segment_duration:
2748                     ms_info['segment_duration'] = float(segment_duration)
2749
2750             def extract_Initialization(source):
2751                 initialization = source.find(_add_ns('Initialization'))
2752                 if initialization is not None:
2753                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2754
2755             segment_list = element.find(_add_ns('SegmentList'))
2756             if segment_list is not None:
2757                 extract_common(segment_list)
2758                 extract_Initialization(segment_list)
2759                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2760                 if segment_urls_e:
2761                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2762             else:
2763                 segment_template = element.find(_add_ns('SegmentTemplate'))
2764                 if segment_template is not None:
2765                     extract_common(segment_template)
2766                     media = segment_template.get('media')
2767                     if media:
2768                         ms_info['media'] = media
2769                     initialization = segment_template.get('initialization')
2770                     if initialization:
2771                         ms_info['initialization'] = initialization
2772                     else:
2773                         extract_Initialization(segment_template)
2774             return ms_info
2775
2776         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2777         formats, subtitles = [], {}
2778         stream_numbers = collections.defaultdict(int)
2779         for period in mpd_doc.findall(_add_ns('Period')):
2780             period_duration = parse_duration(period.get('duration')) or mpd_duration
2781             period_ms_info = extract_multisegment_info(period, {
2782                 'start_number': 1,
2783                 'timescale': 1,
2784             })
2785             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2786                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2787                 for representation in adaptation_set.findall(_add_ns('Representation')):
2788                     representation_attrib = adaptation_set.attrib.copy()
2789                     representation_attrib.update(representation.attrib)
2790                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2791                     mime_type = representation_attrib['mimeType']
2792                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2793
2794                     codecs = parse_codecs(representation_attrib.get('codecs', ''))
2795                     if content_type not in ('video', 'audio', 'text'):
2796                         if mime_type == 'image/jpeg':
2797                             content_type = mime_type
2798                         elif codecs['vcodec'] != 'none':
2799                             content_type = 'video'
2800                         elif codecs['acodec'] != 'none':
2801                             content_type = 'audio'
2802                         elif codecs.get('tcodec', 'none') != 'none':
2803                             content_type = 'text'
2804                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2805                             content_type = 'text'
2806                         else:
2807                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2808                             continue
2809
2810                     base_url = ''
2811                     for element in (representation, adaptation_set, period, mpd_doc):
2812                         base_url_e = element.find(_add_ns('BaseURL'))
2813                         if base_url_e is not None:
2814                             base_url = base_url_e.text + base_url
2815                             if re.match(r'^https?://', base_url):
2816                                 break
2817                     if mpd_base_url and base_url.startswith('/'):
2818                         base_url = compat_urlparse.urljoin(mpd_base_url, base_url)
2819                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2820                         if not mpd_base_url.endswith('/'):
2821                             mpd_base_url += '/'
2822                         base_url = mpd_base_url + base_url
2823                     representation_id = representation_attrib.get('id')
2824                     lang = representation_attrib.get('lang')
2825                     url_el = representation.find(_add_ns('BaseURL'))
2826                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2827                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2828                     if representation_id is not None:
2829                         format_id = representation_id
2830                     else:
2831                         format_id = content_type
2832                     if mpd_id:
2833                         format_id = mpd_id + '-' + format_id
2834                     if content_type in ('video', 'audio'):
2835                         f = {
2836                             'format_id': format_id,
2837                             'manifest_url': mpd_url,
2838                             'ext': mimetype2ext(mime_type),
2839                             'width': int_or_none(representation_attrib.get('width')),
2840                             'height': int_or_none(representation_attrib.get('height')),
2841                             'tbr': float_or_none(bandwidth, 1000),
2842                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2843                             'fps': int_or_none(representation_attrib.get('frameRate')),
2844                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2845                             'format_note': 'DASH %s' % content_type,
2846                             'filesize': filesize,
2847                             'container': mimetype2ext(mime_type) + '_dash',
2848                             **codecs
2849                         }
2850                     elif content_type == 'text':
2851                         f = {
2852                             'ext': mimetype2ext(mime_type),
2853                             'manifest_url': mpd_url,
2854                             'filesize': filesize,
2855                         }
2856                     elif content_type == 'image/jpeg':
2857                         # See test case in VikiIE
2858                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2859                         f = {
2860                             'format_id': format_id,
2861                             'ext': 'mhtml',
2862                             'manifest_url': mpd_url,
2863                             'format_note': 'DASH storyboards (jpeg)',
2864                             'acodec': 'none',
2865                             'vcodec': 'none',
2866                         }
2867                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2868                         f['has_drm'] = True
2869                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2870
2871                     def prepare_template(template_name, identifiers):
2872                         tmpl = representation_ms_info[template_name]
2873                         # First of, % characters outside $...$ templates
2874                         # must be escaped by doubling for proper processing
2875                         # by % operator string formatting used further (see
2876                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2877                         t = ''
2878                         in_template = False
2879                         for c in tmpl:
2880                             t += c
2881                             if c == '$':
2882                                 in_template = not in_template
2883                             elif c == '%' and not in_template:
2884                                 t += c
2885                         # Next, $...$ templates are translated to their
2886                         # %(...) counterparts to be used with % operator
2887                         if representation_id is not None:
2888                             t = t.replace('$RepresentationID$', representation_id)
2889                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2890                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2891                         t.replace('$$', '$')
2892                         return t
2893
2894                     # @initialization is a regular template like @media one
2895                     # so it should be handled just the same way (see
2896                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2897                     if 'initialization' in representation_ms_info:
2898                         initialization_template = prepare_template(
2899                             'initialization',
2900                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2901                             # $Time$ shall not be included for @initialization thus
2902                             # only $Bandwidth$ remains
2903                             ('Bandwidth', ))
2904                         representation_ms_info['initialization_url'] = initialization_template % {
2905                             'Bandwidth': bandwidth,
2906                         }
2907
2908                     def location_key(location):
2909                         return 'url' if re.match(r'^https?://', location) else 'path'
2910
2911                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2912
2913                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2914                         media_location_key = location_key(media_template)
2915
2916                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2917                         # can't be used at the same time
2918                         if '%(Number' in media_template and 's' not in representation_ms_info:
2919                             segment_duration = None
2920                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2921                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2922                                 representation_ms_info['total_number'] = int(math.ceil(
2923                                     float_or_none(period_duration, segment_duration, default=0)))
2924                             representation_ms_info['fragments'] = [{
2925                                 media_location_key: media_template % {
2926                                     'Number': segment_number,
2927                                     'Bandwidth': bandwidth,
2928                                 },
2929                                 'duration': segment_duration,
2930                             } for segment_number in range(
2931                                 representation_ms_info['start_number'],
2932                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2933                         else:
2934                             # $Number*$ or $Time$ in media template with S list available
2935                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2936                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2937                             representation_ms_info['fragments'] = []
2938                             segment_time = 0
2939                             segment_d = None
2940                             segment_number = representation_ms_info['start_number']
2941
2942                             def add_segment_url():
2943                                 segment_url = media_template % {
2944                                     'Time': segment_time,
2945                                     'Bandwidth': bandwidth,
2946                                     'Number': segment_number,
2947                                 }
2948                                 representation_ms_info['fragments'].append({
2949                                     media_location_key: segment_url,
2950                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2951                                 })
2952
2953                             for num, s in enumerate(representation_ms_info['s']):
2954                                 segment_time = s.get('t') or segment_time
2955                                 segment_d = s['d']
2956                                 add_segment_url()
2957                                 segment_number += 1
2958                                 for r in range(s.get('r', 0)):
2959                                     segment_time += segment_d
2960                                     add_segment_url()
2961                                     segment_number += 1
2962                                 segment_time += segment_d
2963                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2964                         # No media template
2965                         # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2966                         # or any YouTube dashsegments video
2967                         fragments = []
2968                         segment_index = 0
2969                         timescale = representation_ms_info['timescale']
2970                         for s in representation_ms_info['s']:
2971                             duration = float_or_none(s['d'], timescale)
2972                             for r in range(s.get('r', 0) + 1):
2973                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
2974                                 fragments.append({
2975                                     location_key(segment_uri): segment_uri,
2976                                     'duration': duration,
2977                                 })
2978                                 segment_index += 1
2979                         representation_ms_info['fragments'] = fragments
2980                     elif 'segment_urls' in representation_ms_info:
2981                         # Segment URLs with no SegmentTimeline
2982                         # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2983                         # https://github.com/ytdl-org/youtube-dl/pull/14844
2984                         fragments = []
2985                         segment_duration = float_or_none(
2986                             representation_ms_info['segment_duration'],
2987                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2988                         for segment_url in representation_ms_info['segment_urls']:
2989                             fragment = {
2990                                 location_key(segment_url): segment_url,
2991                             }
2992                             if segment_duration:
2993                                 fragment['duration'] = segment_duration
2994                             fragments.append(fragment)
2995                         representation_ms_info['fragments'] = fragments
2996                     # If there is a fragments key available then we correctly recognized fragmented media.
2997                     # Otherwise we will assume unfragmented media with direct access. Technically, such
2998                     # assumption is not necessarily correct since we may simply have no support for
2999                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
3000                     if 'fragments' in representation_ms_info:
3001                         f.update({
3002                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
3003                             'url': mpd_url or base_url,
3004                             'fragment_base_url': base_url,
3005                             'fragments': [],
3006                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
3007                         })
3008                         if 'initialization_url' in representation_ms_info:
3009                             initialization_url = representation_ms_info['initialization_url']
3010                             if not f.get('url'):
3011                                 f['url'] = initialization_url
3012                             f['fragments'].append({location_key(initialization_url): initialization_url})
3013                         f['fragments'].extend(representation_ms_info['fragments'])
3014                         if not period_duration:
3015                             period_duration = try_get(
3016                                 representation_ms_info,
3017                                 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
3018                     else:
3019                         # Assuming direct URL to unfragmented media.
3020                         f['url'] = base_url
3021                     if content_type in ('video', 'audio', 'image/jpeg'):
3022                         f['manifest_stream_number'] = stream_numbers[f['url']]
3023                         stream_numbers[f['url']] += 1
3024                         formats.append(f)
3025                     elif content_type == 'text':
3026                         subtitles.setdefault(lang or 'und', []).append(f)
3027
3028         return formats, subtitles
3029
3030     def _extract_ism_formats(self, *args, **kwargs):
3031         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
3032         if subs:
3033             self._report_ignoring_subs('ISM')
3034         return fmts
3035
3036     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
3037         res = self._download_xml_handle(
3038             ism_url, video_id,
3039             note='Downloading ISM manifest' if note is None else note,
3040             errnote='Failed to download ISM manifest' if errnote is None else errnote,
3041             fatal=fatal, data=data, headers=headers, query=query)
3042         if res is False:
3043             return [], {}
3044         ism_doc, urlh = res
3045         if ism_doc is None:
3046             return [], {}
3047
3048         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
3049
3050     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
3051         """
3052         Parse formats from ISM manifest.
3053         References:
3054          1. [MS-SSTR]: Smooth Streaming Protocol,
3055             https://msdn.microsoft.com/en-us/library/ff469518.aspx
3056         """
3057         if ism_doc.get('IsLive') == 'TRUE':
3058             return [], {}
3059
3060         duration = int(ism_doc.attrib['Duration'])
3061         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
3062
3063         formats = []
3064         subtitles = {}
3065         for stream in ism_doc.findall('StreamIndex'):
3066             stream_type = stream.get('Type')
3067             if stream_type not in ('video', 'audio', 'text'):
3068                 continue
3069             url_pattern = stream.attrib['Url']
3070             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
3071             stream_name = stream.get('Name')
3072             stream_language = stream.get('Language', 'und')
3073             for track in stream.findall('QualityLevel'):
3074                 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
3075                 # TODO: add support for WVC1 and WMAP
3076                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
3077                     self.report_warning('%s is not a supported codec' % fourcc)
3078                     continue
3079                 tbr = int(track.attrib['Bitrate']) // 1000
3080                 # [1] does not mention Width and Height attributes. However,
3081                 # they're often present while MaxWidth and MaxHeight are
3082                 # missing, so should be used as fallbacks
3083                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3084                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3085                 sampling_rate = int_or_none(track.get('SamplingRate'))
3086
3087                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3088                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
3089
3090                 fragments = []
3091                 fragment_ctx = {
3092                     'time': 0,
3093                 }
3094                 stream_fragments = stream.findall('c')
3095                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3096                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3097                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3098                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3099                     if not fragment_ctx['duration']:
3100                         try:
3101                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
3102                         except IndexError:
3103                             next_fragment_time = duration
3104                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3105                     for _ in range(fragment_repeat):
3106                         fragments.append({
3107                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
3108                             'duration': fragment_ctx['duration'] / stream_timescale,
3109                         })
3110                         fragment_ctx['time'] += fragment_ctx['duration']
3111
3112                 if stream_type == 'text':
3113                     subtitles.setdefault(stream_language, []).append({
3114                         'ext': 'ismt',
3115                         'protocol': 'ism',
3116                         'url': ism_url,
3117                         'manifest_url': ism_url,
3118                         'fragments': fragments,
3119                         '_download_params': {
3120                             'stream_type': stream_type,
3121                             'duration': duration,
3122                             'timescale': stream_timescale,
3123                             'fourcc': fourcc,
3124                             'language': stream_language,
3125                             'codec_private_data': track.get('CodecPrivateData'),
3126                         }
3127                     })
3128                 elif stream_type in ('video', 'audio'):
3129                     formats.append({
3130                         'format_id': join_nonempty(ism_id, stream_name, tbr),
3131                         'url': ism_url,
3132                         'manifest_url': ism_url,
3133                         'ext': 'ismv' if stream_type == 'video' else 'isma',
3134                         'width': width,
3135                         'height': height,
3136                         'tbr': tbr,
3137                         'asr': sampling_rate,
3138                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
3139                         'acodec': 'none' if stream_type == 'video' else fourcc,
3140                         'protocol': 'ism',
3141                         'fragments': fragments,
3142                         'has_drm': ism_doc.find('Protection') is not None,
3143                         '_download_params': {
3144                             'stream_type': stream_type,
3145                             'duration': duration,
3146                             'timescale': stream_timescale,
3147                             'width': width or 0,
3148                             'height': height or 0,
3149                             'fourcc': fourcc,
3150                             'language': stream_language,
3151                             'codec_private_data': track.get('CodecPrivateData'),
3152                             'sampling_rate': sampling_rate,
3153                             'channels': int_or_none(track.get('Channels', 2)),
3154                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3155                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3156                         },
3157                     })
3158         return formats, subtitles
3159
3160     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
3161         def absolute_url(item_url):
3162             return urljoin(base_url, item_url)
3163
3164         def parse_content_type(content_type):
3165             if not content_type:
3166                 return {}
3167             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3168             if ctr:
3169                 mimetype, codecs = ctr.groups()
3170                 f = parse_codecs(codecs)
3171                 f['ext'] = mimetype2ext(mimetype)
3172                 return f
3173             return {}
3174
3175         def _media_formats(src, cur_media_type, type_info={}):
3176             full_url = absolute_url(src)
3177             ext = type_info.get('ext') or determine_ext(full_url)
3178             if ext == 'm3u8':
3179                 is_plain_url = False
3180                 formats = self._extract_m3u8_formats(
3181                     full_url, video_id, ext='mp4',
3182                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3183                     preference=preference, quality=quality, fatal=False)
3184             elif ext == 'mpd':
3185                 is_plain_url = False
3186                 formats = self._extract_mpd_formats(
3187                     full_url, video_id, mpd_id=mpd_id, fatal=False)
3188             else:
3189                 is_plain_url = True
3190                 formats = [{
3191                     'url': full_url,
3192                     'vcodec': 'none' if cur_media_type == 'audio' else None,
3193                 }]
3194             return is_plain_url, formats
3195
3196         entries = []
3197         # amp-video and amp-audio are very similar to their HTML5 counterparts
3198         # so we wll include them right here (see
3199         # https://www.ampproject.org/docs/reference/components/amp-video)
3200         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3201         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3202         media_tags = [(media_tag, media_tag_name, media_type, '')
3203                       for media_tag, media_tag_name, media_type
3204                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3205         media_tags.extend(re.findall(
3206             # We only allow video|audio followed by a whitespace or '>'.
3207             # Allowing more characters may end up in significant slow down (see
3208             # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
3209             # http://www.porntrex.com/maps/videositemap.xml).
3210             r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3211         for media_tag, _, media_type, media_content in media_tags:
3212             media_info = {
3213                 'formats': [],
3214                 'subtitles': {},
3215             }
3216             media_attributes = extract_attributes(media_tag)
3217             src = strip_or_none(media_attributes.get('src'))
3218             if src:
3219                 _, formats = _media_formats(src, media_type)
3220                 media_info['formats'].extend(formats)
3221             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3222             if media_content:
3223                 for source_tag in re.findall(r'<source[^>]+>', media_content):
3224                     s_attr = extract_attributes(source_tag)
3225                     # data-video-src and data-src are non standard but seen
3226                     # several times in the wild
3227                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
3228                     if not src:
3229                         continue
3230                     f = parse_content_type(s_attr.get('type'))
3231                     is_plain_url, formats = _media_formats(src, media_type, f)
3232                     if is_plain_url:
3233                         # width, height, res, label and title attributes are
3234                         # all not standard but seen several times in the wild
3235                         labels = [
3236                             s_attr.get(lbl)
3237                             for lbl in ('label', 'title')
3238                             if str_or_none(s_attr.get(lbl))
3239                         ]
3240                         width = int_or_none(s_attr.get('width'))
3241                         height = (int_or_none(s_attr.get('height'))
3242                                   or int_or_none(s_attr.get('res')))
3243                         if not width or not height:
3244                             for lbl in labels:
3245                                 resolution = parse_resolution(lbl)
3246                                 if not resolution:
3247                                     continue
3248                                 width = width or resolution.get('width')
3249                                 height = height or resolution.get('height')
3250                         for lbl in labels:
3251                             tbr = parse_bitrate(lbl)
3252                             if tbr:
3253                                 break
3254                         else:
3255                             tbr = None
3256                         f.update({
3257                             'width': width,
3258                             'height': height,
3259                             'tbr': tbr,
3260                             'format_id': s_attr.get('label') or s_attr.get('title'),
3261                         })
3262                         f.update(formats[0])
3263                         media_info['formats'].append(f)
3264                     else:
3265                         media_info['formats'].extend(formats)
3266                 for track_tag in re.findall(r'<track[^>]+>', media_content):
3267                     track_attributes = extract_attributes(track_tag)
3268                     kind = track_attributes.get('kind')
3269                     if not kind or kind in ('subtitles', 'captions'):
3270                         src = strip_or_none(track_attributes.get('src'))
3271                         if not src:
3272                             continue
3273                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3274                         media_info['subtitles'].setdefault(lang, []).append({
3275                             'url': absolute_url(src),
3276                         })
3277             for f in media_info['formats']:
3278                 f.setdefault('http_headers', {})['Referer'] = base_url
3279             if media_info['formats'] or media_info['subtitles']:
3280                 entries.append(media_info)
3281         return entries
3282
3283     def _extract_akamai_formats(self, *args, **kwargs):
3284         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3285         if subs:
3286             self._report_ignoring_subs('akamai')
3287         return fmts
3288
3289     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3290         signed = 'hdnea=' in manifest_url
3291         if not signed:
3292             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3293             manifest_url = re.sub(
3294                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3295                 '', manifest_url).strip('?')
3296
3297         formats = []
3298         subtitles = {}
3299
3300         hdcore_sign = 'hdcore=3.7.0'
3301         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3302         hds_host = hosts.get('hds')
3303         if hds_host:
3304             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3305         if 'hdcore=' not in f4m_url:
3306             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3307         f4m_formats = self._extract_f4m_formats(
3308             f4m_url, video_id, f4m_id='hds', fatal=False)
3309         for entry in f4m_formats:
3310             entry.update({'extra_param_to_segment_url': hdcore_sign})
3311         formats.extend(f4m_formats)
3312
3313         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3314         hls_host = hosts.get('hls')
3315         if hls_host:
3316             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3317         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3318             m3u8_url, video_id, 'mp4', 'm3u8_native',
3319             m3u8_id='hls', fatal=False)
3320         formats.extend(m3u8_formats)
3321         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3322
3323         http_host = hosts.get('http')
3324         if http_host and m3u8_formats and not signed:
3325             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3326             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3327             qualities_length = len(qualities)
3328             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3329                 i = 0
3330                 for f in m3u8_formats:
3331                     if f['vcodec'] != 'none':
3332                         for protocol in ('http', 'https'):
3333                             http_f = f.copy()
3334                             del http_f['manifest_url']
3335                             http_url = re.sub(
3336                                 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
3337                             http_f.update({
3338                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3339                                 'url': http_url,
3340                                 'protocol': protocol,
3341                             })
3342                             formats.append(http_f)
3343                         i += 1
3344
3345         return formats, subtitles
3346
3347     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3348         query = compat_urlparse.urlparse(url).query
3349         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3350         mobj = re.search(
3351             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3352         url_base = mobj.group('url')
3353         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3354         formats = []
3355
3356         def manifest_url(manifest):
3357             m_url = '%s/%s' % (http_base_url, manifest)
3358             if query:
3359                 m_url += '?%s' % query
3360             return m_url
3361
3362         if 'm3u8' not in skip_protocols:
3363             formats.extend(self._extract_m3u8_formats(
3364                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3365                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3366         if 'f4m' not in skip_protocols:
3367             formats.extend(self._extract_f4m_formats(
3368                 manifest_url('manifest.f4m'),
3369                 video_id, f4m_id='hds', fatal=False))
3370         if 'dash' not in skip_protocols:
3371             formats.extend(self._extract_mpd_formats(
3372                 manifest_url('manifest.mpd'),
3373                 video_id, mpd_id='dash', fatal=False))
3374         if re.search(r'(?:/smil:|\.smil)', url_base):
3375             if 'smil' not in skip_protocols:
3376                 rtmp_formats = self._extract_smil_formats(
3377                     manifest_url('jwplayer.smil'),
3378                     video_id, fatal=False)
3379                 for rtmp_format in rtmp_formats:
3380                     rtsp_format = rtmp_format.copy()
3381                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3382                     del rtsp_format['play_path']
3383                     del rtsp_format['ext']
3384                     rtsp_format.update({
3385                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3386                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3387                         'protocol': 'rtsp',
3388                     })
3389                     formats.extend([rtmp_format, rtsp_format])
3390         else:
3391             for protocol in ('rtmp', 'rtsp'):
3392                 if protocol not in skip_protocols:
3393                     formats.append({
3394                         'url': '%s:%s' % (protocol, url_base),
3395                         'format_id': protocol,
3396                         'protocol': protocol,
3397                     })
3398         return formats
3399
3400     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3401         mobj = re.search(
3402             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3403             webpage)
3404         if mobj:
3405             try:
3406                 jwplayer_data = self._parse_json(mobj.group('options'),
3407                                                  video_id=video_id,
3408                                                  transform_source=transform_source)
3409             except ExtractorError:
3410                 pass
3411             else:
3412                 if isinstance(jwplayer_data, dict):
3413                     return jwplayer_data
3414
3415     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3416         jwplayer_data = self._find_jwplayer_data(
3417             webpage, video_id, transform_source=js_to_json)
3418         return self._parse_jwplayer_data(
3419             jwplayer_data, video_id, *args, **kwargs)
3420
3421     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3422                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3423         # JWPlayer backward compatibility: flattened playlists
3424         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3425         if 'playlist' not in jwplayer_data:
3426             jwplayer_data = {'playlist': [jwplayer_data]}
3427
3428         entries = []
3429
3430         # JWPlayer backward compatibility: single playlist item
3431         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3432         if not isinstance(jwplayer_data['playlist'], list):
3433             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3434
3435         for video_data in jwplayer_data['playlist']:
3436             # JWPlayer backward compatibility: flattened sources
3437             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3438             if 'sources' not in video_data:
3439                 video_data['sources'] = [video_data]
3440
3441             this_video_id = video_id or video_data['mediaid']
3442
3443             formats = self._parse_jwplayer_formats(
3444                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3445                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3446
3447             subtitles = {}
3448             tracks = video_data.get('tracks')
3449             if tracks and isinstance(tracks, list):
3450                 for track in tracks:
3451                     if not isinstance(track, dict):
3452                         continue
3453                     track_kind = track.get('kind')
3454                     if not track_kind or not isinstance(track_kind, compat_str):
3455                         continue
3456                     if track_kind.lower() not in ('captions', 'subtitles'):
3457                         continue
3458                     track_url = urljoin(base_url, track.get('file'))
3459                     if not track_url:
3460                         continue
3461                     subtitles.setdefault(track.get('label') or 'en', []).append({
3462                         'url': self._proto_relative_url(track_url)
3463                     })
3464
3465             entry = {
3466                 'id': this_video_id,
3467                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3468                 'description': clean_html(video_data.get('description')),
3469                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3470                 'timestamp': int_or_none(video_data.get('pubdate')),
3471                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3472                 'subtitles': subtitles,
3473             }
3474             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3475             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3476                 entry.update({
3477                     '_type': 'url_transparent',
3478                     'url': formats[0]['url'],
3479                 })
3480             else:
3481                 self._sort_formats(formats)
3482                 entry['formats'] = formats
3483             entries.append(entry)
3484         if len(entries) == 1:
3485             return entries[0]
3486         else:
3487             return self.playlist_result(entries)
3488
3489     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3490                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3491         urls = []
3492         formats = []
3493         for source in jwplayer_sources_data:
3494             if not isinstance(source, dict):
3495                 continue
3496             source_url = urljoin(
3497                 base_url, self._proto_relative_url(source.get('file')))
3498             if not source_url or source_url in urls:
3499                 continue
3500             urls.append(source_url)
3501             source_type = source.get('type') or ''
3502             ext = mimetype2ext(source_type) or determine_ext(source_url)
3503             if source_type == 'hls' or ext == 'm3u8':
3504                 formats.extend(self._extract_m3u8_formats(
3505                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3506                     m3u8_id=m3u8_id, fatal=False))
3507             elif source_type == 'dash' or ext == 'mpd':
3508                 formats.extend(self._extract_mpd_formats(
3509                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3510             elif ext == 'smil':
3511                 formats.extend(self._extract_smil_formats(
3512                     source_url, video_id, fatal=False))
3513             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3514             elif source_type.startswith('audio') or ext in (
3515                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3516                 formats.append({
3517                     'url': source_url,
3518                     'vcodec': 'none',
3519                     'ext': ext,
3520                 })
3521             else:
3522                 height = int_or_none(source.get('height'))
3523                 if height is None:
3524                     # Often no height is provided but there is a label in
3525                     # format like "1080p", "720p SD", or 1080.
3526                     height = int_or_none(self._search_regex(
3527                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3528                         'height', default=None))
3529                 a_format = {
3530                     'url': source_url,
3531                     'width': int_or_none(source.get('width')),
3532                     'height': height,
3533                     'tbr': int_or_none(source.get('bitrate')),
3534                     'ext': ext,
3535                 }
3536                 if source_url.startswith('rtmp'):
3537                     a_format['ext'] = 'flv'
3538                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3539                     # of jwplayer.flash.swf
3540                     rtmp_url_parts = re.split(
3541                         r'((?:mp4|mp3|flv):)', source_url, 1)
3542                     if len(rtmp_url_parts) == 3:
3543                         rtmp_url, prefix, play_path = rtmp_url_parts
3544                         a_format.update({
3545                             'url': rtmp_url,
3546                             'play_path': prefix + play_path,
3547                         })
3548                     if rtmp_params:
3549                         a_format.update(rtmp_params)
3550                 formats.append(a_format)
3551         return formats
3552
3553     def _live_title(self, name):
3554         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3555         return name
3556
3557     def _int(self, v, name, fatal=False, **kwargs):
3558         res = int_or_none(v, **kwargs)
3559         if res is None:
3560             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3561             if fatal:
3562                 raise ExtractorError(msg)
3563             else:
3564                 self.report_warning(msg)
3565         return res
3566
3567     def _float(self, v, name, fatal=False, **kwargs):
3568         res = float_or_none(v, **kwargs)
3569         if res is None:
3570             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3571             if fatal:
3572                 raise ExtractorError(msg)
3573             else:
3574                 self.report_warning(msg)
3575         return res
3576
3577     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3578                     path='/', secure=False, discard=False, rest={}, **kwargs):
3579         cookie = compat_cookiejar_Cookie(
3580             0, name, value, port, port is not None, domain, True,
3581             domain.startswith('.'), path, True, secure, expire_time,
3582             discard, None, None, rest)
3583         self._downloader.cookiejar.set_cookie(cookie)
3584
3585     def _get_cookies(self, url):
3586         """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3587         req = sanitized_Request(url)
3588         self._downloader.cookiejar.add_cookie_header(req)
3589         return compat_cookies_SimpleCookie(req.get_header('Cookie'))
3590
3591     def _apply_first_set_cookie_header(self, url_handle, cookie):
3592         """
3593         Apply first Set-Cookie header instead of the last. Experimental.
3594
3595         Some sites (e.g. [1-3]) may serve two cookies under the same name
3596         in Set-Cookie header and expect the first (old) one to be set rather
3597         than second (new). However, as of RFC6265 the newer one cookie
3598         should be set into cookie store what actually happens.
3599         We will workaround this issue by resetting the cookie to
3600         the first one manually.
3601         1. https://new.vk.com/
3602         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3603         3. https://learning.oreilly.com/
3604         """
3605         for header, cookies in url_handle.headers.items():
3606             if header.lower() != 'set-cookie':
3607                 continue
3608             cookies = cookies.encode('iso-8859-1').decode('utf-8')
3609             cookie_value = re.search(
3610                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3611             if cookie_value:
3612                 value, domain = cookie_value.groups()
3613                 self._set_cookie(domain, cookie, value)
3614                 break
3615
3616     def get_testcases(self, include_onlymatching=False):
3617         t = getattr(self, '_TEST', None)
3618         if t:
3619             assert not hasattr(self, '_TESTS'), \
3620                 '%s has _TEST and _TESTS' % type(self).__name__
3621             tests = [t]
3622         else:
3623             tests = getattr(self, '_TESTS', [])
3624         for t in tests:
3625             if not include_onlymatching and t.get('only_matching', False):
3626                 continue
3627             t['name'] = type(self).__name__[:-len('IE')]
3628             yield t
3629
3630     def is_suitable(self, age_limit):
3631         """ Test whether the extractor is generally suitable for the given
3632         age limit (i.e. pornographic sites are not, all others usually are) """
3633
3634         any_restricted = False
3635         for tc in self.get_testcases(include_onlymatching=False):
3636             if tc.get('playlist', []):
3637                 tc = tc['playlist'][0]
3638             is_restricted = age_restricted(
3639                 tc.get('info_dict', {}).get('age_limit'), age_limit)
3640             if not is_restricted:
3641                 return True
3642             any_restricted = any_restricted or is_restricted
3643         return not any_restricted
3644
3645     def extract_subtitles(self, *args, **kwargs):
3646         if (self.get_param('writesubtitles', False)
3647                 or self.get_param('listsubtitles')):
3648             return self._get_subtitles(*args, **kwargs)
3649         return {}
3650
3651     def _get_subtitles(self, *args, **kwargs):
3652         raise NotImplementedError('This method must be implemented by subclasses')
3653
3654     def extract_comments(self, *args, **kwargs):
3655         if not self.get_param('getcomments'):
3656             return None
3657         generator = self._get_comments(*args, **kwargs)
3658
3659         def extractor():
3660             comments = []
3661             interrupted = True
3662             try:
3663                 while True:
3664                     comments.append(next(generator))
3665             except StopIteration:
3666                 interrupted = False
3667             except KeyboardInterrupt:
3668                 self.to_screen('Interrupted by user')
3669             except Exception as e:
3670                 if self.get_param('ignoreerrors') is not True:
3671                     raise
3672                 self._downloader.report_error(e)
3673             comment_count = len(comments)
3674             self.to_screen(f'Extracted {comment_count} comments')
3675             return {
3676                 'comments': comments,
3677                 'comment_count': None if interrupted else comment_count
3678             }
3679         return extractor
3680
3681     def _get_comments(self, *args, **kwargs):
3682         raise NotImplementedError('This method must be implemented by subclasses')
3683
3684     @staticmethod
3685     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3686         """ Merge subtitle items for one language. Items with duplicated URLs/data
3687         will be dropped. """
3688         list1_data = set((item.get('url'), item.get('data')) for item in subtitle_list1)
3689         ret = list(subtitle_list1)
3690         ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3691         return ret
3692
3693     @classmethod
3694     def _merge_subtitles(cls, *dicts, target=None):
3695         """ Merge subtitle dictionaries, language by language. """
3696         if target is None:
3697             target = {}
3698         for d in dicts:
3699             for lang, subs in d.items():
3700                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3701         return target
3702
3703     def extract_automatic_captions(self, *args, **kwargs):
3704         if (self.get_param('writeautomaticsub', False)
3705                 or self.get_param('listsubtitles')):
3706             return self._get_automatic_captions(*args, **kwargs)
3707         return {}
3708
3709     def _get_automatic_captions(self, *args, **kwargs):
3710         raise NotImplementedError('This method must be implemented by subclasses')
3711
3712     def mark_watched(self, *args, **kwargs):
3713         if not self.get_param('mark_watched', False):
3714             return
3715         if (self.supports_login() and self._get_login_info()[0] is not None
3716                 or self.get_param('cookiefile') or self.get_param('cookiesfrombrowser')):
3717             self._mark_watched(*args, **kwargs)
3718
3719     def _mark_watched(self, *args, **kwargs):
3720         raise NotImplementedError('This method must be implemented by subclasses')
3721
3722     def geo_verification_headers(self):
3723         headers = {}
3724         geo_verification_proxy = self.get_param('geo_verification_proxy')
3725         if geo_verification_proxy:
3726             headers['Ytdl-request-proxy'] = geo_verification_proxy
3727         return headers
3728
3729     def _generic_id(self, url):
3730         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3731
3732     def _generic_title(self, url):
3733         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3734
3735     @staticmethod
3736     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3737         all_known = all(map(
3738             lambda x: x is not None,
3739             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3740         return (
3741             'private' if is_private
3742             else 'premium_only' if needs_premium
3743             else 'subscriber_only' if needs_subscription
3744             else 'needs_auth' if needs_auth
3745             else 'unlisted' if is_unlisted
3746             else 'public' if all_known
3747             else None)
3748
3749     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3750         '''
3751         @returns            A list of values for the extractor argument given by "key"
3752                             or "default" if no such key is present
3753         @param default      The default value to return when the key is not present (default: [])
3754         @param casesense    When false, the values are converted to lower case
3755         '''
3756         val = traverse_obj(
3757             self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key))
3758         if val is None:
3759             return [] if default is NO_DEFAULT else default
3760         return list(val) if casesense else [x.lower() for x in val]
3761
3762     def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3763         if not playlist_id or not video_id:
3764             return not video_id
3765
3766         no_playlist = (smuggled_data or {}).get('force_noplaylist')
3767         if no_playlist is not None:
3768             return not no_playlist
3769
3770         video_id = '' if video_id is True else f' {video_id}'
3771         playlist_id = '' if playlist_id is True else f' {playlist_id}'
3772         if self.get_param('noplaylist'):
3773             self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3774             return False
3775         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3776         return True
3777
3778
3779 class SearchInfoExtractor(InfoExtractor):
3780     """
3781     Base class for paged search queries extractors.
3782     They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
3783     Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
3784     """
3785
3786     _MAX_RESULTS = float('inf')
3787
3788     @classmethod
3789     def _make_valid_url(cls):
3790         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3791
3792     def _real_extract(self, query):
3793         prefix, query = self._match_valid_url(query).group('prefix', 'query')
3794         if prefix == '':
3795             return self._get_n_results(query, 1)
3796         elif prefix == 'all':
3797             return self._get_n_results(query, self._MAX_RESULTS)
3798         else:
3799             n = int(prefix)
3800             if n <= 0:
3801                 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
3802             elif n > self._MAX_RESULTS:
3803                 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3804                 n = self._MAX_RESULTS
3805             return self._get_n_results(query, n)
3806
3807     def _get_n_results(self, query, n):
3808         """Get a specified number of results for a query.
3809         Either this function or _search_results must be overridden by subclasses """
3810         return self.playlist_result(
3811             itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
3812             query, query)
3813
3814     def _search_results(self, query):
3815         """Returns an iterator of search results"""
3816         raise NotImplementedError('This method must be implemented by subclasses')
3817
3818     @property
3819     def SEARCH_KEY(self):
3820         return self._SEARCH_KEY