yt_dlp/extractor/common.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import base64
   5 import datetime
   6 import hashlib
   7 import itertools
   8 import json
   9 import netrc
  10 import os
  11 import random
  12 import re
  13 import sys
  14 import time
  15 import math
  16
  17 from ..compat import (
  18     compat_cookiejar_Cookie,
  19     compat_cookies_SimpleCookie,
  20     compat_etree_Element,
  21     compat_etree_fromstring,
  22     compat_expanduser,
  23     compat_getpass,
  24     compat_http_client,
  25     compat_os_name,
  26     compat_str,
  27     compat_urllib_error,
  28     compat_urllib_parse_unquote,
  29     compat_urllib_parse_urlencode,
  30     compat_urllib_request,
  31     compat_urlparse,
  32     compat_xml_parse_error,
  33 )
  34 from ..downloader import FileDownloader
  35 from ..downloader.f4m import (
  36     get_base_url,
  37     remove_encrypted_media,
  38 )
  39 from ..utils import (
  40     age_restricted,
  41     base_url,
  42     bug_reports_message,
  43     clean_html,
  44     compiled_regex_type,
  45     determine_ext,
  46     determine_protocol,
  47     dict_get,
  48     error_to_compat_str,
  49     extract_attributes,
  50     ExtractorError,
  51     fix_xml_ampersands,
  52     float_or_none,
  53     format_field,
  54     GeoRestrictedError,
  55     GeoUtils,
  56     int_or_none,
  57     join_nonempty,
  58     js_to_json,
  59     JSON_LD_RE,
  60     mimetype2ext,
  61     network_exceptions,
  62     NO_DEFAULT,
  63     orderedSet,
  64     parse_bitrate,
  65     parse_codecs,
  66     parse_duration,
  67     parse_iso8601,
  68     parse_m3u8_attributes,
  69     parse_resolution,
  70     RegexNotFoundError,
  71     sanitize_filename,
  72     sanitized_Request,
  73     str_or_none,
  74     str_to_int,
  75     strip_or_none,
  76     traverse_obj,
  77     unescapeHTML,
  78     UnsupportedError,
  79     unified_strdate,
  80     unified_timestamp,
  81     update_Request,
  82     update_url_query,
  83     url_basename,
  84     url_or_none,
  85     urljoin,
  86     variadic,
  87     xpath_element,
  88     xpath_text,
  89     xpath_with_ns,
  90 )
  91
  92
  93 class InfoExtractor(object):
  94     """Information Extractor class.
  95
  96     Information extractors are the classes that, given a URL, extract
  97     information about the video (or videos) the URL refers to. This
  98     information includes the real video URL, the video title, author and
  99     others. The information is stored in a dictionary which is then
 100     passed to the YoutubeDL. The YoutubeDL processes this
 101     information possibly downloading the video to the file system, among
 102     other possible outcomes.
 103
 104     The type field determines the type of the result.
 105     By far the most common value (and the default if _type is missing) is
 106     "video", which indicates a single video.
 107
 108     For a video, the dictionaries must include the following fields:
 109
 110     id:             Video identifier.
 111     title:          Video title, unescaped.
 112
 113     Additionally, it must contain either a formats entry or a url one:
 114
 115     formats:        A list of dictionaries for each format available, ordered
 116                     from worst to best quality.
 117
 118                     Potential fields:
 119                     * url        The mandatory URL representing the media:
 120                                    for plain file media - HTTP URL of this file,
 121                                    for RTMP - RTMP URL,
 122                                    for HLS - URL of the M3U8 media playlist,
 123                                    for HDS - URL of the F4M manifest,
 124                                    for DASH
 125                                      - HTTP URL to plain file media (in case of
 126                                        unfragmented media)
 127                                      - URL of the MPD manifest or base URL
 128                                        representing the media if MPD manifest
 129                                        is parsed from a string (in case of
 130                                        fragmented media)
 131                                    for MSS - URL of the ISM manifest.
 132                     * manifest_url
 133                                  The URL of the manifest file in case of
 134                                  fragmented media:
 135                                    for HLS - URL of the M3U8 master playlist,
 136                                    for HDS - URL of the F4M manifest,
 137                                    for DASH - URL of the MPD manifest,
 138                                    for MSS - URL of the ISM manifest.
 139                     * ext        Will be calculated from URL if missing
 140                     * format     A human-readable description of the format
 141                                  ("mp4 container with h264/opus").
 142                                  Calculated from the format_id, width, height.
 143                                  and format_note fields if missing.
 144                     * format_id  A short description of the format
 145                                  ("mp4_h264_opus" or "19").
 146                                 Technically optional, but strongly recommended.
 147                     * format_note Additional info about the format
 148                                  ("3D" or "DASH video")
 149                     * width      Width of the video, if known
 150                     * height     Height of the video, if known
 151                     * resolution Textual description of width and height
 152                     * dynamic_range The dynamic range of the video. One of:
                                 "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
 154                     * tbr        Average bitrate of audio and video in KBit/s
 155                     * abr        Average audio bitrate in KBit/s
 156                     * acodec     Name of the audio codec in use
 157                     * asr        Audio sampling rate in Hertz
 158                     * vbr        Average video bitrate in KBit/s
 159                     * fps        Frame rate
 160                     * vcodec     Name of the video codec in use
 161                     * container  Name of the container format
 162                     * filesize   The number of bytes, if known in advance
 163                     * filesize_approx  An estimate for the number of bytes
 164                     * player_url SWF Player URL (used for rtmpdump).
 165                     * protocol   The protocol that will be used for the actual
 166                                  download, lower-case.
 167                                  "http", "https", "rtsp", "rtmp", "rtmp_ffmpeg", "rtmpe",
 168                                  "m3u8", "m3u8_native" or "http_dash_segments".
 169                     * fragment_base_url
 170                                  Base URL for fragments. Each fragment's path
 171                                  value (if present) will be relative to
 172                                  this URL.
 173                     * fragments  A list of fragments of a fragmented media.
 174                                  Each fragment entry must contain either an url
 175                                  or a path. If an url is present it should be
 176                                  considered by a client. Otherwise both path and
 177                                  fragment_base_url must be present. Here is
 178                                  the list of all potential fields:
 179                                  * "url" - fragment's URL
 180                                  * "path" - fragment's path relative to
 181                                             fragment_base_url
 182                                  * "duration" (optional, int or float)
 183                                  * "filesize" (optional, int)
 184                     * preference Order number of this format. If this field is
 185                                  present and not None, the formats get sorted
 186                                  by this field, regardless of all other values.
 187                                  -1 for default (order by other properties),
 188                                  -2 or smaller for less than default.
 189                                  < -1000 to hide the format (if there is
 190                                     another one which is strictly better)
 191                     * language   Language code, e.g. "de" or "en-US".
 192                     * language_preference  Is this in the language mentioned in
 193                                  the URL?
 194                                  10 if it's what the URL is about,
 195                                  -1 for default (don't know),
 196                                  -10 otherwise, other values reserved for now.
 197                     * quality    Order number of the video quality of this
 198                                  format, irrespective of the file format.
 199                                  -1 for default (order by other properties),
 200                                  -2 or smaller for less than default.
 201                     * source_preference  Order number for this video source
 202                                   (quality takes higher priority)
 203                                  -1 for default (order by other properties),
 204                                  -2 or smaller for less than default.
 205                     * http_headers  A dictionary of additional HTTP headers
 206                                  to add to the request.
 207                     * stretched_ratio  If given and not 1, indicates that the
 208                                  video's pixels are not square.
 209                                  width : height ratio as float.
 210                     * no_resume  The server does not support resuming the
 211                                  (HTTP or RTMP) download. Boolean.
 212                     * has_drm    The format has DRM and cannot be downloaded. Boolean
 213                     * downloader_options  A dictionary of downloader options as
 214                                  described in FileDownloader
 215                     RTMP formats can also have the additional fields: page_url,
 216                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 217                     rtmp_protocol, rtmp_real_time
 218
 219     url:            Final video URL.
 220     ext:            Video filename extension.
 221     format:         The video format, defaults to ext (used for --get-format)
 222     player_url:     SWF Player URL (used for rtmpdump).
 223
 224     The following fields are optional:
 225
 226     alt_title:      A secondary title of the video.
 227     display_id      An alternative identifier for the video, not necessarily
 228                     unique, but available before title. Typically, id is
 229                     something like "4234987", title "Dancing naked mole rats",
 230                     and display_id "dancing-naked-mole-rats"
 231     thumbnails:     A list of dictionaries, with the following entries:
 232                         * "id" (optional, string) - Thumbnail format ID
 233                         * "url"
 234                         * "preference" (optional, int) - quality of the image
 235                         * "width" (optional, int)
 236                         * "height" (optional, int)
 237                         * "resolution" (optional, string "{width}x{height}",
 238                                         deprecated)
 239                         * "filesize" (optional, int)
 240     thumbnail:      Full URL to a video thumbnail image.
 241     description:    Full video description.
 242     uploader:       Full name of the video uploader.
 243     license:        License name the video is licensed under.
 244     creator:        The creator of the video.
 245     release_timestamp: UNIX timestamp of the moment the video was released.
 246     release_date:   The date (YYYYMMDD) when the video was released.
 247     timestamp:      UNIX timestamp of the moment the video was uploaded
 248     upload_date:    Video upload date (YYYYMMDD).
 249                     If not explicitly set, calculated from timestamp.
 250     uploader_id:    Nickname or id of the video uploader.
 251     uploader_url:   Full URL to a personal webpage of the video uploader.
 252     channel:        Full name of the channel the video is uploaded on.
 253                     Note that channel fields may or may not repeat uploader
 254                     fields. This depends on a particular extractor.
 255     channel_id:     Id of the channel.
 256     channel_url:    Full URL to a channel webpage.
 257     location:       Physical location where the video was filmed.
 258     subtitles:      The available subtitles as a dictionary in the format
 259                     {tag: subformats}. "tag" is usually a language code, and
 260                     "subformats" is a list sorted from lower to higher
 261                     preference, each element is a dictionary with the "ext"
 262                     entry and one of:
 263                         * "data": The subtitles file contents
 264                         * "url": A URL pointing to the subtitles file
 265                     It can optionally also have:
 266                         * "name": Name or description of the subtitles
 267                     "ext" will be calculated from URL if missing
 268     automatic_captions: Like 'subtitles'; contains automatically generated
 269                     captions instead of normal subtitles
 270     duration:       Length of the video in seconds, as an integer or float.
 271     view_count:     How many users have watched the video on the platform.
 272     like_count:     Number of positive ratings of the video
 273     dislike_count:  Number of negative ratings of the video
 274     repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
 276     comment_count:  Number of comments on the video
 277     comments:       A list of comments, each with one or more of the following
 278                     properties (all but one of text or html optional):
 279                         * "author" - human-readable name of the comment author
 280                         * "author_id" - user ID of the comment author
 281                         * "author_thumbnail" - The thumbnail of the comment author
 282                         * "id" - Comment ID
 283                         * "html" - Comment as HTML
 284                         * "text" - Plain text of the comment
 285                         * "timestamp" - UNIX timestamp of comment
 286                         * "parent" - ID of the comment this one is replying to.
 287                                      Set to "root" to indicate that this is a
 288                                      comment to the original video.
 289                         * "like_count" - Number of positive ratings of the comment
 290                         * "dislike_count" - Number of negative ratings of the comment
 291                         * "is_favorited" - Whether the comment is marked as
 292                                            favorite by the video uploader
 293                         * "author_is_uploader" - Whether the comment is made by
 294                                                  the video uploader
 295     age_limit:      Age restriction for the video, as an integer (years)
 296     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 297                     should allow to get the same result again. (It will be set
 298                     by YoutubeDL if it's missing)
 299     categories:     A list of categories that the video falls in, for example
 300                     ["Sports", "Berlin"]
 301     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 302     cast:           A list of the video cast
 303     is_live:        True, False, or None (=unknown). Whether this video is a
 304                     live stream that goes on instead of a fixed-length video.
 305     was_live:       True, False, or None (=unknown). Whether this video was
 306                     originally a live stream.
 307     live_status:    'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
 308                     If absent, automatically set from is_live, was_live
 309     start_time:     Time in seconds where the reproduction should start, as
 310                     specified in the URL.
 311     end_time:       Time in seconds where the reproduction should end, as
 312                     specified in the URL.
 313     chapters:       A list of dictionaries, with the following entries:
 314                         * "start_time" - The start time of the chapter in seconds
 315                         * "end_time" - The end time of the chapter in seconds
 316                         * "title" (optional, string)
 317     playable_in_embed: Whether this video is allowed to play in embedded
 318                     players on other sites. Can be True (=always allowed),
 319                     False (=never allowed), None (=unknown), or a string
 320                     specifying the criteria for embedability (Eg: 'whitelist')
 321     availability:   Under what condition the video is available. One of
 322                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 323                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 324                     to set it
 325     __post_extractor: A function to be called just before the metadata is
 326                     written to either disk, logger or console. The function
 327                     must return a dict which will be added to the info_dict.
                    This is useful for additional information that is
 329                     time-consuming to extract. Note that the fields thus
 330                     extracted will not be available to output template and
 331                     match_filter. So, only "comments" and "comment_count" are
 332                     currently allowed to be extracted via this method.
 333
 334     The following fields should only be used when the video belongs to some logical
 335     chapter or section:
 336
 337     chapter:        Name or title of the chapter the video belongs to.
 338     chapter_number: Number of the chapter the video belongs to, as an integer.
 339     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 340
 341     The following fields should only be used when the video is an episode of some
 342     series, programme or podcast:
 343
 344     series:         Title of the series or programme the video episode belongs to.
 345     season:         Title of the season the video episode belongs to.
 346     season_number:  Number of the season the video episode belongs to, as an integer.
 347     season_id:      Id of the season the video episode belongs to, as a unicode string.
 348     episode:        Title of the video episode. Unlike mandatory video title field,
 349                     this field should denote the exact title of the video episode
 350                     without any kind of decoration.
 351     episode_number: Number of the video episode within a season, as an integer.
 352     episode_id:     Id of the video episode, as a unicode string.
 353
 354     The following fields should only be used when the media is a track or a part of
 355     a music album:
 356
 357     track:          Title of the track.
 358     track_number:   Number of the track within an album or a disc, as an integer.
 359     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 360                     as a unicode string.
 361     artist:         Artist(s) of the track.
 362     genre:          Genre(s) of the track.
 363     album:          Title of the album the track belongs to.
 364     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 365     album_artist:   List of all artists appeared on the album (e.g.
 366                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 367                     and compilations).
 368     disc_number:    Number of the disc or other physical medium the track belongs to,
 369                     as an integer.
 370     release_year:   Year (YYYY) when the album was released.
 371
 372     Unless mentioned otherwise, the fields should be Unicode strings.
 373
 374     Unless mentioned otherwise, None is equivalent to absence of information.
 375
 376
 377     _type "playlist" indicates multiple videos.
 378     There must be a key "entries", which is a list, an iterable, or a PagedList
 379     object, each element of which is a valid dictionary by this specification.
 380
    Additionally, playlists can have "id", "title", and any other relevant
 382     attributes with the same semantics as videos (see above).
 383
 384
 385     _type "multi_video" indicates that there are multiple videos that
 386     form a single show, for examples multiple acts of an opera or TV episode.
 387     It must have an entries key like a playlist and contain all the keys
 388     required for a video at the same time.
 389
 390
 391     _type "url" indicates that the video must be extracted from another
 392     location, possibly by a different extractor. Its only required key is:
 393     "url" - the next URL to extract.
 394     The key "ie_key" can be set to the class name (minus the trailing "IE",
 395     e.g. "Youtube") if the extractor class is known in advance.
 396     Additionally, the dictionary may have any properties of the resolved entity
 397     known in advance, for example "title" if the title of the referred video is
 398     known ahead of time.
 399
 400
 401     _type "url_transparent" entities have the same specification as "url", but
 402     indicate that the given additional information is more precise than the one
 403     associated with the resolved URL.
 404     This is useful when a site employs a video service that hosts the video and
 405     its technical metadata, but that video service does not embed a useful
 406     title, description etc.
 407
 408
 409     Subclasses of this one should re-define the _real_initialize() and
 410     _real_extract() methods and define a _VALID_URL regexp.
 411     Probably, they should also be added to the list of extractors.
 412
 413     Subclasses may also override suitable() if necessary, but ensure the function
 414     signature is preserved and that this function imports everything it needs
 415     (except other extractors), so that lazy_extractors works correctly
 416
 417     _GEO_BYPASS attribute may be set to False in order to disable
 418     geo restriction bypass mechanisms for a particular extractor.
 419     Though it won't disable explicit geo restriction bypass based on
 420     country code provided with geo_bypass_country.
 421
 422     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 423     countries for this extractor. One of these countries will be used by
 424     geo restriction bypass mechanism right away in order to bypass
 425     geo restriction, of course, if the mechanism is not disabled.
 426
 427     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 428     IP blocks in CIDR notation for this extractor. One of these IP blocks
 429     will be used by geo restriction bypass mechanism similarly
 430     to _GEO_COUNTRIES.
 431
 432     The _WORKING attribute should be set to False for broken IEs
 433     in order to warn the users and skip the tests.
 434     """
 435
    # Becomes True once initialize() has run _real_initialize() for an instance
    _ready = False
    # Downloader this extractor reports to; __init__ hands its argument to
    # set_downloader() (presumably that is what populates this - confirm)
    _downloader = None
    # Fake IP chosen by the geo bypass code, if any; extract() copies it into
    # the result under the '__x_forwarded_for_ip' key
    _x_forwarded_for_ip = None
    # Geo restriction bypass configuration, see the class docstring
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    # Set to False for broken IEs, see the class docstring
    _WORKING = True

    # Hint messages about supplying credentials, keyed by the kind of
    # authentication ('any', 'cookies' or 'password')
    _LOGIN_HINTS = {
        'any': 'Use --cookies, --username and --password, or --netrc to provide account credentials',
        'cookies': (
            'Use --cookies-from-browser or --cookies for the authentication. '
            'See  https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl  for how to manually pass cookies'),
        'password': 'Use --username and --password, or --netrc to provide account credentials',
    }
 451
 452     def __init__(self, downloader=None):
 453         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 454         If a downloader is not passed during initialization,
 455         it must be set using "set_downloader()" before "extract()" is called"""
 456         self._ready = False
 457         self._x_forwarded_for_ip = None
 458         self._printed_messages = set()
 459         self.set_downloader(downloader)
 460
 461     @classmethod
 462     def _match_valid_url(cls, url):
 463         # This does not use has/getattr intentionally - we want to know whether
 464         # we have cached the regexp for *this* class, whereas getattr would also
 465         # match the superclass
 466         if '_VALID_URL_RE' not in cls.__dict__:
 467             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 468         return cls._VALID_URL_RE.match(url)
 469
 470     @classmethod
 471     def suitable(cls, url):
 472         """Receives a URL and returns True if suitable for this IE."""
 473         # This function must import everything it needs (except other extractors),
 474         # so that lazy_extractors works correctly
 475         return cls._match_valid_url(url) is not None
 476
 477     @classmethod
 478     def _match_id(cls, url):
 479         return cls._match_valid_url(url).group('id')
 480
 481     @classmethod
 482     def get_temp_id(cls, url):
 483         try:
 484             return cls._match_id(url)
 485         except (IndexError, AttributeError):
 486             return None
 487
 488     @classmethod
 489     def working(cls):
 490         """Getter method for _WORKING."""
 491         return cls._WORKING
 492
 493     def initialize(self):
 494         """Initializes an instance (authentication, etc)."""
 495         self._printed_messages = set()
 496         self._initialize_geo_bypass({
 497             'countries': self._GEO_COUNTRIES,
 498             'ip_blocks': self._GEO_IP_BLOCKS,
 499         })
 500         if not self._ready:
 501             self._real_initialize()
 502             self._ready = True
 503
 504     def _initialize_geo_bypass(self, geo_bypass_context):
 505         """
 506         Initialize geo restriction bypass mechanism.
 507
 508         This method is used to initialize geo bypass mechanism based on faking
 509         X-Forwarded-For HTTP header. A random country from provided country list
 510         is selected and a random IP belonging to this country is generated. This
 511         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 512         HTTP requests.
 513
 514         This method will be used for initial geo bypass mechanism initialization
 515         during the instance initialization with _GEO_COUNTRIES and
 516         _GEO_IP_BLOCKS.
 517
 518         You may also manually call it from extractor's code if geo bypass
 519         information is not available beforehand (e.g. obtained during
 520         extraction) or due to some other reason. In this case you should pass
 521         this information in geo bypass context passed as first argument. It may
 522         contain following fields:
 523
 524         countries:  List of geo unrestricted countries (similar
 525                     to _GEO_COUNTRIES)
 526         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 527                     (similar to _GEO_IP_BLOCKS)
 528
 529         """
 530         if not self._x_forwarded_for_ip:
 531
 532             # Geo bypass mechanism is explicitly disabled by user
 533             if not self.get_param('geo_bypass', True):
 534                 return
 535
 536             if not geo_bypass_context:
 537                 geo_bypass_context = {}
 538
 539             # Backward compatibility: previously _initialize_geo_bypass
 540             # expected a list of countries, some 3rd party code may still use
 541             # it this way
 542             if isinstance(geo_bypass_context, (list, tuple)):
 543                 geo_bypass_context = {
 544                     'countries': geo_bypass_context,
 545                 }
 546
 547             # The whole point of geo bypass mechanism is to fake IP
 548             # as X-Forwarded-For HTTP header based on some IP block or
 549             # country code.
 550
 551             # Path 1: bypassing based on IP block in CIDR notation
 552
 553             # Explicit IP block specified by user, use it right away
 554             # regardless of whether extractor is geo bypassable or not
 555             ip_block = self.get_param('geo_bypass_ip_block', None)
 556
 557             # Otherwise use random IP block from geo bypass context but only
 558             # if extractor is known as geo bypassable
 559             if not ip_block:
 560                 ip_blocks = geo_bypass_context.get('ip_blocks')
 561                 if self._GEO_BYPASS and ip_blocks:
 562                     ip_block = random.choice(ip_blocks)
 563
 564             if ip_block:
 565                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 566                 self._downloader.write_debug(
 567                     '[debug] Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
 568                 return
 569
 570             # Path 2: bypassing based on country code
 571
 572             # Explicit country code specified by user, use it right away
 573             # regardless of whether extractor is geo bypassable or not
 574             country = self.get_param('geo_bypass_country', None)
 575
 576             # Otherwise use random country code from geo bypass context but
 577             # only if extractor is known as geo bypassable
 578             if not country:
 579                 countries = geo_bypass_context.get('countries')
 580                 if self._GEO_BYPASS and countries:
 581                     country = random.choice(countries)
 582
 583             if country:
 584                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 585                 self._downloader.write_debug(
 586                     'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))
 587
 588     def extract(self, url):
 589         """Extracts URL information and returns it in list of dicts."""
 590         try:
 591             for _ in range(2):
 592                 try:
 593                     self.initialize()
 594                     self.write_debug('Extracting URL: %s' % url)
 595                     ie_result = self._real_extract(url)
 596                     if ie_result is None:
 597                         return None
 598                     if self._x_forwarded_for_ip:
 599                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 600                     subtitles = ie_result.get('subtitles')
 601                     if (subtitles and 'live_chat' in subtitles
 602                             and 'no-live-chat' in self.get_param('compat_opts', [])):
 603                         del subtitles['live_chat']
 604                     return ie_result
 605                 except GeoRestrictedError as e:
 606                     if self.__maybe_fake_ip_and_retry(e.countries):
 607                         continue
 608                     raise
 609         except UnsupportedError:
 610             raise
 611         except ExtractorError as e:
 612             kwargs = {
 613                 'video_id': e.video_id or self.get_temp_id(url),
 614                 'ie': self.IE_NAME,
 615                 'tb': e.traceback,
 616                 'expected': e.expected,
 617                 'cause': e.cause
 618             }
 619             if hasattr(e, 'countries'):
 620                 kwargs['countries'] = e.countries
 621             raise type(e)(e.msg, **kwargs)
 622         except compat_http_client.IncompleteRead as e:
 623             raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
 624         except (KeyError, StopIteration) as e:
 625             raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 626
 627     def __maybe_fake_ip_and_retry(self, countries):
 628         if (not self.get_param('geo_bypass_country', None)
 629                 and self._GEO_BYPASS
 630                 and self.get_param('geo_bypass', True)
 631                 and not self._x_forwarded_for_ip
 632                 and countries):
 633             country_code = random.choice(countries)
 634             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 635             if self._x_forwarded_for_ip:
 636                 self.report_warning(
 637                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 638                     % (self._x_forwarded_for_ip, country_code.upper()))
 639                 return True
 640         return False
 641
    def set_downloader(self, downloader):
        """Sets the downloader for this IE (stored as self._downloader)."""
        self._downloader = downloader
 645
 646     def _real_initialize(self):
 647         """Real initialization process. Redefine in subclasses."""
 648         pass
 649
 650     def _real_extract(self, url):
 651         """Real extraction process. Redefine in subclasses."""
 652         pass
 653
 654     @classmethod
 655     def ie_key(cls):
 656         """A string for getting the InfoExtractor with get_info_extractor"""
 657         return cls.__name__[:-2]
 658
 659     @property
 660     def IE_NAME(self):
 661         return compat_str(type(self).__name__[:-2])
 662
 663     @staticmethod
 664     def __can_accept_status_code(err, expected_status):
 665         assert isinstance(err, compat_urllib_error.HTTPError)
 666         if expected_status is None:
 667             return False
 668         elif callable(expected_status):
 669             return expected_status(err.code) is True
 670         else:
 671             return err.code in variadic(expected_status)
 672
 673     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
 674         """
 675         Return the response handle.
 676
 677         See _download_webpage docstring for arguments specification.
 678         """
 679         if not self._downloader._first_webpage_request:
 680             sleep_interval = self.get_param('sleep_interval_requests') or 0
 681             if sleep_interval > 0:
 682                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 683                 time.sleep(sleep_interval)
 684         else:
 685             self._downloader._first_webpage_request = False
 686
 687         if note is None:
 688             self.report_download_webpage(video_id)
 689         elif note is not False:
 690             if video_id is None:
 691                 self.to_screen('%s' % (note,))
 692             else:
 693                 self.to_screen('%s: %s' % (video_id, note))
 694
 695         # Some sites check X-Forwarded-For HTTP header in order to figure out
 696         # the origin of the client behind proxy. This allows bypassing geo
 697         # restriction by faking this header's value to IP that belongs to some
 698         # geo unrestricted country. We will do so once we encounter any
 699         # geo restriction error.
 700         if self._x_forwarded_for_ip:
 701             if 'X-Forwarded-For' not in headers:
 702                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 703
 704         if isinstance(url_or_request, compat_urllib_request.Request):
 705             url_or_request = update_Request(
 706                 url_or_request, data=data, headers=headers, query=query)
 707         else:
 708             if query:
 709                 url_or_request = update_url_query(url_or_request, query)
 710             if data is not None or headers:
 711                 url_or_request = sanitized_Request(url_or_request, data, headers)
 712         try:
 713             return self._downloader.urlopen(url_or_request)
 714         except network_exceptions as err:
 715             if isinstance(err, compat_urllib_error.HTTPError):
 716                 if self.__can_accept_status_code(err, expected_status):
 717                     # Retain reference to error to prevent file object from
 718                     # being closed before it can be read. Works around the
 719                     # effects of <https://bugs.python.org/issue15002>
 720                     # introduced in Python 3.4.1.
 721                     err.fp._error = err
 722                     return err.fp
 723
 724             if errnote is False:
 725                 return False
 726             if errnote is None:
 727                 errnote = 'Unable to download webpage'
 728
 729             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 730             if fatal:
 731                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 732             else:
 733                 self.report_warning(errmsg)
 734                 return False
 735
 736     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
 737         """
 738         Return a tuple (page content as string, URL handle).
 739
 740         See _download_webpage docstring for arguments specification.
 741         """
 742         # Strip hashes from the URL (#1038)
 743         if isinstance(url_or_request, (compat_str, str)):
 744             url_or_request = url_or_request.partition('#')[0]
 745
 746         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 747         if urlh is False:
 748             assert not fatal
 749             return False
 750         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 751         return (content, urlh)
 752
 753     @staticmethod
 754     def _guess_encoding_from_content(content_type, webpage_bytes):
 755         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 756         if m:
 757             encoding = m.group(1)
 758         else:
 759             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 760                           webpage_bytes[:1024])
 761             if m:
 762                 encoding = m.group(1).decode('ascii')
 763             elif webpage_bytes.startswith(b'\xff\xfe'):
 764                 encoding = 'utf-16'
 765             else:
 766                 encoding = 'utf-8'
 767
 768         return encoding
 769
 770     def __check_blocked(self, content):
 771         first_block = content[:512]
 772         if ('<title>Access to this site is blocked</title>' in content
 773                 and 'Websense' in first_block):
 774             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 775             blocked_iframe = self._html_search_regex(
 776                 r'<iframe src="([^"]+)"', content,
 777                 'Websense information URL', default=None)
 778             if blocked_iframe:
 779                 msg += ' Visit %s for more details' % blocked_iframe
 780             raise ExtractorError(msg, expected=True)
 781         if '<title>The URL you requested has been blocked</title>' in first_block:
 782             msg = (
 783                 'Access to this webpage has been blocked by Indian censorship. '
 784                 'Use a VPN or proxy server (with --proxy) to route around it.')
 785             block_msg = self._html_search_regex(
 786                 r'</h1><p>(.*?)</p>',
 787                 content, 'block message', default=None)
 788             if block_msg:
 789                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 790             raise ExtractorError(msg, expected=True)
 791         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 792                 and 'blocklist.rkn.gov.ru' in content):
 793             raise ExtractorError(
 794                 'Access to this webpage has been blocked by decision of the Russian government. '
 795                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 796                 expected=True)
 797
 798     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 799         content_type = urlh.headers.get('Content-Type', '')
 800         webpage_bytes = urlh.read()
 801         if prefix is not None:
 802             webpage_bytes = prefix + webpage_bytes
 803         if not encoding:
 804             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 805         if self.get_param('dump_intermediate_pages', False):
 806             self.to_screen('Dumping request to ' + urlh.geturl())
 807             dump = base64.b64encode(webpage_bytes).decode('ascii')
 808             self._downloader.to_screen(dump)
 809         if self.get_param('write_pages', False):
 810             basen = '%s_%s' % (video_id, urlh.geturl())
 811             trim_length = self.get_param('trim_file_name') or 240
 812             if len(basen) > trim_length:
 813                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 814                 basen = basen[:trim_length - len(h)] + h
 815             raw_filename = basen + '.dump'
 816             filename = sanitize_filename(raw_filename, restricted=True)
 817             self.to_screen('Saving request to ' + filename)
 818             # Working around MAX_PATH limitation on Windows (see
 819             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 820             if compat_os_name == 'nt':
 821                 absfilepath = os.path.abspath(filename)
 822                 if len(absfilepath) > 259:
 823                     filename = '\\\\?\\' + absfilepath
 824             with open(filename, 'wb') as outf:
 825                 outf.write(webpage_bytes)
 826
 827         try:
 828             content = webpage_bytes.decode(encoding, 'replace')
 829         except LookupError:
 830             content = webpage_bytes.decode('utf-8', 'replace')
 831
 832         self.__check_blocked(content)
 833
 834         return content
 835
 836     def _download_webpage(
 837             self, url_or_request, video_id, note=None, errnote=None,
 838             fatal=True, tries=1, timeout=5, encoding=None, data=None,
 839             headers={}, query={}, expected_status=None):
 840         """
 841         Return the data of the page as a string.
 842
 843         Arguments:
 844         url_or_request -- plain text URL as a string or
 845             a compat_urllib_request.Requestobject
 846         video_id -- Video/playlist/item identifier (string)
 847
 848         Keyword arguments:
 849         note -- note printed before downloading (string)
 850         errnote -- note printed in case of an error (string)
 851         fatal -- flag denoting whether error should be considered fatal,
 852             i.e. whether it should cause ExtractionError to be raised,
 853             otherwise a warning will be reported and extraction continued
 854         tries -- number of tries
 855         timeout -- sleep interval between tries
 856         encoding -- encoding for a page content decoding, guessed automatically
 857             when not explicitly specified
 858         data -- POST data (bytes)
 859         headers -- HTTP headers (dict)
 860         query -- URL query (dict)
 861         expected_status -- allows to accept failed HTTP requests (non 2xx
 862             status code) by explicitly specifying a set of accepted status
 863             codes. Can be any of the following entities:
 864                 - an integer type specifying an exact failed status code to
 865                   accept
 866                 - a list or a tuple of integer types specifying a list of
 867                   failed status codes to accept
 868                 - a callable accepting an actual failed status code and
 869                   returning True if it should be accepted
 870             Note that this argument does not affect success status codes (2xx)
 871             which are always accepted.
 872         """
 873
 874         success = False
 875         try_count = 0
 876         while success is False:
 877             try:
 878                 res = self._download_webpage_handle(
 879                     url_or_request, video_id, note, errnote, fatal,
 880                     encoding=encoding, data=data, headers=headers, query=query,
 881                     expected_status=expected_status)
 882                 success = True
 883             except compat_http_client.IncompleteRead as e:
 884                 try_count += 1
 885                 if try_count >= tries:
 886                     raise e
 887                 self._sleep(timeout, video_id)
 888         if res is False:
 889             return res
 890         else:
 891             content, _ = res
 892             return content
 893
 894     def _download_xml_handle(
 895             self, url_or_request, video_id, note='Downloading XML',
 896             errnote='Unable to download XML', transform_source=None,
 897             fatal=True, encoding=None, data=None, headers={}, query={},
 898             expected_status=None):
 899         """
 900         Return a tuple (xml as an compat_etree_Element, URL handle).
 901
 902         See _download_webpage docstring for arguments specification.
 903         """
 904         res = self._download_webpage_handle(
 905             url_or_request, video_id, note, errnote, fatal=fatal,
 906             encoding=encoding, data=data, headers=headers, query=query,
 907             expected_status=expected_status)
 908         if res is False:
 909             return res
 910         xml_string, urlh = res
 911         return self._parse_xml(
 912             xml_string, video_id, transform_source=transform_source,
 913             fatal=fatal), urlh
 914
 915     def _download_xml(
 916             self, url_or_request, video_id,
 917             note='Downloading XML', errnote='Unable to download XML',
 918             transform_source=None, fatal=True, encoding=None,
 919             data=None, headers={}, query={}, expected_status=None):
 920         """
 921         Return the xml as an compat_etree_Element.
 922
 923         See _download_webpage docstring for arguments specification.
 924         """
 925         res = self._download_xml_handle(
 926             url_or_request, video_id, note=note, errnote=errnote,
 927             transform_source=transform_source, fatal=fatal, encoding=encoding,
 928             data=data, headers=headers, query=query,
 929             expected_status=expected_status)
 930         return res if res is False else res[0]
 931
 932     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 933         if transform_source:
 934             xml_string = transform_source(xml_string)
 935         try:
 936             return compat_etree_fromstring(xml_string.encode('utf-8'))
 937         except compat_xml_parse_error as ve:
 938             errmsg = '%s: Failed to parse XML ' % video_id
 939             if fatal:
 940                 raise ExtractorError(errmsg, cause=ve)
 941             else:
 942                 self.report_warning(errmsg + str(ve))
 943
 944     def _download_json_handle(
 945             self, url_or_request, video_id, note='Downloading JSON metadata',
 946             errnote='Unable to download JSON metadata', transform_source=None,
 947             fatal=True, encoding=None, data=None, headers={}, query={},
 948             expected_status=None):
 949         """
 950         Return a tuple (JSON object, URL handle).
 951
 952         See _download_webpage docstring for arguments specification.
 953         """
 954         res = self._download_webpage_handle(
 955             url_or_request, video_id, note, errnote, fatal=fatal,
 956             encoding=encoding, data=data, headers=headers, query=query,
 957             expected_status=expected_status)
 958         if res is False:
 959             return res
 960         json_string, urlh = res
 961         return self._parse_json(
 962             json_string, video_id, transform_source=transform_source,
 963             fatal=fatal), urlh
 964
 965     def _download_json(
 966             self, url_or_request, video_id, note='Downloading JSON metadata',
 967             errnote='Unable to download JSON metadata', transform_source=None,
 968             fatal=True, encoding=None, data=None, headers={}, query={},
 969             expected_status=None):
 970         """
 971         Return the JSON object as a dict.
 972
 973         See _download_webpage docstring for arguments specification.
 974         """
 975         res = self._download_json_handle(
 976             url_or_request, video_id, note=note, errnote=errnote,
 977             transform_source=transform_source, fatal=fatal, encoding=encoding,
 978             data=data, headers=headers, query=query,
 979             expected_status=expected_status)
 980         return res if res is False else res[0]
 981
 982     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 983         if transform_source:
 984             json_string = transform_source(json_string)
 985         try:
 986             return json.loads(json_string)
 987         except ValueError as ve:
 988             errmsg = '%s: Failed to parse JSON ' % video_id
 989             if fatal:
 990                 raise ExtractorError(errmsg, cause=ve)
 991             else:
 992                 self.report_warning(errmsg + str(ve))
 993
 994     def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
 995         return self._parse_json(
 996             data[data.find('{'):data.rfind('}') + 1],
 997             video_id, transform_source, fatal)
 998
 999     def _download_socket_json_handle(
1000             self, url_or_request, video_id, note='Polling socket',
1001             errnote='Unable to poll socket', transform_source=None,
1002             fatal=True, encoding=None, data=None, headers={}, query={},
1003             expected_status=None):
1004         """
1005         Return a tuple (JSON object, URL handle).
1006
1007         See _download_webpage docstring for arguments specification.
1008         """
1009         res = self._download_webpage_handle(
1010             url_or_request, video_id, note, errnote, fatal=fatal,
1011             encoding=encoding, data=data, headers=headers, query=query,
1012             expected_status=expected_status)
1013         if res is False:
1014             return res
1015         webpage, urlh = res
1016         return self._parse_socket_response_as_json(
1017             webpage, video_id, transform_source=transform_source,
1018             fatal=fatal), urlh
1019
1020     def _download_socket_json(
1021             self, url_or_request, video_id, note='Polling socket',
1022             errnote='Unable to poll socket', transform_source=None,
1023             fatal=True, encoding=None, data=None, headers={}, query={},
1024             expected_status=None):
1025         """
1026         Return the JSON object as a dict.
1027
1028         See _download_webpage docstring for arguments specification.
1029         """
1030         res = self._download_socket_json_handle(
1031             url_or_request, video_id, note=note, errnote=errnote,
1032             transform_source=transform_source, fatal=fatal, encoding=encoding,
1033             data=data, headers=headers, query=query,
1034             expected_status=expected_status)
1035         return res if res is False else res[0]
1036
1037     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1038         idstr = format_field(video_id, template='%s: ')
1039         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1040         if only_once:
1041             if f'WARNING: {msg}' in self._printed_messages:
1042                 return
1043             self._printed_messages.add(f'WARNING: {msg}')
1044         self._downloader.report_warning(msg, *args, **kwargs)
1045
1046     def to_screen(self, msg, *args, **kwargs):
1047         """Print msg to screen, prefixing it with '[ie_name]'"""
1048         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1049
1050     def write_debug(self, msg, *args, **kwargs):
1051         self._downloader.write_debug('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1052
1053     def get_param(self, name, default=None, *args, **kwargs):
1054         if self._downloader:
1055             return self._downloader.params.get(name, default, *args, **kwargs)
1056         return default
1057
    def report_drm(self, video_id, partial=False):
        """Report through raise_no_formats that the video is DRM protected.

        NOTE: the partial argument is not used by this method.
        """
        self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1060
1061     def report_extraction(self, id_or_name):
1062         """Report information extraction."""
1063         self.to_screen('%s: Extracting information' % id_or_name)
1064
1065     def report_download_webpage(self, video_id):
1066         """Report webpage download."""
1067         self.to_screen('%s: Downloading webpage' % video_id)
1068
    def report_age_confirmation(self):
        """Report attempt to confirm age (prints 'Confirming age' via to_screen)."""
        self.to_screen('Confirming age')
1072
    def report_login(self):
        """Report attempt to log in (prints 'Logging in' via to_screen)."""
        self.to_screen('Logging in')
1076
1077     def raise_login_required(
1078             self, msg='This video is only available for registered users',
1079             metadata_available=False, method='any'):
1080         if metadata_available and self.get_param('ignore_no_formats_error'):
1081             self.report_warning(msg)
1082         if method is not None:
1083             msg = '%s. %s' % (msg, self._LOGIN_HINTS[method])
1084         raise ExtractorError(msg, expected=True)
1085
1086     def raise_geo_restricted(
1087             self, msg='This video is not available from your location due to geo restriction',
1088             countries=None, metadata_available=False):
1089         if metadata_available and self.get_param('ignore_no_formats_error'):
1090             self.report_warning(msg)
1091         else:
1092             raise GeoRestrictedError(msg, countries=countries)
1093
1094     def raise_no_formats(self, msg, expected=False, video_id=None):
1095         if expected and self.get_param('ignore_no_formats_error'):
1096             self.report_warning(msg, video_id)
1097         elif isinstance(msg, ExtractorError):
1098             raise msg
1099         else:
1100             raise ExtractorError(msg, expected=expected, video_id=video_id)
1101
1102     # Methods for following #608
1103     @staticmethod
1104     def url_result(url, ie=None, video_id=None, video_title=None, **kwargs):
1105         """Returns a URL that points to a page that should be processed"""
1106         # TODO: ie should be the class used for getting the info
1107         video_info = {'_type': 'url',
1108                       'url': url,
1109                       'ie_key': ie}
1110         video_info.update(kwargs)
1111         if video_id is not None:
1112             video_info['id'] = video_id
1113         if video_title is not None:
1114             video_info['title'] = video_title
1115         return video_info
1116
1117     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
1118         urls = orderedSet(
1119             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
1120             for m in matches)
1121         return self.playlist_result(
1122             urls, playlist_id=playlist_id, playlist_title=playlist_title)
1123
1124     @staticmethod
1125     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
1126         """Returns a playlist"""
1127         video_info = {'_type': 'playlist',
1128                       'entries': entries}
1129         video_info.update(kwargs)
1130         if playlist_id:
1131             video_info['id'] = playlist_id
1132         if playlist_title:
1133             video_info['title'] = playlist_title
1134         if playlist_description is not None:
1135             video_info['description'] = playlist_description
1136         return video_info
1137
1138     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1139         """
1140         Perform a regex search on the given string, using a single or a list of
1141         patterns returning the first matching group.
1142         In case of failure return a default value or raise a WARNING or a
1143         RegexNotFoundError, depending on fatal, specifying the field name.
1144         """
1145         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
1146             mobj = re.search(pattern, string, flags)
1147         else:
1148             for p in pattern:
1149                 mobj = re.search(p, string, flags)
1150                 if mobj:
1151                     break
1152
1153         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1154
1155         if mobj:
1156             if group is None:
1157                 # return the first matching group
1158                 return next(g for g in mobj.groups() if g is not None)
1159             elif isinstance(group, (list, tuple)):
1160                 return tuple(mobj.group(g) for g in group)
1161             else:
1162                 return mobj.group(group)
1163         elif default is not NO_DEFAULT:
1164             return default
1165         elif fatal:
1166             raise RegexNotFoundError('Unable to extract %s' % _name)
1167         else:
1168             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1169             return None
1170
1171     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1172         """
1173         Like _search_regex, but strips HTML tags and unescapes entities.
1174         """
1175         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1176         if res:
1177             return clean_html(res).strip()
1178         else:
1179             return res
1180
1181     def _get_netrc_login_info(self, netrc_machine=None):
1182         username = None
1183         password = None
1184         netrc_machine = netrc_machine or self._NETRC_MACHINE
1185
1186         if self.get_param('usenetrc', False):
1187             try:
1188                 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1189                 if os.path.isdir(netrc_file):
1190                     netrc_file = os.path.join(netrc_file, '.netrc')
1191                 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1192                 if info is not None:
1193                     username = info[0]
1194                     password = info[2]
1195                 else:
1196                     raise netrc.NetrcParseError(
1197                         'No authenticators for %s' % netrc_machine)
1198             except (IOError, netrc.NetrcParseError) as err:
1199                 self.report_warning(
1200                     'parsing .netrc: %s' % error_to_compat_str(err))
1201
1202         return username, password
1203
1204     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1205         """
1206         Get the login info as (username, password)
1207         First look for the manually specified credentials using username_option
1208         and password_option as keys in params dictionary. If no such credentials
1209         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1210         value.
1211         If there's no info available, return (None, None)
1212         """
1213
1214         # Attempt to use provided username and password or .netrc data
1215         username = self.get_param(username_option)
1216         if username is not None:
1217             password = self.get_param(password_option)
1218         else:
1219             username, password = self._get_netrc_login_info(netrc_machine)
1220
1221         return username, password
1222
1223     def _get_tfa_info(self, note='two-factor verification code'):
1224         """
1225         Get the two-factor authentication info
1226         TODO - asking the user will be required for sms/phone verify
1227         currently just uses the command line option
1228         If there's no info available, return None
1229         """
1230
1231         tfa = self.get_param('twofactor')
1232         if tfa is not None:
1233             return tfa
1234
1235         return compat_getpass('Type %s and press [Return]: ' % note)
1236
1237     # Helper functions for extracting OpenGraph info
1238     @staticmethod
1239     def _og_regexes(prop):
1240         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1241         property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
1242                        % {'prop': re.escape(prop)})
1243         template = r'<meta[^>]+?%s[^>]+?%s'
1244         return [
1245             template % (property_re, content_re),
1246             template % (content_re, property_re),
1247         ]
1248
1249     @staticmethod
1250     def _meta_regex(prop):
1251         return r'''(?isx)<meta
1252                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1253                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1254
1255     def _og_search_property(self, prop, html, name=None, **kargs):
1256         prop = variadic(prop)
1257         if name is None:
1258             name = 'OpenGraph %s' % prop[0]
1259         og_regexes = []
1260         for p in prop:
1261             og_regexes.extend(self._og_regexes(p))
1262         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1263         if escaped is None:
1264             return None
1265         return unescapeHTML(escaped)
1266
    def _og_search_thumbnail(self, html, **kargs):
        """Search html for the OpenGraph 'image' property (non-fatal)."""
        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1269
1270     def _og_search_description(self, html, **kargs):
1271         return self._og_search_property('description', html, fatal=False, **kargs)
1272
1273     def _og_search_title(self, html, **kargs):
1274         return self._og_search_property('title', html, **kargs)
1275
1276     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1277         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1278         if secure:
1279             regexes = self._og_regexes('video:secure_url') + regexes
1280         return self._html_search_regex(regexes, html, name, **kargs)
1281
1282     def _og_search_url(self, html, **kargs):
1283         return self._og_search_property('url', html, **kargs)
1284
1285     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1286         name = variadic(name)
1287         if display_name is None:
1288             display_name = name[0]
1289         return self._html_search_regex(
1290             [self._meta_regex(n) for n in name],
1291             html, display_name, fatal=fatal, group='content', **kwargs)
1292
1293     def _dc_search_uploader(self, html):
1294         return self._html_search_meta('dc.creator', html, 'uploader')
1295
1296     def _rta_search(self, html):
1297         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1298         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1299                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1300                      html):
1301             return 18
1302         return 0
1303
1304     def _media_rating_search(self, html):
1305         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1306         rating = self._html_search_meta('rating', html)
1307
1308         if not rating:
1309             return None
1310
1311         RATING_TABLE = {
1312             'safe for kids': 0,
1313             'general': 8,
1314             '14 years': 14,
1315             'mature': 17,
1316             'restricted': 19,
1317         }
1318         return RATING_TABLE.get(rating.lower())
1319
1320     def _family_friendly_search(self, html):
1321         # See http://schema.org/VideoObject
1322         family_friendly = self._html_search_meta(
1323             'isFamilyFriendly', html, default=None)
1324
1325         if not family_friendly:
1326             return None
1327
1328         RATING_TABLE = {
1329             '1': 0,
1330             'true': 0,
1331             '0': 18,
1332             'false': 18,
1333         }
1334         return RATING_TABLE.get(family_friendly.lower())
1335
1336     def _twitter_search_player(self, html):
1337         return self._html_search_meta('twitter:player', html,
1338                                       'twitter card player')
1339
1340     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1341         json_ld_list = list(re.finditer(JSON_LD_RE, html))
1342         default = kwargs.get('default', NO_DEFAULT)
1343         # JSON-LD may be malformed and thus `fatal` should be respected.
1344         # At the same time `default` may be passed that assumes `fatal=False`
1345         # for _search_regex. Let's simulate the same behavior here as well.
1346         fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
1347         json_ld = []
1348         for mobj in json_ld_list:
1349             json_ld_item = self._parse_json(
1350                 mobj.group('json_ld'), video_id, fatal=fatal)
1351             if not json_ld_item:
1352                 continue
1353             if isinstance(json_ld_item, dict):
1354                 json_ld.append(json_ld_item)
1355             elif isinstance(json_ld_item, (list, tuple)):
1356                 json_ld.extend(json_ld_item)
1357         if json_ld:
1358             json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1359         if json_ld:
1360             return json_ld
1361         if default is not NO_DEFAULT:
1362             return default
1363         elif fatal:
1364             raise RegexNotFoundError('Unable to extract JSON-LD')
1365         else:
1366             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1367             return {}
1368
1369     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1370         if isinstance(json_ld, compat_str):
1371             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1372         if not json_ld:
1373             return {}
1374         info = {}
1375         if not isinstance(json_ld, (list, tuple, dict)):
1376             return info
1377         if isinstance(json_ld, dict):
1378             json_ld = [json_ld]
1379
1380         INTERACTION_TYPE_MAP = {
1381             'CommentAction': 'comment',
1382             'AgreeAction': 'like',
1383             'DisagreeAction': 'dislike',
1384             'LikeAction': 'like',
1385             'DislikeAction': 'dislike',
1386             'ListenAction': 'view',
1387             'WatchAction': 'view',
1388             'ViewAction': 'view',
1389         }
1390
1391         def extract_interaction_type(e):
1392             interaction_type = e.get('interactionType')
1393             if isinstance(interaction_type, dict):
1394                 interaction_type = interaction_type.get('@type')
1395             return str_or_none(interaction_type)
1396
1397         def extract_interaction_statistic(e):
1398             interaction_statistic = e.get('interactionStatistic')
1399             if isinstance(interaction_statistic, dict):
1400                 interaction_statistic = [interaction_statistic]
1401             if not isinstance(interaction_statistic, list):
1402                 return
1403             for is_e in interaction_statistic:
1404                 if not isinstance(is_e, dict):
1405                     continue
1406                 if is_e.get('@type') != 'InteractionCounter':
1407                     continue
1408                 interaction_type = extract_interaction_type(is_e)
1409                 if not interaction_type:
1410                     continue
1411                 # For interaction count some sites provide string instead of
1412                 # an integer (as per spec) with non digit characters (e.g. ",")
1413                 # so extracting count with more relaxed str_to_int
1414                 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1415                 if interaction_count is None:
1416                     continue
1417                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1418                 if not count_kind:
1419                     continue
1420                 count_key = '%s_count' % count_kind
1421                 if info.get(count_key) is not None:
1422                     continue
1423                 info[count_key] = interaction_count
1424
1425         def extract_video_object(e):
1426             assert e['@type'] == 'VideoObject'
1427             author = e.get('author')
1428             info.update({
1429                 'url': url_or_none(e.get('contentUrl')),
1430                 'title': unescapeHTML(e.get('name')),
1431                 'description': unescapeHTML(e.get('description')),
1432                 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
1433                 'duration': parse_duration(e.get('duration')),
1434                 'timestamp': unified_timestamp(e.get('uploadDate')),
1435                 # author can be an instance of 'Organization' or 'Person' types.
1436                 # both types can have 'name' property(inherited from 'Thing' type). [1]
1437                 # however some websites are using 'Text' type instead.
1438                 # 1. https://schema.org/VideoObject
1439                 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
1440                 'filesize': float_or_none(e.get('contentSize')),
1441                 'tbr': int_or_none(e.get('bitrate')),
1442                 'width': int_or_none(e.get('width')),
1443                 'height': int_or_none(e.get('height')),
1444                 'view_count': int_or_none(e.get('interactionCount')),
1445             })
1446             extract_interaction_statistic(e)
1447
1448         for e in json_ld:
1449             if '@context' in e:
1450                 item_type = e.get('@type')
1451                 if expected_type is not None and expected_type != item_type:
1452                     continue
1453                 if item_type in ('TVEpisode', 'Episode'):
1454                     episode_name = unescapeHTML(e.get('name'))
1455                     info.update({
1456                         'episode': episode_name,
1457                         'episode_number': int_or_none(e.get('episodeNumber')),
1458                         'description': unescapeHTML(e.get('description')),
1459                     })
1460                     if not info.get('title') and episode_name:
1461                         info['title'] = episode_name
1462                     part_of_season = e.get('partOfSeason')
1463                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1464                         info.update({
1465                             'season': unescapeHTML(part_of_season.get('name')),
1466                             'season_number': int_or_none(part_of_season.get('seasonNumber')),
1467                         })
1468                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1469                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1470                         info['series'] = unescapeHTML(part_of_series.get('name'))
1471                 elif item_type == 'Movie':
1472                     info.update({
1473                         'title': unescapeHTML(e.get('name')),
1474                         'description': unescapeHTML(e.get('description')),
1475                         'duration': parse_duration(e.get('duration')),
1476                         'timestamp': unified_timestamp(e.get('dateCreated')),
1477                     })
1478                 elif item_type in ('Article', 'NewsArticle'):
1479                     info.update({
1480                         'timestamp': parse_iso8601(e.get('datePublished')),
1481                         'title': unescapeHTML(e.get('headline')),
1482                         'description': unescapeHTML(e.get('articleBody')),
1483                     })
1484                 elif item_type == 'VideoObject':
1485                     extract_video_object(e)
1486                     if expected_type is None:
1487                         continue
1488                     else:
1489                         break
1490                 video = e.get('video')
1491                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1492                     extract_video_object(video)
1493                 if expected_type is None:
1494                     continue
1495                 else:
1496                     break
1497         return dict((k, v) for k, v in info.items() if v is not None)
1498
1499     def _search_nextjs_data(self, webpage, video_id, **kw):
1500         return self._parse_json(
1501             self._search_regex(
1502                 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1503                 webpage, 'next.js data', **kw),
1504             video_id, **kw)
1505
1506     @staticmethod
1507     def _hidden_inputs(html):
1508         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1509         hidden_inputs = {}
1510         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1511             attrs = extract_attributes(input)
1512             if not input:
1513                 continue
1514             if attrs.get('type') not in ('hidden', 'submit'):
1515                 continue
1516             name = attrs.get('name') or attrs.get('id')
1517             value = attrs.get('value')
1518             if name and value is not None:
1519                 hidden_inputs[name] = value
1520         return hidden_inputs
1521
1522     def _form_hidden_inputs(self, form_id, html):
1523         form = self._search_regex(
1524             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1525             html, '%s form' % form_id, group='form')
1526         return self._hidden_inputs(form)
1527
1528     class FormatSort:
1529         regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
1530
1531         default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
1532                    'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
1533                    'proto', 'ext', 'hasaud', 'source', 'format_id')  # These must not be aliases
1534         ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
1535                         'height', 'width', 'proto', 'vext', 'abr', 'aext',
1536                         'fps', 'fs_approx', 'source', 'format_id')
1537
1538         settings = {
1539             'vcodec': {'type': 'ordered', 'regex': True,
1540                        'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1541             'acodec': {'type': 'ordered', 'regex': True,
1542                        'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
1543             'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
1544                     'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
1545             'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1546                       'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', '.*dash', 'ws|websocket', '', 'mms|rtsp', 'none', 'f4']},
1547             'vext': {'type': 'ordered', 'field': 'video_ext',
1548                      'order': ('mp4', 'webm', 'flv', '', 'none'),
1549                      'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1550             'aext': {'type': 'ordered', 'field': 'audio_ext',
1551                      'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1552                      'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1553             'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1554             'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
1555                            'field': ('vcodec', 'acodec'),
1556                            'function': lambda it: int(any(v != 'none' for v in it))},
1557             'ie_pref': {'priority': True, 'type': 'extractor'},
1558             'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1559             'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1560             'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
1561             'quality': {'convert': 'float', 'default': -1},
1562             'filesize': {'convert': 'bytes'},
1563             'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1564             'id': {'convert': 'string', 'field': 'format_id'},
1565             'height': {'convert': 'float_none'},
1566             'width': {'convert': 'float_none'},
1567             'fps': {'convert': 'float_none'},
1568             'tbr': {'convert': 'float_none'},
1569             'vbr': {'convert': 'float_none'},
1570             'abr': {'convert': 'float_none'},
1571             'asr': {'convert': 'float_none'},
1572             'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
1573
1574             'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1575             'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1576             'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1577             'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1578             'res': {'type': 'multiple', 'field': ('height', 'width'),
1579                     'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
1580
1581             # Most of these exist only for compatibility reasons
1582             'dimension': {'type': 'alias', 'field': 'res'},
1583             'resolution': {'type': 'alias', 'field': 'res'},
1584             'extension': {'type': 'alias', 'field': 'ext'},
1585             'bitrate': {'type': 'alias', 'field': 'br'},
1586             'total_bitrate': {'type': 'alias', 'field': 'tbr'},
1587             'video_bitrate': {'type': 'alias', 'field': 'vbr'},
1588             'audio_bitrate': {'type': 'alias', 'field': 'abr'},
1589             'framerate': {'type': 'alias', 'field': 'fps'},
1590             'language_preference': {'type': 'alias', 'field': 'lang'},  # not named as 'language' because such a field exists
1591             'protocol': {'type': 'alias', 'field': 'proto'},
1592             'source_preference': {'type': 'alias', 'field': 'source'},
1593             'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1594             'filesize_estimate': {'type': 'alias', 'field': 'size'},
1595             'samplerate': {'type': 'alias', 'field': 'asr'},
1596             'video_ext': {'type': 'alias', 'field': 'vext'},
1597             'audio_ext': {'type': 'alias', 'field': 'aext'},
1598             'video_codec': {'type': 'alias', 'field': 'vcodec'},
1599             'audio_codec': {'type': 'alias', 'field': 'acodec'},
1600             'video': {'type': 'alias', 'field': 'hasvid'},
1601             'has_video': {'type': 'alias', 'field': 'hasvid'},
1602             'audio': {'type': 'alias', 'field': 'hasaud'},
1603             'has_audio': {'type': 'alias', 'field': 'hasaud'},
1604             'extractor': {'type': 'alias', 'field': 'ie_pref'},
1605             'preference': {'type': 'alias', 'field': 'ie_pref'},
1606             'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
1607             'format_id': {'type': 'alias', 'field': 'id'},
1608         }
1609
1610         _order = []
1611
1612         def _get_field_setting(self, field, key):
1613             if field not in self.settings:
1614                 self.settings[field] = {}
1615             propObj = self.settings[field]
1616             if key not in propObj:
1617                 type = propObj.get('type')
1618                 if key == 'field':
1619                     default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1620                 elif key == 'convert':
1621                     default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1622                 else:
1623                     default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1624                 propObj[key] = default
1625             return propObj[key]
1626
1627         def _resolve_field_value(self, field, value, convertNone=False):
1628             if value is None:
1629                 if not convertNone:
1630                     return None
1631             else:
1632                 value = value.lower()
1633             conversion = self._get_field_setting(field, 'convert')
1634             if conversion == 'ignore':
1635                 return None
1636             if conversion == 'string':
1637                 return value
1638             elif conversion == 'float_none':
1639                 return float_or_none(value)
1640             elif conversion == 'bytes':
1641                 return FileDownloader.parse_bytes(value)
1642             elif conversion == 'order':
1643                 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1644                 use_regex = self._get_field_setting(field, 'regex')
1645                 list_length = len(order_list)
1646                 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1647                 if use_regex and value is not None:
1648                     for i, regex in enumerate(order_list):
1649                         if regex and re.match(regex, value):
1650                             return list_length - i
1651                     return list_length - empty_pos  # not in list
1652                 else:  # not regex or  value = None
1653                     return list_length - (order_list.index(value) if value in order_list else empty_pos)
1654             else:
1655                 if value.isnumeric():
1656                     return float(value)
1657                 else:
1658                     self.settings[field]['convert'] = 'string'
1659                     return value
1660
1661         def evaluate_params(self, params, sort_extractor):
1662             self._use_free_order = params.get('prefer_free_formats', False)
1663             self._sort_user = params.get('format_sort', [])
1664             self._sort_extractor = sort_extractor
1665
1666             def add_item(field, reverse, closest, limit_text):
1667                 field = field.lower()
1668                 if field in self._order:
1669                     return
1670                 self._order.append(field)
1671                 limit = self._resolve_field_value(field, limit_text)
1672                 data = {
1673                     'reverse': reverse,
1674                     'closest': False if limit is None else closest,
1675                     'limit_text': limit_text,
1676                     'limit': limit}
1677                 if field in self.settings:
1678                     self.settings[field].update(data)
1679                 else:
1680                     self.settings[field] = data
1681
1682             sort_list = (
1683                 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1684                 + (tuple() if params.get('format_sort_force', False)
1685                    else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1686                 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1687
1688             for item in sort_list:
1689                 match = re.match(self.regex, item)
1690                 if match is None:
1691                     raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1692                 field = match.group('field')
1693                 if field is None:
1694                     continue
1695                 if self._get_field_setting(field, 'type') == 'alias':
1696                     field = self._get_field_setting(field, 'field')
1697                 reverse = match.group('reverse') is not None
1698                 closest = match.group('separator') == '~'
1699                 limit_text = match.group('limit')
1700
1701                 has_limit = limit_text is not None
1702                 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1703                 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1704
1705                 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1706                 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
1707                 limit_count = len(limits)
1708                 for (i, f) in enumerate(fields):
1709                     add_item(f, reverse, closest,
1710                              limits[i] if i < limit_count
1711                              else limits[0] if has_limit and not has_multiple_limits
1712                              else None)
1713
1714         def print_verbose_info(self, write_debug):
1715             if self._sort_user:
1716                 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1717             if self._sort_extractor:
1718                 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1719             write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1720                 '+' if self._get_field_setting(field, 'reverse') else '', field,
1721                 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1722                               self._get_field_setting(field, 'limit_text'),
1723                               self._get_field_setting(field, 'limit'))
1724                 if self._get_field_setting(field, 'limit_text') is not None else '')
1725                 for field in self._order if self._get_field_setting(field, 'visible')]))
1726
1727         def _calculate_field_preference_from_value(self, format, field, type, value):
1728             reverse = self._get_field_setting(field, 'reverse')
1729             closest = self._get_field_setting(field, 'closest')
1730             limit = self._get_field_setting(field, 'limit')
1731
1732             if type == 'extractor':
1733                 maximum = self._get_field_setting(field, 'max')
1734                 if value is None or (maximum is not None and value >= maximum):
1735                     value = -1
1736             elif type == 'boolean':
1737                 in_list = self._get_field_setting(field, 'in_list')
1738                 not_in_list = self._get_field_setting(field, 'not_in_list')
1739                 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1740             elif type == 'ordered':
1741                 value = self._resolve_field_value(field, value, True)
1742
1743             # try to convert to number
1744             val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1745             is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1746             if is_num:
1747                 value = val_num
1748
1749             return ((-10, 0) if value is None
1750                     else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
1751                     else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1752                     else (0, value, 0) if not reverse and (limit is None or value <= limit)
1753                     else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1754                     else (-1, value, 0))
1755
1756         def _calculate_field_preference(self, format, field):
1757             type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
1758             get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1759             if type == 'multiple':
1760                 type = 'field'  # Only 'field' is allowed in multiple for now
1761                 actual_fields = self._get_field_setting(field, 'field')
1762
1763                 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1764             else:
1765                 value = get_value(field)
1766             return self._calculate_field_preference_from_value(format, field, type, value)
1767
1768         def calculate_preference(self, format):
1769             # Determine missing protocol
1770             if not format.get('protocol'):
1771                 format['protocol'] = determine_protocol(format)
1772
1773             # Determine missing ext
1774             if not format.get('ext') and 'url' in format:
1775                 format['ext'] = determine_ext(format['url'])
1776             if format.get('vcodec') == 'none':
1777                 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1778                 format['video_ext'] = 'none'
1779             else:
1780                 format['video_ext'] = format['ext']
1781                 format['audio_ext'] = 'none'
1782             # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
1783             #    format['preference'] = -1000
1784
1785             # Determine missing bitrates
1786             if format.get('tbr') is None:
1787                 if format.get('vbr') is not None and format.get('abr') is not None:
1788                     format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1789             else:
1790                 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1791                     format['vbr'] = format.get('tbr') - format.get('abr', 0)
1792                 if format.get('acodec') != 'none' and format.get('abr') is None:
1793                     format['abr'] = format.get('tbr') - format.get('vbr', 0)
1794
1795             return tuple(self._calculate_field_preference(format, field) for field in self._order)
1796
1797     def _sort_formats(self, formats, field_preference=[]):
1798         if not formats:
1799             return
1800         format_sort = self.FormatSort()  # params and to_screen are taken from the downloader
1801         format_sort.evaluate_params(self._downloader.params, field_preference)
1802         if self.get_param('verbose', False):
1803             format_sort.print_verbose_info(self._downloader.write_debug)
1804         formats.sort(key=lambda f: format_sort.calculate_preference(f))
1805
1806     def _check_formats(self, formats, video_id):
1807         if formats:
1808             formats[:] = filter(
1809                 lambda f: self._is_valid_url(
1810                     f['url'], video_id,
1811                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1812                 formats)
1813
1814     @staticmethod
1815     def _remove_duplicate_formats(formats):
1816         format_urls = set()
1817         unique_formats = []
1818         for f in formats:
1819             if f['url'] not in format_urls:
1820                 format_urls.add(f['url'])
1821                 unique_formats.append(f)
1822         formats[:] = unique_formats
1823
1824     def _is_valid_url(self, url, video_id, item='video', headers={}):
1825         url = self._proto_relative_url(url, scheme='http:')
1826         # For now assume non HTTP(S) URLs always valid
1827         if not (url.startswith('http://') or url.startswith('https://')):
1828             return True
1829         try:
1830             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1831             return True
1832         except ExtractorError as e:
1833             self.to_screen(
1834                 '%s: %s URL is invalid, skipping: %s'
1835                 % (video_id, item, error_to_compat_str(e.cause)))
1836             return False
1837
1838     def http_scheme(self):
1839         """ Either "http:" or "https:", depending on the user's preferences """
1840         return (
1841             'http:'
1842             if self.get_param('prefer_insecure', False)
1843             else 'https:')
1844
1845     def _proto_relative_url(self, url, scheme=None):
1846         if url is None:
1847             return url
1848         if url.startswith('//'):
1849             if scheme is None:
1850                 scheme = self.http_scheme()
1851             return scheme + url
1852         else:
1853             return url
1854
1855     def _sleep(self, timeout, video_id, msg_template=None):
1856         if msg_template is None:
1857             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1858         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1859         self.to_screen(msg)
1860         time.sleep(timeout)
1861
1862     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1863                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1864                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1865         manifest = self._download_xml(
1866             manifest_url, video_id, 'Downloading f4m manifest',
1867             'Unable to download f4m manifest',
1868             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1869             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1870             transform_source=transform_source,
1871             fatal=fatal, data=data, headers=headers, query=query)
1872
1873         if manifest is False:
1874             return []
1875
1876         return self._parse_f4m_formats(
1877             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1878             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1879
1880     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1881                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1882                            fatal=True, m3u8_id=None):
1883         if not isinstance(manifest, compat_etree_Element) and not fatal:
1884             return []
1885
1886         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1887         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1888         if akamai_pv is not None and ';' in akamai_pv.text:
1889             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1890             if playerVerificationChallenge.strip() != '':
1891                 return []
1892
1893         formats = []
1894         manifest_version = '1.0'
1895         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1896         if not media_nodes:
1897             manifest_version = '2.0'
1898             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1899         # Remove unsupported DRM protected media from final formats
1900         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1901         media_nodes = remove_encrypted_media(media_nodes)
1902         if not media_nodes:
1903             return formats
1904
1905         manifest_base_url = get_base_url(manifest)
1906
1907         bootstrap_info = xpath_element(
1908             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1909             'bootstrap info', default=None)
1910
1911         vcodec = None
1912         mime_type = xpath_text(
1913             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1914             'base URL', default=None)
1915         if mime_type and mime_type.startswith('audio/'):
1916             vcodec = 'none'
1917
1918         for i, media_el in enumerate(media_nodes):
1919             tbr = int_or_none(media_el.attrib.get('bitrate'))
1920             width = int_or_none(media_el.attrib.get('width'))
1921             height = int_or_none(media_el.attrib.get('height'))
1922             format_id = join_nonempty(f4m_id, tbr or i)
1923             # If <bootstrapInfo> is present, the specified f4m is a
1924             # stream-level manifest, and only set-level manifests may refer to
1925             # external resources.  See section 11.4 and section 4 of F4M spec
1926             if bootstrap_info is None:
1927                 media_url = None
1928                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1929                 if manifest_version == '2.0':
1930                     media_url = media_el.attrib.get('href')
1931                 if media_url is None:
1932                     media_url = media_el.attrib.get('url')
1933                 if not media_url:
1934                     continue
1935                 manifest_url = (
1936                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1937                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1938                 # If media_url is itself a f4m manifest do the recursive extraction
1939                 # since bitrates in parent manifest (this one) and media_url manifest
1940                 # may differ leading to inability to resolve the format by requested
1941                 # bitrate in f4m downloader
1942                 ext = determine_ext(manifest_url)
1943                 if ext == 'f4m':
1944                     f4m_formats = self._extract_f4m_formats(
1945                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1946                         transform_source=transform_source, fatal=fatal)
1947                     # Sometimes stream-level manifest contains single media entry that
1948                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1949                     # At the same time parent's media entry in set-level manifest may
1950                     # contain it. We will copy it from parent in such cases.
1951                     if len(f4m_formats) == 1:
1952                         f = f4m_formats[0]
1953                         f.update({
1954                             'tbr': f.get('tbr') or tbr,
1955                             'width': f.get('width') or width,
1956                             'height': f.get('height') or height,
1957                             'format_id': f.get('format_id') if not tbr else format_id,
1958                             'vcodec': vcodec,
1959                         })
1960                     formats.extend(f4m_formats)
1961                     continue
1962                 elif ext == 'm3u8':
1963                     formats.extend(self._extract_m3u8_formats(
1964                         manifest_url, video_id, 'mp4', preference=preference,
1965                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1966                     continue
1967             formats.append({
1968                 'format_id': format_id,
1969                 'url': manifest_url,
1970                 'manifest_url': manifest_url,
1971                 'ext': 'flv' if bootstrap_info is not None else None,
1972                 'protocol': 'f4m',
1973                 'tbr': tbr,
1974                 'width': width,
1975                 'height': height,
1976                 'vcodec': vcodec,
1977                 'preference': preference,
1978                 'quality': quality,
1979             })
1980         return formats
1981
1982     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1983         return {
1984             'format_id': join_nonempty(m3u8_id, 'meta'),
1985             'url': m3u8_url,
1986             'ext': ext,
1987             'protocol': 'm3u8',
1988             'preference': preference - 100 if preference else -100,
1989             'quality': quality,
1990             'resolution': 'multiple',
1991             'format_note': 'Quality selection URL',
1992         }
1993
1994     def _report_ignoring_subs(self, name):
1995         self.report_warning(bug_reports_message(
1996             f'Ignoring subtitle tracks found in the {name} manifest; '
1997             'if any subtitle tracks are missing,'
1998         ), only_once=True)
1999
2000     def _extract_m3u8_formats(self, *args, **kwargs):
2001         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2002         if subs:
2003             self._report_ignoring_subs('HLS')
2004         return fmts
2005
2006     def _extract_m3u8_formats_and_subtitles(
2007             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2008             preference=None, quality=None, m3u8_id=None, note=None,
2009             errnote=None, fatal=True, live=False, data=None, headers={},
2010             query={}):
2011
2012         res = self._download_webpage_handle(
2013             m3u8_url, video_id,
2014             note='Downloading m3u8 information' if note is None else note,
2015             errnote='Failed to download m3u8 information' if errnote is None else errnote,
2016             fatal=fatal, data=data, headers=headers, query=query)
2017
2018         if res is False:
2019             return [], {}
2020
2021         m3u8_doc, urlh = res
2022         m3u8_url = urlh.geturl()
2023
2024         return self._parse_m3u8_formats_and_subtitles(
2025             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2026             preference=preference, quality=quality, m3u8_id=m3u8_id,
2027             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2028             headers=headers, query=query, video_id=video_id)
2029
2030     def _parse_m3u8_formats_and_subtitles(
2031             self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8_native',
2032             preference=None, quality=None, m3u8_id=None, live=False, note=None,
2033             errnote=None, fatal=True, data=None, headers={}, query={},
2034             video_id=None):
2035         formats, subtitles = [], {}
2036
2037         if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
2038             return formats, subtitles
2039
2040         has_drm = re.search(r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', m3u8_doc)
2041
2042         def format_url(url):
2043             return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
2044
2045         if self.get_param('hls_split_discontinuity', False):
2046             def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2047                 if not m3u8_doc:
2048                     if not manifest_url:
2049                         return []
2050                     m3u8_doc = self._download_webpage(
2051                         manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2052                         note=False, errnote='Failed to download m3u8 playlist information')
2053                     if m3u8_doc is False:
2054                         return []
2055                 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2056
2057         else:
2058             def _extract_m3u8_playlist_indices(*args, **kwargs):
2059                 return [None]
2060
2061         # References:
2062         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2063         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2064         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2065
2066         # We should try extracting formats only from master playlists [1, 4.3.4],
2067         # i.e. playlists that describe available qualities. On the other hand
2068         # media playlists [1, 4.3.3] should be returned as is since they contain
2069         # just the media without qualities renditions.
2070         # Fortunately, master playlist can be easily distinguished from media
2071         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
2072         # master playlist tags MUST NOT appear in a media playlist and vice versa.
2073         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2074         # media playlist and MUST NOT appear in master playlist thus we can
2075         # clearly detect media playlist with this criterion.
2076
2077         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
2078             formats = [{
2079                 'format_id': join_nonempty(m3u8_id, idx),
2080                 'format_index': idx,
2081                 'url': m3u8_url,
2082                 'ext': ext,
2083                 'protocol': entry_protocol,
2084                 'preference': preference,
2085                 'quality': quality,
2086                 'has_drm': has_drm,
2087             } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2088
2089             return formats, subtitles
2090
2091         groups = {}
2092         last_stream_inf = {}
2093
2094         def extract_media(x_media_line):
2095             media = parse_m3u8_attributes(x_media_line)
2096             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2097             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2098             if not (media_type and group_id and name):
2099                 return
2100             groups.setdefault(group_id, []).append(media)
2101             # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2102             if media_type == 'SUBTITLES':
2103                 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2104                 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2105                 # However, lack of URI has been spotted in the wild.
2106                 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2107                 if not media.get('URI'):
2108                     return
2109                 url = format_url(media['URI'])
2110                 sub_info = {
2111                     'url': url,
2112                     'ext': determine_ext(url),
2113                 }
2114                 if sub_info['ext'] == 'm3u8':
2115                     # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2116                     # files may contain is WebVTT:
2117                     # <https://tools.ietf.org/html/rfc8216#section-3.1>
2118                     sub_info['ext'] = 'vtt'
2119                     sub_info['protocol'] = 'm3u8_native'
2120                 lang = media.get('LANGUAGE') or 'und'
2121                 subtitles.setdefault(lang, []).append(sub_info)
2122             if media_type not in ('VIDEO', 'AUDIO'):
2123                 return
2124             media_url = media.get('URI')
2125             if media_url:
2126                 manifest_url = format_url(media_url)
2127                 formats.extend({
2128                     'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2129                     'format_note': name,
2130                     'format_index': idx,
2131                     'url': manifest_url,
2132                     'manifest_url': m3u8_url,
2133                     'language': media.get('LANGUAGE'),
2134                     'ext': ext,
2135                     'protocol': entry_protocol,
2136                     'preference': preference,
2137                     'quality': quality,
2138                     'vcodec': 'none' if media_type == 'AUDIO' else None,
2139                 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2140
2141         def build_stream_name():
2142             # Despite specification does not mention NAME attribute for
2143             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2144             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2145             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2146             stream_name = last_stream_inf.get('NAME')
2147             if stream_name:
2148                 return stream_name
2149             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2150             # from corresponding rendition group
2151             stream_group_id = last_stream_inf.get('VIDEO')
2152             if not stream_group_id:
2153                 return
2154             stream_group = groups.get(stream_group_id)
2155             if not stream_group:
2156                 return stream_group_id
2157             rendition = stream_group[0]
2158             return rendition.get('NAME') or stream_group_id
2159
2160         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2161         # chance to detect video only formats when EXT-X-STREAM-INF tags
2162         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2163         for line in m3u8_doc.splitlines():
2164             if line.startswith('#EXT-X-MEDIA:'):
2165                 extract_media(line)
2166
2167         for line in m3u8_doc.splitlines():
2168             if line.startswith('#EXT-X-STREAM-INF:'):
2169                 last_stream_inf = parse_m3u8_attributes(line)
2170             elif line.startswith('#') or not line.strip():
2171                 continue
2172             else:
2173                 tbr = float_or_none(
2174                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2175                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2176                 manifest_url = format_url(line.strip())
2177
2178                 for idx in _extract_m3u8_playlist_indices(manifest_url):
2179                     format_id = [m3u8_id, None, idx]
2180                     # Bandwidth of live streams may differ over time thus making
2181                     # format_id unpredictable. So it's better to keep provided
2182                     # format_id intact.
2183                     if not live:
2184                         stream_name = build_stream_name()
2185                         format_id[1] = stream_name or '%d' % (tbr or len(formats))
2186                     f = {
2187                         'format_id': join_nonempty(*format_id),
2188                         'format_index': idx,
2189                         'url': manifest_url,
2190                         'manifest_url': m3u8_url,
2191                         'tbr': tbr,
2192                         'ext': ext,
2193                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2194                         'protocol': entry_protocol,
2195                         'preference': preference,
2196                         'quality': quality,
2197                     }
2198                     resolution = last_stream_inf.get('RESOLUTION')
2199                     if resolution:
2200                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2201                         if mobj:
2202                             f['width'] = int(mobj.group('width'))
2203                             f['height'] = int(mobj.group('height'))
2204                     # Unified Streaming Platform
2205                     mobj = re.search(
2206                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2207                     if mobj:
2208                         abr, vbr = mobj.groups()
2209                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2210                         f.update({
2211                             'vbr': vbr,
2212                             'abr': abr,
2213                         })
2214                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2215                     f.update(codecs)
2216                     audio_group_id = last_stream_inf.get('AUDIO')
2217                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2218                     # references a rendition group MUST have a CODECS attribute.
2219                     # However, this is not always respected, for example, [2]
2220                     # contains EXT-X-STREAM-INF tag which references AUDIO
2221                     # rendition group but does not have CODECS and despite
2222                     # referencing an audio group it represents a complete
2223                     # (with audio and video) format. So, for such cases we will
2224                     # ignore references to rendition groups and treat them
2225                     # as complete formats.
2226                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2227                         audio_group = groups.get(audio_group_id)
2228                         if audio_group and audio_group[0].get('URI'):
2229                             # TODO: update acodec for audio only formats with
2230                             # the same GROUP-ID
2231                             f['acodec'] = 'none'
2232                     if not f.get('ext'):
2233                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2234                     formats.append(f)
2235
2236                     # for DailyMotion
2237                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2238                     if progressive_uri:
2239                         http_f = f.copy()
2240                         del http_f['manifest_url']
2241                         http_f.update({
2242                             'format_id': f['format_id'].replace('hls-', 'http-'),
2243                             'protocol': 'http',
2244                             'url': progressive_uri,
2245                         })
2246                         formats.append(http_f)
2247
2248                 last_stream_inf = {}
2249         return formats, subtitles
2250
2251     def _extract_m3u8_vod_duration(
2252             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2253
2254         m3u8_vod = self._download_webpage(
2255             m3u8_vod_url, video_id,
2256             note='Downloading m3u8 VOD manifest' if note is None else note,
2257             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2258             fatal=False, data=data, headers=headers, query=query)
2259
2260         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2261
2262     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2263         if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2264             return None
2265
2266         return int(sum(
2267             float(line[len('#EXTINF:'):].split(',')[0])
2268             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2269
2270     @staticmethod
2271     def _xpath_ns(path, namespace=None):
2272         if not namespace:
2273             return path
2274         out = []
2275         for c in path.split('/'):
2276             if not c or c == '.':
2277                 out.append(c)
2278             else:
2279                 out.append('{%s}%s' % (namespace, c))
2280         return '/'.join(out)
2281
2282     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2283         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2284
2285         if smil is False:
2286             assert not fatal
2287             return []
2288
2289         namespace = self._parse_smil_namespace(smil)
2290
2291         fmts = self._parse_smil_formats(
2292             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2293         subs = self._parse_smil_subtitles(
2294             smil, namespace=namespace)
2295
2296         return fmts, subs
2297
2298     def _extract_smil_formats(self, *args, **kwargs):
2299         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2300         if subs:
2301             self._report_ignoring_subs('SMIL')
2302         return fmts
2303
2304     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2305         smil = self._download_smil(smil_url, video_id, fatal=fatal)
2306         if smil is False:
2307             return {}
2308         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2309
2310     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2311         return self._download_xml(
2312             smil_url, video_id, 'Downloading SMIL file',
2313             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2314
2315     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2316         namespace = self._parse_smil_namespace(smil)
2317
2318         formats = self._parse_smil_formats(
2319             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2320         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2321
2322         video_id = os.path.splitext(url_basename(smil_url))[0]
2323         title = None
2324         description = None
2325         upload_date = None
2326         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2327             name = meta.attrib.get('name')
2328             content = meta.attrib.get('content')
2329             if not name or not content:
2330                 continue
2331             if not title and name == 'title':
2332                 title = content
2333             elif not description and name in ('description', 'abstract'):
2334                 description = content
2335             elif not upload_date and name == 'date':
2336                 upload_date = unified_strdate(content)
2337
2338         thumbnails = [{
2339             'id': image.get('type'),
2340             'url': image.get('src'),
2341             'width': int_or_none(image.get('width')),
2342             'height': int_or_none(image.get('height')),
2343         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2344
2345         return {
2346             'id': video_id,
2347             'title': title or video_id,
2348             'description': description,
2349             'upload_date': upload_date,
2350             'thumbnails': thumbnails,
2351             'formats': formats,
2352             'subtitles': subtitles,
2353         }
2354
2355     def _parse_smil_namespace(self, smil):
2356         return self._search_regex(
2357             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2358
2359     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2360         base = smil_url
2361         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2362             b = meta.get('base') or meta.get('httpBase')
2363             if b:
2364                 base = b
2365                 break
2366
2367         formats = []
2368         rtmp_count = 0
2369         http_count = 0
2370         m3u8_count = 0
2371         imgs_count = 0
2372
2373         srcs = set()
2374         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2375         for medium in media:
2376             src = medium.get('src')
2377             if not src or src in srcs:
2378                 continue
2379             srcs.add(src)
2380
2381             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2382             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2383             width = int_or_none(medium.get('width'))
2384             height = int_or_none(medium.get('height'))
2385             proto = medium.get('proto')
2386             ext = medium.get('ext')
2387             src_ext = determine_ext(src)
2388             streamer = medium.get('streamer') or base
2389
2390             if proto == 'rtmp' or streamer.startswith('rtmp'):
2391                 rtmp_count += 1
2392                 formats.append({
2393                     'url': streamer,
2394                     'play_path': src,
2395                     'ext': 'flv',
2396                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2397                     'tbr': bitrate,
2398                     'filesize': filesize,
2399                     'width': width,
2400                     'height': height,
2401                 })
2402                 if transform_rtmp_url:
2403                     streamer, src = transform_rtmp_url(streamer, src)
2404                     formats[-1].update({
2405                         'url': streamer,
2406                         'play_path': src,
2407                     })
2408                 continue
2409
2410             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2411             src_url = src_url.strip()
2412
2413             if proto == 'm3u8' or src_ext == 'm3u8':
2414                 m3u8_formats = self._extract_m3u8_formats(
2415                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2416                 if len(m3u8_formats) == 1:
2417                     m3u8_count += 1
2418                     m3u8_formats[0].update({
2419                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2420                         'tbr': bitrate,
2421                         'width': width,
2422                         'height': height,
2423                     })
2424                 formats.extend(m3u8_formats)
2425             elif src_ext == 'f4m':
2426                 f4m_url = src_url
2427                 if not f4m_params:
2428                     f4m_params = {
2429                         'hdcore': '3.2.0',
2430                         'plugin': 'flowplayer-3.2.0.1',
2431                     }
2432                 f4m_url += '&' if '?' in f4m_url else '?'
2433                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2434                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2435             elif src_ext == 'mpd':
2436                 formats.extend(self._extract_mpd_formats(
2437                     src_url, video_id, mpd_id='dash', fatal=False))
2438             elif re.search(r'\.ism/[Mm]anifest', src_url):
2439                 formats.extend(self._extract_ism_formats(
2440                     src_url, video_id, ism_id='mss', fatal=False))
2441             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2442                 http_count += 1
2443                 formats.append({
2444                     'url': src_url,
2445                     'ext': ext or src_ext or 'flv',
2446                     'format_id': 'http-%d' % (bitrate or http_count),
2447                     'tbr': bitrate,
2448                     'filesize': filesize,
2449                     'width': width,
2450                     'height': height,
2451                 })
2452
2453         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2454             src = medium.get('src')
2455             if not src or src in srcs:
2456                 continue
2457             srcs.add(src)
2458
2459             imgs_count += 1
2460             formats.append({
2461                 'format_id': 'imagestream-%d' % (imgs_count),
2462                 'url': src,
2463                 'ext': mimetype2ext(medium.get('type')),
2464                 'acodec': 'none',
2465                 'vcodec': 'none',
2466                 'width': int_or_none(medium.get('width')),
2467                 'height': int_or_none(medium.get('height')),
2468                 'format_note': 'SMIL storyboards',
2469             })
2470
2471         return formats
2472
2473     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2474         urls = []
2475         subtitles = {}
2476         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2477             src = textstream.get('src')
2478             if not src or src in urls:
2479                 continue
2480             urls.append(src)
2481             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2482             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2483             subtitles.setdefault(lang, []).append({
2484                 'url': src,
2485                 'ext': ext,
2486             })
2487         return subtitles
2488
2489     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2490         xspf = self._download_xml(
2491             xspf_url, playlist_id, 'Downloading xpsf playlist',
2492             'Unable to download xspf manifest', fatal=fatal)
2493         if xspf is False:
2494             return []
2495         return self._parse_xspf(
2496             xspf, playlist_id, xspf_url=xspf_url,
2497             xspf_base_url=base_url(xspf_url))
2498
2499     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2500         NS_MAP = {
2501             'xspf': 'http://xspf.org/ns/0/',
2502             's1': 'http://static.streamone.nl/player/ns/0',
2503         }
2504
2505         entries = []
2506         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2507             title = xpath_text(
2508                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2509             description = xpath_text(
2510                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2511             thumbnail = xpath_text(
2512                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2513             duration = float_or_none(
2514                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2515
2516             formats = []
2517             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2518                 format_url = urljoin(xspf_base_url, location.text)
2519                 if not format_url:
2520                     continue
2521                 formats.append({
2522                     'url': format_url,
2523                     'manifest_url': xspf_url,
2524                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2525                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2526                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2527                 })
2528             self._sort_formats(formats)
2529
2530             entries.append({
2531                 'id': playlist_id,
2532                 'title': title,
2533                 'description': description,
2534                 'thumbnail': thumbnail,
2535                 'duration': duration,
2536                 'formats': formats,
2537             })
2538         return entries
2539
2540     def _extract_mpd_formats(self, *args, **kwargs):
2541         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2542         if subs:
2543             self._report_ignoring_subs('DASH')
2544         return fmts
2545
2546     def _extract_mpd_formats_and_subtitles(
2547             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2548             fatal=True, data=None, headers={}, query={}):
2549         res = self._download_xml_handle(
2550             mpd_url, video_id,
2551             note='Downloading MPD manifest' if note is None else note,
2552             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2553             fatal=fatal, data=data, headers=headers, query=query)
2554         if res is False:
2555             return [], {}
2556         mpd_doc, urlh = res
2557         if mpd_doc is None:
2558             return [], {}
2559         mpd_base_url = base_url(urlh.geturl())
2560
2561         return self._parse_mpd_formats_and_subtitles(
2562             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2563
2564     def _parse_mpd_formats(self, *args, **kwargs):
2565         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2566         if subs:
2567             self._report_ignoring_subs('DASH')
2568         return fmts
2569
2570     def _parse_mpd_formats_and_subtitles(
2571             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2572         """
2573         Parse formats from MPD manifest.
2574         References:
2575          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2576             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2577          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2578         """
2579         if not self.get_param('dynamic_mpd', True):
2580             if mpd_doc.get('type') == 'dynamic':
2581                 return [], {}
2582
2583         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2584
2585         def _add_ns(path):
2586             return self._xpath_ns(path, namespace)
2587
2588         def is_drm_protected(element):
2589             return element.find(_add_ns('ContentProtection')) is not None
2590
2591         def extract_multisegment_info(element, ms_parent_info):
2592             ms_info = ms_parent_info.copy()
2593
2594             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2595             # common attributes and elements.  We will only extract relevant
2596             # for us.
2597             def extract_common(source):
2598                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2599                 if segment_timeline is not None:
2600                     s_e = segment_timeline.findall(_add_ns('S'))
2601                     if s_e:
2602                         ms_info['total_number'] = 0
2603                         ms_info['s'] = []
2604                         for s in s_e:
2605                             r = int(s.get('r', 0))
2606                             ms_info['total_number'] += 1 + r
2607                             ms_info['s'].append({
2608                                 't': int(s.get('t', 0)),
2609                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2610                                 'd': int(s.attrib['d']),
2611                                 'r': r,
2612                             })
2613                 start_number = source.get('startNumber')
2614                 if start_number:
2615                     ms_info['start_number'] = int(start_number)
2616                 timescale = source.get('timescale')
2617                 if timescale:
2618                     ms_info['timescale'] = int(timescale)
2619                 segment_duration = source.get('duration')
2620                 if segment_duration:
2621                     ms_info['segment_duration'] = float(segment_duration)
2622
2623             def extract_Initialization(source):
2624                 initialization = source.find(_add_ns('Initialization'))
2625                 if initialization is not None:
2626                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2627
2628             segment_list = element.find(_add_ns('SegmentList'))
2629             if segment_list is not None:
2630                 extract_common(segment_list)
2631                 extract_Initialization(segment_list)
2632                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2633                 if segment_urls_e:
2634                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2635             else:
2636                 segment_template = element.find(_add_ns('SegmentTemplate'))
2637                 if segment_template is not None:
2638                     extract_common(segment_template)
2639                     media = segment_template.get('media')
2640                     if media:
2641                         ms_info['media'] = media
2642                     initialization = segment_template.get('initialization')
2643                     if initialization:
2644                         ms_info['initialization'] = initialization
2645                     else:
2646                         extract_Initialization(segment_template)
2647             return ms_info
2648
2649         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2650         formats, subtitles = [], {}
2651         stream_numbers = {'audio': 0, 'video': 0}
2652         for period in mpd_doc.findall(_add_ns('Period')):
2653             period_duration = parse_duration(period.get('duration')) or mpd_duration
2654             period_ms_info = extract_multisegment_info(period, {
2655                 'start_number': 1,
2656                 'timescale': 1,
2657             })
2658             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2659                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2660                 for representation in adaptation_set.findall(_add_ns('Representation')):
2661                     representation_attrib = adaptation_set.attrib.copy()
2662                     representation_attrib.update(representation.attrib)
2663                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2664                     mime_type = representation_attrib['mimeType']
2665                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2666
2667                     codecs = representation_attrib.get('codecs', '')
2668                     if content_type not in ('video', 'audio', 'text'):
2669                         if mime_type == 'image/jpeg':
2670                             content_type = mime_type
2671                         elif codecs.split('.')[0] == 'stpp':
2672                             content_type = 'text'
2673                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2674                             content_type = 'text'
2675                         else:
2676                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2677                             continue
2678
2679                     base_url = ''
2680                     for element in (representation, adaptation_set, period, mpd_doc):
2681                         base_url_e = element.find(_add_ns('BaseURL'))
2682                         if base_url_e is not None:
2683                             base_url = base_url_e.text + base_url
2684                             if re.match(r'^https?://', base_url):
2685                                 break
2686                     if mpd_base_url and base_url.startswith('/'):
2687                         base_url = compat_urlparse.urljoin(mpd_base_url, base_url)
2688                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2689                         if not mpd_base_url.endswith('/'):
2690                             mpd_base_url += '/'
2691                         base_url = mpd_base_url + base_url
2692                     representation_id = representation_attrib.get('id')
2693                     lang = representation_attrib.get('lang')
2694                     url_el = representation.find(_add_ns('BaseURL'))
2695                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2696                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2697                     if representation_id is not None:
2698                         format_id = representation_id
2699                     else:
2700                         format_id = content_type
2701                     if mpd_id:
2702                         format_id = mpd_id + '-' + format_id
2703                     if content_type in ('video', 'audio'):
2704                         f = {
2705                             'format_id': format_id,
2706                             'manifest_url': mpd_url,
2707                             'ext': mimetype2ext(mime_type),
2708                             'width': int_or_none(representation_attrib.get('width')),
2709                             'height': int_or_none(representation_attrib.get('height')),
2710                             'tbr': float_or_none(bandwidth, 1000),
2711                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2712                             'fps': int_or_none(representation_attrib.get('frameRate')),
2713                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2714                             'format_note': 'DASH %s' % content_type,
2715                             'filesize': filesize,
2716                             'container': mimetype2ext(mime_type) + '_dash',
2717                             'manifest_stream_number': stream_numbers[content_type]
2718                         }
2719                         f.update(parse_codecs(codecs))
2720                         stream_numbers[content_type] += 1
2721                     elif content_type == 'text':
2722                         f = {
2723                             'ext': mimetype2ext(mime_type),
2724                             'manifest_url': mpd_url,
2725                             'filesize': filesize,
2726                         }
2727                     elif content_type == 'image/jpeg':
2728                         # See test case in VikiIE
2729                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2730                         f = {
2731                             'format_id': format_id,
2732                             'ext': 'mhtml',
2733                             'manifest_url': mpd_url,
2734                             'format_note': 'DASH storyboards (jpeg)',
2735                             'acodec': 'none',
2736                             'vcodec': 'none',
2737                         }
2738                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2739                         f['has_drm'] = True
2740                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2741
2742                     def prepare_template(template_name, identifiers):
2743                         tmpl = representation_ms_info[template_name]
2744                         # First of, % characters outside $...$ templates
2745                         # must be escaped by doubling for proper processing
2746                         # by % operator string formatting used further (see
2747                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2748                         t = ''
2749                         in_template = False
2750                         for c in tmpl:
2751                             t += c
2752                             if c == '$':
2753                                 in_template = not in_template
2754                             elif c == '%' and not in_template:
2755                                 t += c
2756                         # Next, $...$ templates are translated to their
2757                         # %(...) counterparts to be used with % operator
2758                         if representation_id is not None:
2759                             t = t.replace('$RepresentationID$', representation_id)
2760                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2761                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2762                         t.replace('$$', '$')
2763                         return t
2764
2765                     # @initialization is a regular template like @media one
2766                     # so it should be handled just the same way (see
2767                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2768                     if 'initialization' in representation_ms_info:
2769                         initialization_template = prepare_template(
2770                             'initialization',
2771                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2772                             # $Time$ shall not be included for @initialization thus
2773                             # only $Bandwidth$ remains
2774                             ('Bandwidth', ))
2775                         representation_ms_info['initialization_url'] = initialization_template % {
2776                             'Bandwidth': bandwidth,
2777                         }
2778
2779                     def location_key(location):
2780                         return 'url' if re.match(r'^https?://', location) else 'path'
2781
2782                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2783
2784                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2785                         media_location_key = location_key(media_template)
2786
2787                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2788                         # can't be used at the same time
2789                         if '%(Number' in media_template and 's' not in representation_ms_info:
2790                             segment_duration = None
2791                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2792                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2793                                 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2794                             representation_ms_info['fragments'] = [{
2795                                 media_location_key: media_template % {
2796                                     'Number': segment_number,
2797                                     'Bandwidth': bandwidth,
2798                                 },
2799                                 'duration': segment_duration,
2800                             } for segment_number in range(
2801                                 representation_ms_info['start_number'],
2802                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2803                         else:
2804                             # $Number*$ or $Time$ in media template with S list available
2805                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2806                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2807                             representation_ms_info['fragments'] = []
2808                             segment_time = 0
2809                             segment_d = None
2810                             segment_number = representation_ms_info['start_number']
2811
2812                             def add_segment_url():
2813                                 segment_url = media_template % {
2814                                     'Time': segment_time,
2815                                     'Bandwidth': bandwidth,
2816                                     'Number': segment_number,
2817                                 }
2818                                 representation_ms_info['fragments'].append({
2819                                     media_location_key: segment_url,
2820                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2821                                 })
2822
2823                             for num, s in enumerate(representation_ms_info['s']):
2824                                 segment_time = s.get('t') or segment_time
2825                                 segment_d = s['d']
2826                                 add_segment_url()
2827                                 segment_number += 1
2828                                 for r in range(s.get('r', 0)):
2829                                     segment_time += segment_d
2830                                     add_segment_url()
2831                                     segment_number += 1
2832                                 segment_time += segment_d
2833                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2834                         # No media template
2835                         # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2836                         # or any YouTube dashsegments video
2837                         fragments = []
2838                         segment_index = 0
2839                         timescale = representation_ms_info['timescale']
2840                         for s in representation_ms_info['s']:
2841                             duration = float_or_none(s['d'], timescale)
2842                             for r in range(s.get('r', 0) + 1):
2843                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
2844                                 fragments.append({
2845                                     location_key(segment_uri): segment_uri,
2846                                     'duration': duration,
2847                                 })
2848                                 segment_index += 1
2849                         representation_ms_info['fragments'] = fragments
2850                     elif 'segment_urls' in representation_ms_info:
2851                         # Segment URLs with no SegmentTimeline
2852                         # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2853                         # https://github.com/ytdl-org/youtube-dl/pull/14844
2854                         fragments = []
2855                         segment_duration = float_or_none(
2856                             representation_ms_info['segment_duration'],
2857                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2858                         for segment_url in representation_ms_info['segment_urls']:
2859                             fragment = {
2860                                 location_key(segment_url): segment_url,
2861                             }
2862                             if segment_duration:
2863                                 fragment['duration'] = segment_duration
2864                             fragments.append(fragment)
2865                         representation_ms_info['fragments'] = fragments
2866                     # If there is a fragments key available then we correctly recognized fragmented media.
2867                     # Otherwise we will assume unfragmented media with direct access. Technically, such
2868                     # assumption is not necessarily correct since we may simply have no support for
2869                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2870                     if 'fragments' in representation_ms_info:
2871                         f.update({
2872                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
2873                             'url': mpd_url or base_url,
2874                             'fragment_base_url': base_url,
2875                             'fragments': [],
2876                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2877                         })
2878                         if 'initialization_url' in representation_ms_info:
2879                             initialization_url = representation_ms_info['initialization_url']
2880                             if not f.get('url'):
2881                                 f['url'] = initialization_url
2882                             f['fragments'].append({location_key(initialization_url): initialization_url})
2883                         f['fragments'].extend(representation_ms_info['fragments'])
2884                     else:
2885                         # Assuming direct URL to unfragmented media.
2886                         f['url'] = base_url
2887                     if content_type in ('video', 'audio') or mime_type == 'image/jpeg':
2888                         formats.append(f)
2889                     elif content_type == 'text':
2890                         subtitles.setdefault(lang or 'und', []).append(f)
2891
2892         return formats, subtitles
2893
2894     def _extract_ism_formats(self, *args, **kwargs):
2895         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2896         if subs:
2897             self._report_ignoring_subs('ISM')
2898         return fmts
2899
2900     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2901         res = self._download_xml_handle(
2902             ism_url, video_id,
2903             note='Downloading ISM manifest' if note is None else note,
2904             errnote='Failed to download ISM manifest' if errnote is None else errnote,
2905             fatal=fatal, data=data, headers=headers, query=query)
2906         if res is False:
2907             return [], {}
2908         ism_doc, urlh = res
2909         if ism_doc is None:
2910             return [], {}
2911
2912         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
2913
2914     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
2915         """
2916         Parse formats from ISM manifest.
2917         References:
2918          1. [MS-SSTR]: Smooth Streaming Protocol,
2919             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2920         """
2921         if ism_doc.get('IsLive') == 'TRUE':
2922             return [], {}
2923
2924         duration = int(ism_doc.attrib['Duration'])
2925         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2926
2927         formats = []
2928         subtitles = {}
2929         for stream in ism_doc.findall('StreamIndex'):
2930             stream_type = stream.get('Type')
2931             if stream_type not in ('video', 'audio', 'text'):
2932                 continue
2933             url_pattern = stream.attrib['Url']
2934             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2935             stream_name = stream.get('Name')
2936             stream_language = stream.get('Language', 'und')
2937             for track in stream.findall('QualityLevel'):
2938                 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
2939                 # TODO: add support for WVC1 and WMAP
2940                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
2941                     self.report_warning('%s is not a supported codec' % fourcc)
2942                     continue
2943                 tbr = int(track.attrib['Bitrate']) // 1000
2944                 # [1] does not mention Width and Height attributes. However,
2945                 # they're often present while MaxWidth and MaxHeight are
2946                 # missing, so should be used as fallbacks
2947                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2948                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2949                 sampling_rate = int_or_none(track.get('SamplingRate'))
2950
2951                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2952                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2953
2954                 fragments = []
2955                 fragment_ctx = {
2956                     'time': 0,
2957                 }
2958                 stream_fragments = stream.findall('c')
2959                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2960                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2961                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2962                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2963                     if not fragment_ctx['duration']:
2964                         try:
2965                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2966                         except IndexError:
2967                             next_fragment_time = duration
2968                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2969                     for _ in range(fragment_repeat):
2970                         fragments.append({
2971                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2972                             'duration': fragment_ctx['duration'] / stream_timescale,
2973                         })
2974                         fragment_ctx['time'] += fragment_ctx['duration']
2975
2976                 if stream_type == 'text':
2977                     subtitles.setdefault(stream_language, []).append({
2978                         'ext': 'ismt',
2979                         'protocol': 'ism',
2980                         'url': ism_url,
2981                         'manifest_url': ism_url,
2982                         'fragments': fragments,
2983                         '_download_params': {
2984                             'stream_type': stream_type,
2985                             'duration': duration,
2986                             'timescale': stream_timescale,
2987                             'fourcc': fourcc,
2988                             'language': stream_language,
2989                             'codec_private_data': track.get('CodecPrivateData'),
2990                         }
2991                     })
2992                 elif stream_type in ('video', 'audio'):
2993                     formats.append({
2994                         'format_id': join_nonempty(ism_id, stream_name, tbr),
2995                         'url': ism_url,
2996                         'manifest_url': ism_url,
2997                         'ext': 'ismv' if stream_type == 'video' else 'isma',
2998                         'width': width,
2999                         'height': height,
3000                         'tbr': tbr,
3001                         'asr': sampling_rate,
3002                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
3003                         'acodec': 'none' if stream_type == 'video' else fourcc,
3004                         'protocol': 'ism',
3005                         'fragments': fragments,
3006                         'has_drm': ism_doc.find('Protection') is not None,
3007                         '_download_params': {
3008                             'stream_type': stream_type,
3009                             'duration': duration,
3010                             'timescale': stream_timescale,
3011                             'width': width or 0,
3012                             'height': height or 0,
3013                             'fourcc': fourcc,
3014                             'language': stream_language,
3015                             'codec_private_data': track.get('CodecPrivateData'),
3016                             'sampling_rate': sampling_rate,
3017                             'channels': int_or_none(track.get('Channels', 2)),
3018                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3019                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3020                         },
3021                     })
3022         return formats, subtitles
3023
3024     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
3025         def absolute_url(item_url):
3026             return urljoin(base_url, item_url)
3027
3028         def parse_content_type(content_type):
3029             if not content_type:
3030                 return {}
3031             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3032             if ctr:
3033                 mimetype, codecs = ctr.groups()
3034                 f = parse_codecs(codecs)
3035                 f['ext'] = mimetype2ext(mimetype)
3036                 return f
3037             return {}
3038
3039         def _media_formats(src, cur_media_type, type_info={}):
3040             full_url = absolute_url(src)
3041             ext = type_info.get('ext') or determine_ext(full_url)
3042             if ext == 'm3u8':
3043                 is_plain_url = False
3044                 formats = self._extract_m3u8_formats(
3045                     full_url, video_id, ext='mp4',
3046                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3047                     preference=preference, quality=quality, fatal=False)
3048             elif ext == 'mpd':
3049                 is_plain_url = False
3050                 formats = self._extract_mpd_formats(
3051                     full_url, video_id, mpd_id=mpd_id, fatal=False)
3052             else:
3053                 is_plain_url = True
3054                 formats = [{
3055                     'url': full_url,
3056                     'vcodec': 'none' if cur_media_type == 'audio' else None,
3057                 }]
3058             return is_plain_url, formats
3059
3060         entries = []
3061         # amp-video and amp-audio are very similar to their HTML5 counterparts
3062         # so we wll include them right here (see
3063         # https://www.ampproject.org/docs/reference/components/amp-video)
3064         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3065         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3066         media_tags = [(media_tag, media_tag_name, media_type, '')
3067                       for media_tag, media_tag_name, media_type
3068                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3069         media_tags.extend(re.findall(
3070             # We only allow video|audio followed by a whitespace or '>'.
3071             # Allowing more characters may end up in significant slow down (see
3072             # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
3073             # http://www.porntrex.com/maps/videositemap.xml).
3074             r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3075         for media_tag, _, media_type, media_content in media_tags:
3076             media_info = {
3077                 'formats': [],
3078                 'subtitles': {},
3079             }
3080             media_attributes = extract_attributes(media_tag)
3081             src = strip_or_none(media_attributes.get('src'))
3082             if src:
3083                 _, formats = _media_formats(src, media_type)
3084                 media_info['formats'].extend(formats)
3085             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3086             if media_content:
3087                 for source_tag in re.findall(r'<source[^>]+>', media_content):
3088                     s_attr = extract_attributes(source_tag)
3089                     # data-video-src and data-src are non standard but seen
3090                     # several times in the wild
3091                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
3092                     if not src:
3093                         continue
3094                     f = parse_content_type(s_attr.get('type'))
3095                     is_plain_url, formats = _media_formats(src, media_type, f)
3096                     if is_plain_url:
3097                         # width, height, res, label and title attributes are
3098                         # all not standard but seen several times in the wild
3099                         labels = [
3100                             s_attr.get(lbl)
3101                             for lbl in ('label', 'title')
3102                             if str_or_none(s_attr.get(lbl))
3103                         ]
3104                         width = int_or_none(s_attr.get('width'))
3105                         height = (int_or_none(s_attr.get('height'))
3106                                   or int_or_none(s_attr.get('res')))
3107                         if not width or not height:
3108                             for lbl in labels:
3109                                 resolution = parse_resolution(lbl)
3110                                 if not resolution:
3111                                     continue
3112                                 width = width or resolution.get('width')
3113                                 height = height or resolution.get('height')
3114                         for lbl in labels:
3115                             tbr = parse_bitrate(lbl)
3116                             if tbr:
3117                                 break
3118                         else:
3119                             tbr = None
3120                         f.update({
3121                             'width': width,
3122                             'height': height,
3123                             'tbr': tbr,
3124                             'format_id': s_attr.get('label') or s_attr.get('title'),
3125                         })
3126                         f.update(formats[0])
3127                         media_info['formats'].append(f)
3128                     else:
3129                         media_info['formats'].extend(formats)
3130                 for track_tag in re.findall(r'<track[^>]+>', media_content):
3131                     track_attributes = extract_attributes(track_tag)
3132                     kind = track_attributes.get('kind')
3133                     if not kind or kind in ('subtitles', 'captions'):
3134                         src = strip_or_none(track_attributes.get('src'))
3135                         if not src:
3136                             continue
3137                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3138                         media_info['subtitles'].setdefault(lang, []).append({
3139                             'url': absolute_url(src),
3140                         })
3141             for f in media_info['formats']:
3142                 f.setdefault('http_headers', {})['Referer'] = base_url
3143             if media_info['formats'] or media_info['subtitles']:
3144                 entries.append(media_info)
3145         return entries
3146
3147     def _extract_akamai_formats(self, *args, **kwargs):
3148         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3149         if subs:
3150             self._report_ignoring_subs('akamai')
3151         return fmts
3152
3153     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3154         signed = 'hdnea=' in manifest_url
3155         if not signed:
3156             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3157             manifest_url = re.sub(
3158                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3159                 '', manifest_url).strip('?')
3160
3161         formats = []
3162         subtitles = {}
3163
3164         hdcore_sign = 'hdcore=3.7.0'
3165         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3166         hds_host = hosts.get('hds')
3167         if hds_host:
3168             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3169         if 'hdcore=' not in f4m_url:
3170             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3171         f4m_formats = self._extract_f4m_formats(
3172             f4m_url, video_id, f4m_id='hds', fatal=False)
3173         for entry in f4m_formats:
3174             entry.update({'extra_param_to_segment_url': hdcore_sign})
3175         formats.extend(f4m_formats)
3176
3177         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3178         hls_host = hosts.get('hls')
3179         if hls_host:
3180             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3181         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3182             m3u8_url, video_id, 'mp4', 'm3u8_native',
3183             m3u8_id='hls', fatal=False)
3184         formats.extend(m3u8_formats)
3185         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3186
3187         http_host = hosts.get('http')
3188         if http_host and m3u8_formats and not signed:
3189             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3190             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3191             qualities_length = len(qualities)
3192             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3193                 i = 0
3194                 for f in m3u8_formats:
3195                     if f['vcodec'] != 'none':
3196                         for protocol in ('http', 'https'):
3197                             http_f = f.copy()
3198                             del http_f['manifest_url']
3199                             http_url = re.sub(
3200                                 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
3201                             http_f.update({
3202                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3203                                 'url': http_url,
3204                                 'protocol': protocol,
3205                             })
3206                             formats.append(http_f)
3207                         i += 1
3208
3209         return formats, subtitles
3210
3211     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3212         query = compat_urlparse.urlparse(url).query
3213         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3214         mobj = re.search(
3215             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3216         url_base = mobj.group('url')
3217         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3218         formats = []
3219
3220         def manifest_url(manifest):
3221             m_url = '%s/%s' % (http_base_url, manifest)
3222             if query:
3223                 m_url += '?%s' % query
3224             return m_url
3225
3226         if 'm3u8' not in skip_protocols:
3227             formats.extend(self._extract_m3u8_formats(
3228                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3229                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3230         if 'f4m' not in skip_protocols:
3231             formats.extend(self._extract_f4m_formats(
3232                 manifest_url('manifest.f4m'),
3233                 video_id, f4m_id='hds', fatal=False))
3234         if 'dash' not in skip_protocols:
3235             formats.extend(self._extract_mpd_formats(
3236                 manifest_url('manifest.mpd'),
3237                 video_id, mpd_id='dash', fatal=False))
3238         if re.search(r'(?:/smil:|\.smil)', url_base):
3239             if 'smil' not in skip_protocols:
3240                 rtmp_formats = self._extract_smil_formats(
3241                     manifest_url('jwplayer.smil'),
3242                     video_id, fatal=False)
3243                 for rtmp_format in rtmp_formats:
3244                     rtsp_format = rtmp_format.copy()
3245                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3246                     del rtsp_format['play_path']
3247                     del rtsp_format['ext']
3248                     rtsp_format.update({
3249                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3250                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3251                         'protocol': 'rtsp',
3252                     })
3253                     formats.extend([rtmp_format, rtsp_format])
3254         else:
3255             for protocol in ('rtmp', 'rtsp'):
3256                 if protocol not in skip_protocols:
3257                     formats.append({
3258                         'url': '%s:%s' % (protocol, url_base),
3259                         'format_id': protocol,
3260                         'protocol': protocol,
3261                     })
3262         return formats
3263
3264     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3265         mobj = re.search(
3266             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3267             webpage)
3268         if mobj:
3269             try:
3270                 jwplayer_data = self._parse_json(mobj.group('options'),
3271                                                  video_id=video_id,
3272                                                  transform_source=transform_source)
3273             except ExtractorError:
3274                 pass
3275             else:
3276                 if isinstance(jwplayer_data, dict):
3277                     return jwplayer_data
3278
3279     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3280         jwplayer_data = self._find_jwplayer_data(
3281             webpage, video_id, transform_source=js_to_json)
3282         return self._parse_jwplayer_data(
3283             jwplayer_data, video_id, *args, **kwargs)
3284
3285     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3286                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3287         # JWPlayer backward compatibility: flattened playlists
3288         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3289         if 'playlist' not in jwplayer_data:
3290             jwplayer_data = {'playlist': [jwplayer_data]}
3291
3292         entries = []
3293
3294         # JWPlayer backward compatibility: single playlist item
3295         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3296         if not isinstance(jwplayer_data['playlist'], list):
3297             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3298
3299         for video_data in jwplayer_data['playlist']:
3300             # JWPlayer backward compatibility: flattened sources
3301             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3302             if 'sources' not in video_data:
3303                 video_data['sources'] = [video_data]
3304
3305             this_video_id = video_id or video_data['mediaid']
3306
3307             formats = self._parse_jwplayer_formats(
3308                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3309                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3310
3311             subtitles = {}
3312             tracks = video_data.get('tracks')
3313             if tracks and isinstance(tracks, list):
3314                 for track in tracks:
3315                     if not isinstance(track, dict):
3316                         continue
3317                     track_kind = track.get('kind')
3318                     if not track_kind or not isinstance(track_kind, compat_str):
3319                         continue
3320                     if track_kind.lower() not in ('captions', 'subtitles'):
3321                         continue
3322                     track_url = urljoin(base_url, track.get('file'))
3323                     if not track_url:
3324                         continue
3325                     subtitles.setdefault(track.get('label') or 'en', []).append({
3326                         'url': self._proto_relative_url(track_url)
3327                     })
3328
3329             entry = {
3330                 'id': this_video_id,
3331                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3332                 'description': clean_html(video_data.get('description')),
3333                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3334                 'timestamp': int_or_none(video_data.get('pubdate')),
3335                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3336                 'subtitles': subtitles,
3337             }
3338             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3339             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3340                 entry.update({
3341                     '_type': 'url_transparent',
3342                     'url': formats[0]['url'],
3343                 })
3344             else:
3345                 self._sort_formats(formats)
3346                 entry['formats'] = formats
3347             entries.append(entry)
3348         if len(entries) == 1:
3349             return entries[0]
3350         else:
3351             return self.playlist_result(entries)
3352
3353     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3354                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3355         urls = []
3356         formats = []
3357         for source in jwplayer_sources_data:
3358             if not isinstance(source, dict):
3359                 continue
3360             source_url = urljoin(
3361                 base_url, self._proto_relative_url(source.get('file')))
3362             if not source_url or source_url in urls:
3363                 continue
3364             urls.append(source_url)
3365             source_type = source.get('type') or ''
3366             ext = mimetype2ext(source_type) or determine_ext(source_url)
3367             if source_type == 'hls' or ext == 'm3u8':
3368                 formats.extend(self._extract_m3u8_formats(
3369                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3370                     m3u8_id=m3u8_id, fatal=False))
3371             elif source_type == 'dash' or ext == 'mpd':
3372                 formats.extend(self._extract_mpd_formats(
3373                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3374             elif ext == 'smil':
3375                 formats.extend(self._extract_smil_formats(
3376                     source_url, video_id, fatal=False))
3377             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3378             elif source_type.startswith('audio') or ext in (
3379                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3380                 formats.append({
3381                     'url': source_url,
3382                     'vcodec': 'none',
3383                     'ext': ext,
3384                 })
3385             else:
3386                 height = int_or_none(source.get('height'))
3387                 if height is None:
3388                     # Often no height is provided but there is a label in
3389                     # format like "1080p", "720p SD", or 1080.
3390                     height = int_or_none(self._search_regex(
3391                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3392                         'height', default=None))
3393                 a_format = {
3394                     'url': source_url,
3395                     'width': int_or_none(source.get('width')),
3396                     'height': height,
3397                     'tbr': int_or_none(source.get('bitrate')),
3398                     'ext': ext,
3399                 }
3400                 if source_url.startswith('rtmp'):
3401                     a_format['ext'] = 'flv'
3402                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3403                     # of jwplayer.flash.swf
3404                     rtmp_url_parts = re.split(
3405                         r'((?:mp4|mp3|flv):)', source_url, 1)
3406                     if len(rtmp_url_parts) == 3:
3407                         rtmp_url, prefix, play_path = rtmp_url_parts
3408                         a_format.update({
3409                             'url': rtmp_url,
3410                             'play_path': prefix + play_path,
3411                         })
3412                     if rtmp_params:
3413                         a_format.update(rtmp_params)
3414                 formats.append(a_format)
3415         return formats
3416
3417     def _live_title(self, name):
3418         """ Generate the title for a live video """
3419         now = datetime.datetime.now()
3420         now_str = now.strftime('%Y-%m-%d %H:%M')
3421         return name + ' ' + now_str
3422
3423     def _int(self, v, name, fatal=False, **kwargs):
3424         res = int_or_none(v, **kwargs)
3425         if 'get_attr' in kwargs:
3426             print(getattr(v, kwargs['get_attr']))
3427         if res is None:
3428             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3429             if fatal:
3430                 raise ExtractorError(msg)
3431             else:
3432                 self.report_warning(msg)
3433         return res
3434
3435     def _float(self, v, name, fatal=False, **kwargs):
3436         res = float_or_none(v, **kwargs)
3437         if res is None:
3438             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3439             if fatal:
3440                 raise ExtractorError(msg)
3441             else:
3442                 self.report_warning(msg)
3443         return res
3444
3445     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3446                     path='/', secure=False, discard=False, rest={}, **kwargs):
3447         cookie = compat_cookiejar_Cookie(
3448             0, name, value, port, port is not None, domain, True,
3449             domain.startswith('.'), path, True, secure, expire_time,
3450             discard, None, None, rest)
3451         self._downloader.cookiejar.set_cookie(cookie)
3452
3453     def _get_cookies(self, url):
3454         """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3455         req = sanitized_Request(url)
3456         self._downloader.cookiejar.add_cookie_header(req)
3457         return compat_cookies_SimpleCookie(req.get_header('Cookie'))
3458
3459     def _apply_first_set_cookie_header(self, url_handle, cookie):
3460         """
3461         Apply first Set-Cookie header instead of the last. Experimental.
3462
3463         Some sites (e.g. [1-3]) may serve two cookies under the same name
3464         in Set-Cookie header and expect the first (old) one to be set rather
3465         than second (new). However, as of RFC6265 the newer one cookie
3466         should be set into cookie store what actually happens.
3467         We will workaround this issue by resetting the cookie to
3468         the first one manually.
3469         1. https://new.vk.com/
3470         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3471         3. https://learning.oreilly.com/
3472         """
3473         for header, cookies in url_handle.headers.items():
3474             if header.lower() != 'set-cookie':
3475                 continue
3476             if sys.version_info[0] >= 3:
3477                 cookies = cookies.encode('iso-8859-1')
3478             cookies = cookies.decode('utf-8')
3479             cookie_value = re.search(
3480                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3481             if cookie_value:
3482                 value, domain = cookie_value.groups()
3483                 self._set_cookie(domain, cookie, value)
3484                 break
3485
3486     def get_testcases(self, include_onlymatching=False):
3487         t = getattr(self, '_TEST', None)
3488         if t:
3489             assert not hasattr(self, '_TESTS'), \
3490                 '%s has _TEST and _TESTS' % type(self).__name__
3491             tests = [t]
3492         else:
3493             tests = getattr(self, '_TESTS', [])
3494         for t in tests:
3495             if not include_onlymatching and t.get('only_matching', False):
3496                 continue
3497             t['name'] = type(self).__name__[:-len('IE')]
3498             yield t
3499
3500     def is_suitable(self, age_limit):
3501         """ Test whether the extractor is generally suitable for the given
3502         age limit (i.e. pornographic sites are not, all others usually are) """
3503
3504         any_restricted = False
3505         for tc in self.get_testcases(include_onlymatching=False):
3506             if tc.get('playlist', []):
3507                 tc = tc['playlist'][0]
3508             is_restricted = age_restricted(
3509                 tc.get('info_dict', {}).get('age_limit'), age_limit)
3510             if not is_restricted:
3511                 return True
3512             any_restricted = any_restricted or is_restricted
3513         return not any_restricted
3514
3515     def extract_subtitles(self, *args, **kwargs):
3516         if (self.get_param('writesubtitles', False)
3517                 or self.get_param('listsubtitles')):
3518             return self._get_subtitles(*args, **kwargs)
3519         return {}
3520
3521     def _get_subtitles(self, *args, **kwargs):
3522         raise NotImplementedError('This method must be implemented by subclasses')
3523
3524     def extract_comments(self, *args, **kwargs):
3525         if not self.get_param('getcomments'):
3526             return None
3527         generator = self._get_comments(*args, **kwargs)
3528
3529         def extractor():
3530             comments = []
3531             try:
3532                 while True:
3533                     comments.append(next(generator))
3534             except KeyboardInterrupt:
3535                 interrupted = True
3536                 self.to_screen('Interrupted by user')
3537             except StopIteration:
3538                 interrupted = False
3539             comment_count = len(comments)
3540             self.to_screen(f'Extracted {comment_count} comments')
3541             return {
3542                 'comments': comments,
3543                 'comment_count': None if interrupted else comment_count
3544             }
3545         return extractor
3546
3547     def _get_comments(self, *args, **kwargs):
3548         raise NotImplementedError('This method must be implemented by subclasses')
3549
3550     @staticmethod
3551     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3552         """ Merge subtitle items for one language. Items with duplicated URLs
3553         will be dropped. """
3554         list1_urls = set([item['url'] for item in subtitle_list1])
3555         ret = list(subtitle_list1)
3556         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
3557         return ret
3558
3559     @classmethod
3560     def _merge_subtitles(cls, *dicts, target=None):
3561         """ Merge subtitle dictionaries, language by language. """
3562         if target is None:
3563             target = {}
3564         for d in dicts:
3565             for lang, subs in d.items():
3566                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3567         return target
3568
3569     def extract_automatic_captions(self, *args, **kwargs):
3570         if (self.get_param('writeautomaticsub', False)
3571                 or self.get_param('listsubtitles')):
3572             return self._get_automatic_captions(*args, **kwargs)
3573         return {}
3574
3575     def _get_automatic_captions(self, *args, **kwargs):
3576         raise NotImplementedError('This method must be implemented by subclasses')
3577
3578     def mark_watched(self, *args, **kwargs):
3579         if not self.get_param('mark_watched', False):
3580             return
3581         if (self._get_login_info()[0] is not None
3582                 or self.get_param('cookiefile')
3583                 or self.get_param('cookiesfrombrowser')):
3584             self._mark_watched(*args, **kwargs)
3585
3586     def _mark_watched(self, *args, **kwargs):
3587         raise NotImplementedError('This method must be implemented by subclasses')
3588
3589     def geo_verification_headers(self):
3590         headers = {}
3591         geo_verification_proxy = self.get_param('geo_verification_proxy')
3592         if geo_verification_proxy:
3593             headers['Ytdl-request-proxy'] = geo_verification_proxy
3594         return headers
3595
3596     def _generic_id(self, url):
3597         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3598
3599     def _generic_title(self, url):
3600         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3601
3602     @staticmethod
3603     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3604         all_known = all(map(
3605             lambda x: x is not None,
3606             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3607         return (
3608             'private' if is_private
3609             else 'premium_only' if needs_premium
3610             else 'subscriber_only' if needs_subscription
3611             else 'needs_auth' if needs_auth
3612             else 'unlisted' if is_unlisted
3613             else 'public' if all_known
3614             else None)
3615
3616     def _configuration_arg(self, key, default=NO_DEFAULT, casesense=False):
3617         '''
3618         @returns            A list of values for the extractor argument given by "key"
3619                             or "default" if no such key is present
3620         @param default      The default value to return when the key is not present (default: [])
3621         @param casesense    When false, the values are converted to lower case
3622         '''
3623         val = traverse_obj(
3624             self._downloader.params, ('extractor_args', self.ie_key().lower(), key))
3625         if val is None:
3626             return [] if default is NO_DEFAULT else default
3627         return list(val) if casesense else [x.lower() for x in val]
3628
3629
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|N):{query}, where N is a
    positive integer: no prefix requests one result, N requests N results
    and "all" requests _MAX_RESULTS results.
    Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
    """

    # Upper bound on the number of results; unlimited unless overridden
    _MAX_RESULTS = float('inf')

    @classmethod
    def _make_valid_url(cls):
        """Return the regex that search pseudo-URLs of this extractor must match"""
        pattern = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
        return pattern

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search pseudo-URL of this extractor"""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """
        Parse the search pseudo-URL and return the requested number of results.

        Raises ExtractorError for a query that does not match; a request
        for more than _MAX_RESULTS results is capped with a warning.
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = mobj.group('prefix', 'query')
        if prefix == '':
            wanted = 1
        elif prefix == 'all':
            wanted = self._MAX_RESULTS
        else:
            wanted = int(prefix)
            if wanted <= 0:
                raise ExtractorError('invalid download number %s for query "%s"' % (wanted, query))
            if wanted > self._MAX_RESULTS:
                self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, wanted))
                wanted = self._MAX_RESULTS
        return self._get_n_results(query, wanted)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        # islice wants None, not infinity, for "no upper limit"
        limit = None if n == float('inf') else n
        results = itertools.islice(self._search_results(query), 0, limit)
        return self.playlist_result(results, query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        """The _SEARCH_KEY of this extractor"""
        return self._SEARCH_KEY