# coding: utf-8
from __future__ import unicode_literals

import base64
import datetime
import hashlib
import itertools
import json
import math
import netrc
import os
import random
import re
import sys
import time

from ..compat import (
    compat_cookiejar_Cookie,
    compat_cookies_SimpleCookie,
    compat_etree_Element,
    compat_etree_fromstring,
    compat_expanduser,
    compat_getpass,
    compat_http_client,
    compat_os_name,
    compat_str,
    compat_urllib_error,
    compat_urllib_parse_unquote,
    compat_urllib_parse_urlencode,
    compat_urllib_request,
    compat_urlparse,
    compat_xml_parse_error,
)
from ..downloader import FileDownloader
from ..downloader.f4m import (
    get_base_url,
    remove_encrypted_media,
)
from ..utils import (
    age_restricted,
    base_url,
    bug_reports_message,
    clean_html,
    compiled_regex_type,
    determine_ext,
    determine_protocol,
    dict_get,
    error_to_compat_str,
    extract_attributes,
    ExtractorError,
    fix_xml_ampersands,
    float_or_none,
    format_field,
    GeoRestrictedError,
    GeoUtils,
    int_or_none,
    join_nonempty,
    js_to_json,
    JSON_LD_RE,
    mimetype2ext,
    network_exceptions,
    NO_DEFAULT,
    orderedSet,
    parse_bitrate,
    parse_codecs,
    parse_duration,
    parse_iso8601,
    parse_m3u8_attributes,
    parse_resolution,
    RegexNotFoundError,
    sanitize_filename,
    sanitized_Request,
    str_or_none,
    str_to_int,
    strip_or_none,
    traverse_obj,
    unescapeHTML,
    UnsupportedError,
    unified_strdate,
    unified_timestamp,
    update_Request,
    update_url_query,
    url_basename,
    url_or_none,
    urljoin,
    variadic,
    xpath_element,
    xpath_text,
    xpath_with_ns,
)


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the YoutubeDL. The YoutubeDL processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The type field determines the type of the result.
    By far the most common value (and the default if _type is missing) is
    "video", which indicates a single video.

    For a video, the dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        The mandatory URL representing the media:
                                     for plain file media - HTTP URL of this file,
                                     for RTMP - RTMP URL,
                                     for HLS - URL of the M3U8 media playlist,
                                     for HDS - URL of the F4M manifest,
                                     for DASH
                                       - HTTP URL to plain file media (in case of
                                         unfragmented media)
                                       - URL of the MPD manifest or base URL
                                         representing the media if MPD manifest
                                         is parsed from a string (in case of
                                         fragmented media)
                                     for MSS - URL of the ISM manifest.
                    * manifest_url
                                 The URL of the manifest file in case of
                                 fragmented media:
                                     for HLS - URL of the M3U8 master playlist,
                                     for HDS - URL of the F4M manifest,
                                     for DASH - URL of the MPD manifest,
                                     for MSS - URL of the ISM manifest.
                    * ext        Will be calculated from URL if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * dynamic_range The dynamic range of the video. One of:
                                 "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * fps        Frame rate
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp", "rtmp_ffmpeg", "rtmpe",
                                 "m3u8", "m3u8_native" or "http_dash_segments".
                    * fragment_base_url
                                 Base URL for fragments. Each fragment's path
                                 value (if present) will be relative to
                                 this URL.
                    * fragments  A list of fragments of a fragmented media.
                                 Each fragment entry must contain either a url
                                 or a path. If a url is present it should be
                                 considered by a client. Otherwise both path and
                                 fragment_base_url must be present. Here is
                                 the list of all potential fields:
                                 * "url" - fragment's URL
                                 * "path" - fragment's path relative to
                                            fragment_base_url
                                 * "duration" (optional, int or float)
                                 * "filesize" (optional, int)
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                                 < -1000 to hide the format (if there is
                                 another one which is strictly better)
                    * language   Language code, e.g. "de" or "en-US".
                    * language_preference  Is this in the language mentioned in
                                 the URL?
                                 10 if it's what the URL is about,
                                 -1 for default (don't know),
                                 -10 otherwise, other values reserved for now.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * source_preference  Order number for this video source
                                 (quality takes higher priority)
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * http_headers  A dictionary of additional HTTP headers
                                 to add to the request.
                    * stretched_ratio  If given and not 1, indicates that the
                                 video's pixels are not square.
                                 width : height ratio as float.
                    * no_resume  The server does not support resuming the
                                 (HTTP or RTMP) download. Boolean.
                    * has_drm    The format has DRM and cannot be downloaded. Boolean
                    * downloader_options  A dictionary of downloader options as
                                 described in FileDownloader
                    RTMP formats can also have the additional fields: page_url,
                    app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
                    rtmp_protocol, rtmp_real_time

    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

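    A minimal illustrative info dict (all values hypothetical) might
    therefore look like:

        {
            'id': '42',
            'title': 'Example video',
            'formats': [{
                'url': 'https://example.com/video.mp4',
                'ext': 'mp4',
                'format_id': 'http-720p',
                'width': 1280,
                'height': 720,
            }],
        }
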
    The following fields are optional:

    alt_title:      A secondary title of the video.
    display_id:     An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "id" (optional, string) - Thumbnail format ID
                        * "url"
                        * "preference" (optional, int) - quality of the image
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
                        * "filesize" (optional, int)
    thumbnail:      Full URL to a video thumbnail image.
    description:    Full video description.
    uploader:       Full name of the video uploader.
    license:        License name the video is licensed under.
    creator:        The creator of the video.
    release_timestamp: UNIX timestamp of the moment the video was released.
    release_date:   The date (YYYYMMDD) when the video was released.
    timestamp:      UNIX timestamp of the moment the video was uploaded
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp.
    uploader_id:    Nickname or id of the video uploader.
    uploader_url:   Full URL to a personal webpage of the video uploader.
    channel:        Full name of the channel the video is uploaded on.
                    Note that channel fields may or may not repeat uploader
                    fields. This depends on a particular extractor.
    channel_id:     Id of the channel.
    channel_url:    Full URL to a channel webpage.
    location:       Physical location where the video was filmed.
    subtitles:      The available subtitles as a dictionary in the format
                    {tag: subformats}. "tag" is usually a language code, and
                    "subformats" is a list sorted from lower to higher
                    preference, each element is a dictionary with the "ext"
                    entry and one of:
                        * "data": The subtitles file contents
                        * "url": A URL pointing to the subtitles file
                    It can optionally also have:
                        * "name": Name or description of the subtitles
                    "ext" will be calculated from URL if missing
    automatic_captions: Like 'subtitles'; contains automatically generated
                    captions instead of normal subtitles
    duration:       Length of the video in seconds, as an integer or float.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
    comment_count:  Number of comments on the video
    comments:       A list of comments, each with one or more of the following
                    properties (all but one of text or html optional):
                        * "author" - human-readable name of the comment author
                        * "author_id" - user ID of the comment author
                        * "author_thumbnail" - The thumbnail of the comment author
                        * "id" - Comment ID
                        * "html" - Comment as HTML
                        * "text" - Plain text of the comment
                        * "timestamp" - UNIX timestamp of comment
                        * "parent" - ID of the comment this one is replying to.
                                     Set to "root" to indicate that this is a
                                     comment to the original video.
                        * "like_count" - Number of positive ratings of the comment
                        * "dislike_count" - Number of negative ratings of the comment
                        * "is_favorited" - Whether the comment is marked as
                                           favorite by the video uploader
                        * "author_is_uploader" - Whether the comment is made by
                                                 the video uploader
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage; if given to yt-dlp it
                    should allow getting the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]
    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
    cast:           A list of the video cast
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.
    was_live:       True, False, or None (=unknown). Whether this video was
                    originally a live stream.
    live_status:    'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
                    If absent, automatically set from is_live, was_live
    start_time:     Time in seconds where the reproduction should start, as
                    specified in the URL.
    end_time:       Time in seconds where the reproduction should end, as
                    specified in the URL.
    chapters:       A list of dictionaries, with the following entries:
                        * "start_time" - The start time of the chapter in seconds
                        * "end_time" - The end time of the chapter in seconds
                        * "title" (optional, string)
    playable_in_embed: Whether this video is allowed to play in embedded
                    players on other sites. Can be True (=always allowed),
                    False (=never allowed), None (=unknown), or a string
                    specifying the criteria for embedability (e.g. 'whitelist')
    availability:   Under what condition the video is available. One of
                    'private', 'premium_only', 'subscriber_only', 'needs_auth',
                    'unlisted' or 'public'. Use 'InfoExtractor._availability'
                    to set it
    __post_extractor: A function to be called just before the metadata is
                    written to either disk, logger or console. The function
                    must return a dict which will be added to the info_dict.
                    This is useful for additional information that is
                    time-consuming to extract. Note that the fields thus
                    extracted will not be available to the output template and
                    match_filter. So, only "comments" and "comment_count" are
                    currently allowed to be extracted via this method.

    The following fields should only be used when the video belongs to some logical
    chapter or section:

    chapter:        Name or title of the chapter the video belongs to.
    chapter_number: Number of the chapter the video belongs to, as an integer.
    chapter_id:     Id of the chapter the video belongs to, as a unicode string.

    The following fields should only be used when the video is an episode of some
    series, programme or podcast:

    series:         Title of the series or programme the video episode belongs to.
    series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
    season:         Title of the season the video episode belongs to.
    season_number:  Number of the season the video episode belongs to, as an integer.
    season_id:      Id of the season the video episode belongs to, as a unicode string.
    episode:        Title of the video episode. Unlike the mandatory video title field,
                    this field should denote the exact title of the video episode
                    without any kind of decoration.
    episode_number: Number of the video episode within a season, as an integer.
    episode_id:     Id of the video episode, as a unicode string.

    The following fields should only be used when the media is a track or a part of
    a music album:

    track:          Title of the track.
    track_number:   Number of the track within an album or a disc, as an integer.
    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
                    as a unicode string.
    artist:         Artist(s) of the track.
    genre:          Genre(s) of the track.
    album:          Title of the album the track belongs to.
    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
    album_artist:   List of all artists that appeared on the album (e.g.
                    "Ash Borer / Fell Voices" or "Various Artists", useful for splits
                    and compilations).
    disc_number:    Number of the disc or other physical medium the track belongs to,
                    as an integer.
    release_year:   Year (YYYY) when the album was released.

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.


    _type "playlist" indicates multiple videos.
    There must be a key "entries", which is a list, an iterable, or a PagedList
    object, each element of which is a valid dictionary by this specification.

    Additionally, playlists can have "id", "title", and any other relevant
    attributes with the same semantics as videos (see above).
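
    For instance, a playlist result (values hypothetical, with each entry
    being a valid video info dict) might look like:

        {
            '_type': 'playlist',
            'id': 'channel-42',
            'title': 'Example channel',
            'entries': [video_info_dict_1, video_info_dict_2],
        }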


    _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
    It must have an entries key like a playlist and contain all the keys
    required for a video at the same time.


    _type "url" indicates that the video must be extracted from another
    location, possibly by a different extractor. Its only required key is:
    "url" - the next URL to extract.
    The key "ie_key" can be set to the class name (minus the trailing "IE",
    e.g. "Youtube") if the extractor class is known in advance.
    Additionally, the dictionary may have any properties of the resolved entity
    known in advance, for example "title" if the title of the referred video is
    known ahead of time.
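
    For instance (illustrative values):

        {
            '_type': 'url',
            'url': 'https://www.youtube.com/watch?v=BaW_jenozKc',
            'ie_key': 'Youtube',
            'title': 'Example video',
        }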


    _type "url_transparent" entities have the same specification as "url", but
    indicate that the given additional information is more precise than the one
    associated with the resolved URL.
    This is useful when a site employs a video service that hosts the video and
    its technical metadata, but that video service does not embed a useful
    title, description etc.


    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    Subclasses may also override suitable() if necessary, but ensure the function
    signature is preserved and that this function imports everything it needs
    (except other extractors), so that lazy_extractors works correctly

    _GEO_BYPASS attribute may be set to False in order to disable
    geo restriction bypass mechanisms for a particular extractor.
    Though it won't disable explicit geo restriction bypass based on
    country code provided with geo_bypass_country.

    _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
    countries for this extractor. One of these countries will be used by
    geo restriction bypass mechanism right away in order to bypass
    geo restriction, of course, if the mechanism is not disabled.

    _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
    IP blocks in CIDR notation for this extractor. One of these IP blocks
    will be used by geo restriction bypass mechanism similarly
    to _GEO_COUNTRIES.

    The _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _x_forwarded_for_ip = None
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    _WORKING = True

    _LOGIN_HINTS = {
        'any': 'Use --cookies, --username and --password, or --netrc to provide account credentials',
        'cookies': (
            'Use --cookies-from-browser or --cookies for the authentication. '
            'See https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl for how to manually pass cookies'),
        'password': 'Use --username and --password, or --netrc to provide account credentials',
    }

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader (a YoutubeDL instance).
        If a downloader is not passed during initialization,
        it must be set using "set_downloader()" before "extract()" is called"""
        self._ready = False
        self._x_forwarded_for_ip = None
        self._printed_messages = set()
        self.set_downloader(downloader)

    @classmethod
    def _match_valid_url(cls, url):
        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # This function must import everything it needs (except other extractors),
        # so that lazy_extractors works correctly
        return cls._match_valid_url(url) is not None

    @classmethod
    def _match_id(cls, url):
        return cls._match_valid_url(url).group('id')

    @classmethod
    def get_temp_id(cls, url):
        try:
            return cls._match_id(url)
        except (IndexError, AttributeError):
            return None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._printed_messages = set()
        self._initialize_geo_bypass({
            'countries': self._GEO_COUNTRIES,
            'ip_blocks': self._GEO_IP_BLOCKS,
        })
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def _initialize_geo_bypass(self, geo_bypass_context):
        """
        Initialize geo restriction bypass mechanism.

        This method is used to initialize geo bypass mechanism based on faking
        X-Forwarded-For HTTP header. A random country from provided country list
        is selected and a random IP belonging to this country is generated. This
        IP will be passed as X-Forwarded-For HTTP header in all subsequent
        HTTP requests.

        This method will be used for initial geo bypass mechanism initialization
        during the instance initialization with _GEO_COUNTRIES and
        _GEO_IP_BLOCKS.

        You may also manually call it from extractor's code if geo bypass
        information is not available beforehand (e.g. obtained during
        extraction) or due to some other reason. In this case you should pass
        this information in geo bypass context passed as first argument. It may
        contain the following fields:

        countries:  List of geo unrestricted countries (similar
                    to _GEO_COUNTRIES)
        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                    (similar to _GEO_IP_BLOCKS)

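        For example (illustrative values):

            self._initialize_geo_bypass({
                'countries': ['DE', 'FR'],
                'ip_blocks': ['192.168.1.0/24'],
            })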
530 """
531 if not self._x_forwarded_for_ip:
532
533 # Geo bypass mechanism is explicitly disabled by user
534 if not self.get_param('geo_bypass', True):
535 return
536
537 if not geo_bypass_context:
538 geo_bypass_context = {}
539
540 # Backward compatibility: previously _initialize_geo_bypass
541 # expected a list of countries, some 3rd party code may still use
542 # it this way
543 if isinstance(geo_bypass_context, (list, tuple)):
544 geo_bypass_context = {
545 'countries': geo_bypass_context,
546 }
547
548 # The whole point of geo bypass mechanism is to fake IP
549 # as X-Forwarded-For HTTP header based on some IP block or
550 # country code.
551
552 # Path 1: bypassing based on IP block in CIDR notation
553
554 # Explicit IP block specified by user, use it right away
555 # regardless of whether extractor is geo bypassable or not
556 ip_block = self.get_param('geo_bypass_ip_block', None)
557
558 # Otherwise use random IP block from geo bypass context but only
559 # if extractor is known as geo bypassable
560 if not ip_block:
561 ip_blocks = geo_bypass_context.get('ip_blocks')
562 if self._GEO_BYPASS and ip_blocks:
563 ip_block = random.choice(ip_blocks)
564
565 if ip_block:
566 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
567 self._downloader.write_debug(
568 '[debug] Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
569 return
570
571 # Path 2: bypassing based on country code
572
573 # Explicit country code specified by user, use it right away
574 # regardless of whether extractor is geo bypassable or not
575 country = self.get_param('geo_bypass_country', None)
576
577 # Otherwise use random country code from geo bypass context but
578 # only if extractor is known as geo bypassable
579 if not country:
580 countries = geo_bypass_context.get('countries')
581 if self._GEO_BYPASS and countries:
582 country = random.choice(countries)
583
584 if country:
585 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
586 self._downloader.write_debug(
587 'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))
588
589 def extract(self, url):
590 """Extracts URL information and returns it in list of dicts."""
591 try:
592 for _ in range(2):
593 try:
594 self.initialize()
595 self.write_debug('Extracting URL: %s' % url)
596 ie_result = self._real_extract(url)
597 if ie_result is None:
598 return None
599 if self._x_forwarded_for_ip:
600 ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
601 subtitles = ie_result.get('subtitles')
602 if (subtitles and 'live_chat' in subtitles
603 and 'no-live-chat' in self.get_param('compat_opts', [])):
604 del subtitles['live_chat']
605 return ie_result
606 except GeoRestrictedError as e:
607 if self.__maybe_fake_ip_and_retry(e.countries):
608 continue
609 raise
610 except UnsupportedError:
611 raise
612 except ExtractorError as e:
613 kwargs = {
614 'video_id': e.video_id or self.get_temp_id(url),
615 'ie': self.IE_NAME,
616 'tb': e.traceback,
617 'expected': e.expected,
618 'cause': e.cause
619 }
620 if hasattr(e, 'countries'):
621 kwargs['countries'] = e.countries
622 raise type(e)(e.msg, **kwargs)
623 except compat_http_client.IncompleteRead as e:
624 raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
625 except (KeyError, StopIteration) as e:
626 raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
627
628 def __maybe_fake_ip_and_retry(self, countries):
629 if (not self.get_param('geo_bypass_country', None)
630 and self._GEO_BYPASS
631 and self.get_param('geo_bypass', True)
632 and not self._x_forwarded_for_ip
633 and countries):
634 country_code = random.choice(countries)
635 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
636 if self._x_forwarded_for_ip:
637 self.report_warning(
638 'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
639 % (self._x_forwarded_for_ip, country_code.upper()))
640 return True
641 return False

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        return compat_str(type(self).__name__[:-2])

    @staticmethod
    def __can_accept_status_code(err, expected_status):
        assert isinstance(err, compat_urllib_error.HTTPError)
        if expected_status is None:
            return False
        elif callable(expected_status):
            return expected_status(err.code) is True
        else:
            return err.code in variadic(expected_status)

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
        """
        Return the response handle.

        See _download_webpage docstring for arguments specification.
        """
        if not self._downloader._first_webpage_request:
            sleep_interval = self.get_param('sleep_interval_requests') or 0
            if sleep_interval > 0:
                self.to_screen('Sleeping %s seconds ...' % sleep_interval)
                time.sleep(sleep_interval)
        else:
            self._downloader._first_webpage_request = False

        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen('%s' % (note,))
            else:
                self.to_screen('%s: %s' % (video_id, note))

        # Some sites check X-Forwarded-For HTTP header in order to figure out
        # the origin of the client behind proxy. This allows bypassing geo
        # restriction by faking this header's value to IP that belongs to some
        # geo unrestricted country. We will do so once we encounter any
        # geo restriction error.
        if self._x_forwarded_for_ip:
            if 'X-Forwarded-For' not in headers:
                headers['X-Forwarded-For'] = self._x_forwarded_for_ip

        if isinstance(url_or_request, compat_urllib_request.Request):
            url_or_request = update_Request(
                url_or_request, data=data, headers=headers, query=query)
        else:
            if query:
                url_or_request = update_url_query(url_or_request, query)
            if data is not None or headers:
                url_or_request = sanitized_Request(url_or_request, data, headers)
        try:
            return self._downloader.urlopen(url_or_request)
        except network_exceptions as err:
            if isinstance(err, compat_urllib_error.HTTPError):
                if self.__can_accept_status_code(err, expected_status):
                    # Retain reference to error to prevent file object from
                    # being closed before it can be read. Works around the
                    # effects of <https://bugs.python.org/issue15002>
                    # introduced in Python 3.4.1.
                    err.fp._error = err
                    return err.fp

            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
        """
        Return a tuple (page content as string, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
        if urlh is False:
            assert not fatal
            return False
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
        return (content, urlh)

    @staticmethod
    def _guess_encoding_from_content(content_type, webpage_bytes):
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'

        return encoding

    def __check_blocked(self, content):
        first_block = content[:512]
        if ('<title>Access to this site is blocked</title>' in content
                and 'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
                and 'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)

    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self.get_param('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self.get_param('write_pages', False):
            basen = '%s_%s' % (video_id, urlh.geturl())
            trim_length = self.get_param('trim_file_name') or 240
            if len(basen) > trim_length:
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:trim_length - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content

    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=5, encoding=None, data=None,
            headers={}, query={}, expected_status=None):
        """
        Return the data of the page as a string.

        Arguments:
        url_or_request -- plain text URL as a string or
            a compat_urllib_request.Request object
        video_id -- Video/playlist/item identifier (string)

        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractorError to be raised,
            otherwise a warning will be reported and extraction continued
        tries -- number of tries
        timeout -- sleep interval between tries
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows accepting failed HTTP requests (non 2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                  accept
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx)
            which are always accepted.
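
        For example (illustrative), to accept a 404 page as valid content:

            webpage = self._download_webpage(
                url, video_id, expected_status=404)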
873 """
874
875 success = False
876 try_count = 0
877 while success is False:
878 try:
879 res = self._download_webpage_handle(
880 url_or_request, video_id, note, errnote, fatal,
881 encoding=encoding, data=data, headers=headers, query=query,
882 expected_status=expected_status)
883 success = True
884 except compat_http_client.IncompleteRead as e:
885 try_count += 1
886 if try_count >= tries:
887 raise e
888 self._sleep(timeout, video_id)
889 if res is False:
890 return res
891 else:
892 content, _ = res
893 return content
894
895 def _download_xml_handle(
896 self, url_or_request, video_id, note='Downloading XML',
897 errnote='Unable to download XML', transform_source=None,
898 fatal=True, encoding=None, data=None, headers={}, query={},
899 expected_status=None):
900 """
901 Return a tuple (xml as an compat_etree_Element, URL handle).
902
903 See _download_webpage docstring for arguments specification.
904 """
905 res = self._download_webpage_handle(
906 url_or_request, video_id, note, errnote, fatal=fatal,
907 encoding=encoding, data=data, headers=headers, query=query,
908 expected_status=expected_status)
909 if res is False:
910 return res
911 xml_string, urlh = res
912 return self._parse_xml(
913 xml_string, video_id, transform_source=transform_source,
914 fatal=fatal), urlh
915
916 def _download_xml(
917 self, url_or_request, video_id,
918 note='Downloading XML', errnote='Unable to download XML',
919 transform_source=None, fatal=True, encoding=None,
920 data=None, headers={}, query={}, expected_status=None):
921 """
922 Return the xml as an compat_etree_Element.
923
924 See _download_webpage docstring for arguments specification.
925 """
926 res = self._download_xml_handle(
927 url_or_request, video_id, note=note, errnote=errnote,
928 transform_source=transform_source, fatal=fatal, encoding=encoding,
929 data=data, headers=headers, query=query,
930 expected_status=expected_status)
931 return res if res is False else res[0]
932
933 def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
934 if transform_source:
935 xml_string = transform_source(xml_string)
936 try:
937 return compat_etree_fromstring(xml_string.encode('utf-8'))
938 except compat_xml_parse_error as ve:
939 errmsg = '%s: Failed to parse XML ' % video_id
940 if fatal:
941 raise ExtractorError(errmsg, cause=ve)
942 else:
943 self.report_warning(errmsg + str(ve))
944
945 def _download_json_handle(
946 self, url_or_request, video_id, note='Downloading JSON metadata',
947 errnote='Unable to download JSON metadata', transform_source=None,
948 fatal=True, encoding=None, data=None, headers={}, query={},
949 expected_status=None):
950 """
951 Return a tuple (JSON object, URL handle).
952
953 See _download_webpage docstring for arguments specification.
954 """
955 res = self._download_webpage_handle(
956 url_or_request, video_id, note, errnote, fatal=fatal,
957 encoding=encoding, data=data, headers=headers, query=query,
958 expected_status=expected_status)
959 if res is False:
960 return res
961 json_string, urlh = res
962 return self._parse_json(
963 json_string, video_id, transform_source=transform_source,
964 fatal=fatal), urlh
965
966 def _download_json(
967 self, url_or_request, video_id, note='Downloading JSON metadata',
968 errnote='Unable to download JSON metadata', transform_source=None,
969 fatal=True, encoding=None, data=None, headers={}, query={},
970 expected_status=None):
971 """
972 Return the JSON object as a dict.
973
974 See _download_webpage docstring for arguments specification.
975 """
976 res = self._download_json_handle(
977 url_or_request, video_id, note=note, errnote=errnote,
978 transform_source=transform_source, fatal=fatal, encoding=encoding,
979 data=data, headers=headers, query=query,
980 expected_status=expected_status)
981 return res if res is False else res[0]
982
983 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
984 if transform_source:
985 json_string = transform_source(json_string)
986 try:
987 return json.loads(json_string)
988 except ValueError as ve:
989 errmsg = '%s: Failed to parse JSON ' % video_id
990 if fatal:
991 raise ExtractorError(errmsg, cause=ve)
992 else:
993 self.report_warning(errmsg + str(ve))
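
    # A transform can be used to turn near-JSON (e.g. JavaScript object
    # literals) into valid JSON before parsing; illustrative usage, with
    # `js_object` standing in for some scraped string:
    #   data = self._parse_json(js_object, video_id, transform_source=js_to_json)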

    def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
        return self._parse_json(
            data[data.find('{'):data.rfind('}') + 1],
            video_id, transform_source, fatal)

    def _download_socket_json_handle(
            self, url_or_request, video_id, note='Polling socket',
            errnote='Unable to poll socket', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (JSON object, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        webpage, urlh = res
        return self._parse_socket_response_as_json(
            webpage, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_socket_json(
            self, url_or_request, video_id, note='Polling socket',
            errnote='Unable to poll socket', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return the JSON object as a dict.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_socket_json_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
        idstr = format_field(video_id, template='%s: ')
        msg = f'[{self.IE_NAME}] {idstr}{msg}'
        if only_once:
            if f'WARNING: {msg}' in self._printed_messages:
                return
            self._printed_messages.add(f'WARNING: {msg}')
        self._downloader.report_warning(msg, *args, **kwargs)

    def to_screen(self, msg, *args, **kwargs):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)

    def write_debug(self, msg, *args, **kwargs):
        self._downloader.write_debug('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)

    def get_param(self, name, default=None, *args, **kwargs):
        if self._downloader:
            return self._downloader.params.get(name, default, *args, **kwargs)
        return default

    def report_drm(self, video_id, partial=False):
        self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen('%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen('%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')

    def raise_login_required(
            self, msg='This video is only available for registered users',
            metadata_available=False, method='any'):
        if metadata_available and self.get_param('ignore_no_formats_error'):
            self.report_warning(msg)
            return
        if method is not None:
            msg = '%s. %s' % (msg, self._LOGIN_HINTS[method])
        raise ExtractorError(msg, expected=True)

    def raise_geo_restricted(
            self, msg='This video is not available from your location due to geo restriction',
            countries=None, metadata_available=False):
        if metadata_available and self.get_param('ignore_no_formats_error'):
            self.report_warning(msg)
        else:
            raise GeoRestrictedError(msg, countries=countries)

    def raise_no_formats(self, msg, expected=False, video_id=None):
        if expected and self.get_param('ignore_no_formats_error'):
            self.report_warning(msg, video_id)
        elif isinstance(msg, ExtractorError):
            raise msg
        else:
            raise ExtractorError(msg, expected=expected, video_id=video_id)

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None, video_title=None, **kwargs):
        """Returns a URL that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        video_info.update(kwargs)
        if video_id is not None:
            video_info['id'] = video_id
        if video_title is not None:
            video_info['title'] = video_title
        return video_info
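
    # Illustrative delegation to another extractor (`embed_url` is a
    # hypothetical variable holding a URL found in the page):
    #   return self.url_result(embed_url, ie='Youtube', video_id=video_id)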

    def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
        urls = orderedSet(
            self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
            for m in matches)
        return self.playlist_result(
            urls, playlist_id=playlist_id, playlist_title=playlist_title)

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        video_info.update(kwargs)
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        if playlist_description is not None:
            video_info['description'] = playlist_description
        return video_info

    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single pattern or
        a list of patterns, returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
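
        For example (illustrative pattern and variable names):

            title = self._search_regex(
                r'<h1 class="title">([^<]+)</h1>', webpage, 'title',
                default=None)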
1145 """
1146 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
1147 mobj = re.search(pattern, string, flags)
1148 else:
1149 for p in pattern:
1150 mobj = re.search(p, string, flags)
1151 if mobj:
1152 break
1153
1154 _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1155
1156 if mobj:
1157 if group is None:
1158 # return the first matching group
1159 return next(g for g in mobj.groups() if g is not None)
1160 elif isinstance(group, (list, tuple)):
1161 return tuple(mobj.group(g) for g in group)
1162 else:
1163 return mobj.group(group)
1164 elif default is not NO_DEFAULT:
1165 return default
1166 elif fatal:
1167 raise RegexNotFoundError('Unable to extract %s' % _name)
1168 else:
1169 self.report_warning('unable to extract %s' % _name + bug_reports_message())
1170 return None
1171
1172 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1173 """
1174 Like _search_regex, but strips HTML tags and unescapes entities.
1175 """
1176 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1177 if res:
1178 return clean_html(res).strip()
1179 else:
1180 return res
1181
1182 def _get_netrc_login_info(self, netrc_machine=None):
1183 username = None
1184 password = None
1185 netrc_machine = netrc_machine or self._NETRC_MACHINE
1186
1187 if self.get_param('usenetrc', False):
1188 try:
1189 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1190 if os.path.isdir(netrc_file):
1191 netrc_file = os.path.join(netrc_file, '.netrc')
1192 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1193 if info is not None:
1194 username = info[0]
1195 password = info[2]
1196 else:
1197 raise netrc.NetrcParseError(
1198 'No authenticators for %s' % netrc_machine)
1199 except (IOError, netrc.NetrcParseError) as err:
1200 self.report_warning(
1201 'parsing .netrc: %s' % error_to_compat_str(err))
1202
1203 return username, password
1204
1205 def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1206 """
1207 Get the login info as (username, password)
1208 First look for the manually specified credentials using username_option
1209 and password_option as keys in params dictionary. If no such credentials
1210 available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1211 value.
1212 If there's no info available, return (None, None)
1213 """
1214
1215 # Attempt to use provided username and password or .netrc data
1216 username = self.get_param(username_option)
1217 if username is not None:
1218 password = self.get_param(password_option)
1219 else:
1220 username, password = self._get_netrc_login_info(netrc_machine)
1221
1222 return username, password
1223
1224 def _get_tfa_info(self, note='two-factor verification code'):
1225 """
1226 Get the two-factor authentication info
1227 TODO - asking the user will be required for sms/phone verify
1228 currently just uses the command line option
1229 If there's no info available, return None
1230 """
1231
1232 tfa = self.get_param('twofactor')
1233 if tfa is not None:
1234 return tfa
1235
1236 return compat_getpass('Type %s and press [Return]: ' % note)
1237
1238 # Helper functions for extracting OpenGraph info
1239 @staticmethod
1240 def _og_regexes(prop):
1241 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1242 property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
1243 % {'prop': re.escape(prop)})
1244 template = r'<meta[^>]+?%s[^>]+?%s'
1245 return [
1246 template % (property_re, content_re),
1247 template % (content_re, property_re),
1248 ]
1249
1250 @staticmethod
1251 def _meta_regex(prop):
1252 return r'''(?isx)<meta
1253 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1254 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1255
1256 def _og_search_property(self, prop, html, name=None, **kargs):
1257 prop = variadic(prop)
1258 if name is None:
1259 name = 'OpenGraph %s' % prop[0]
1260 og_regexes = []
1261 for p in prop:
1262 og_regexes.extend(self._og_regexes(p))
1263 escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1264 if escaped is None:
1265 return None
1266 return unescapeHTML(escaped)
1267
1268 def _og_search_thumbnail(self, html, **kargs):
1269 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1270
1271 def _og_search_description(self, html, **kargs):
1272 return self._og_search_property('description', html, fatal=False, **kargs)
1273
1274 def _og_search_title(self, html, **kargs):
1275 return self._og_search_property('title', html, **kargs)
1276
1277 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1278 regexes = self._og_regexes('video') + self._og_regexes('video:url')
1279 if secure:
1280 regexes = self._og_regexes('video:secure_url') + regexes
1281 return self._html_search_regex(regexes, html, name, **kargs)
1282
1283 def _og_search_url(self, html, **kargs):
1284 return self._og_search_property('url', html, **kargs)
1285
1286 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1287 name = variadic(name)
1288 if display_name is None:
1289 display_name = name[0]
1290 return self._html_search_regex(
1291 [self._meta_regex(n) for n in name],
1292 html, display_name, fatal=fatal, group='content', **kwargs)
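
    # Several candidate meta names can be tried at once; illustrative usage:
    #   uploader = self._html_search_meta(
    #       ('author', 'twitter:creator'), webpage, 'uploader', default=None)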
1293
1294 def _dc_search_uploader(self, html):
1295 return self._html_search_meta('dc.creator', html, 'uploader')
1296
1297 def _rta_search(self, html):
1298 # See http://www.rtalabel.org/index.php?content=howtofaq#single
1299 if re.search(r'(?ix)<meta\s+name="rating"\s+'
1300 r' content="RTA-5042-1996-1400-1577-RTA"',
1301 html):
1302 return 18
1303 return 0
1304
1305 def _media_rating_search(self, html):
1306 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1307 rating = self._html_search_meta('rating', html)
1308
1309 if not rating:
1310 return None
1311
1312 RATING_TABLE = {
1313 'safe for kids': 0,
1314 'general': 8,
1315 '14 years': 14,
1316 'mature': 17,
1317 'restricted': 19,
1318 }
1319 return RATING_TABLE.get(rating.lower())
1320
1321 def _family_friendly_search(self, html):
1322 # See http://schema.org/VideoObject
1323 family_friendly = self._html_search_meta(
1324 'isFamilyFriendly', html, default=None)
1325
1326 if not family_friendly:
1327 return None
1328
1329 RATING_TABLE = {
1330 '1': 0,
1331 'true': 0,
1332 '0': 18,
1333 'false': 18,
1334 }
1335 return RATING_TABLE.get(family_friendly.lower())
1336
1337 def _twitter_search_player(self, html):
1338 return self._html_search_meta('twitter:player', html,
1339 'twitter card player')
1340
1341 def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1342 json_ld_list = list(re.finditer(JSON_LD_RE, html))
1343 default = kwargs.get('default', NO_DEFAULT)
1344 # JSON-LD may be malformed and thus `fatal` should be respected.
1345 # At the same time `default` may be passed that assumes `fatal=False`
1346 # for _search_regex. Let's simulate the same behavior here as well.
1347 fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
1348 json_ld = []
1349 for mobj in json_ld_list:
1350 json_ld_item = self._parse_json(
1351 mobj.group('json_ld'), video_id, fatal=fatal)
1352 if not json_ld_item:
1353 continue
1354 if isinstance(json_ld_item, dict):
1355 json_ld.append(json_ld_item)
1356 elif isinstance(json_ld_item, (list, tuple)):
1357 json_ld.extend(json_ld_item)
1358 if json_ld:
1359 json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1360 if json_ld:
1361 return json_ld
1362 if default is not NO_DEFAULT:
1363 return default
1364 elif fatal:
1365 raise RegexNotFoundError('Unable to extract JSON-LD')
1366 else:
1367 self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1368 return {}
1369
1370 def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1371 if isinstance(json_ld, compat_str):
1372 json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1373 if not json_ld:
1374 return {}
1375 info = {}
1376 if not isinstance(json_ld, (list, tuple, dict)):
1377 return info
1378 if isinstance(json_ld, dict):
1379 json_ld = [json_ld]
1380
1381 INTERACTION_TYPE_MAP = {
1382 'CommentAction': 'comment',
1383 'AgreeAction': 'like',
1384 'DisagreeAction': 'dislike',
1385 'LikeAction': 'like',
1386 'DislikeAction': 'dislike',
1387 'ListenAction': 'view',
1388 'WatchAction': 'view',
1389 'ViewAction': 'view',
1390 }
1391
1392 def extract_interaction_type(e):
1393 interaction_type = e.get('interactionType')
1394 if isinstance(interaction_type, dict):
1395 interaction_type = interaction_type.get('@type')
1396 return str_or_none(interaction_type)
1397
1398 def extract_interaction_statistic(e):
1399 interaction_statistic = e.get('interactionStatistic')
1400 if isinstance(interaction_statistic, dict):
1401 interaction_statistic = [interaction_statistic]
1402 if not isinstance(interaction_statistic, list):
1403 return
1404 for is_e in interaction_statistic:
1405 if not isinstance(is_e, dict):
1406 continue
1407 if is_e.get('@type') != 'InteractionCounter':
1408 continue
1409 interaction_type = extract_interaction_type(is_e)
1410 if not interaction_type:
1411 continue
1412 # For interaction count some sites provide string instead of
1413 # an integer (as per spec) with non digit characters (e.g. ",")
1414 # so extracting count with more relaxed str_to_int
1415 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1416 if interaction_count is None:
1417 continue
1418 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1419 if not count_kind:
1420 continue
1421 count_key = '%s_count' % count_kind
1422 if info.get(count_key) is not None:
1423 continue
1424 info[count_key] = interaction_count
1425
1426 def extract_video_object(e):
1427 assert e['@type'] == 'VideoObject'
1428 author = e.get('author')
1429 info.update({
1430 'url': url_or_none(e.get('contentUrl')),
1431 'title': unescapeHTML(e.get('name')),
1432 'description': unescapeHTML(e.get('description')),
1433 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
1434 'duration': parse_duration(e.get('duration')),
1435 'timestamp': unified_timestamp(e.get('uploadDate')),
1436 # author can be an instance of 'Organization' or 'Person' types.
1437 # both types can have 'name' property(inherited from 'Thing' type). [1]
1438 # however some websites are using 'Text' type instead.
1439 # 1. https://schema.org/VideoObject
1440 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
1441 'filesize': float_or_none(e.get('contentSize')),
1442 'tbr': int_or_none(e.get('bitrate')),
1443 'width': int_or_none(e.get('width')),
1444 'height': int_or_none(e.get('height')),
1445 'view_count': int_or_none(e.get('interactionCount')),
1446 })
1447 extract_interaction_statistic(e)
1448
1449 for e in json_ld:
1450 if '@context' in e:
1451 item_type = e.get('@type')
1452 if expected_type is not None and expected_type != item_type:
1453 continue
1454 if item_type in ('TVEpisode', 'Episode'):
1455 episode_name = unescapeHTML(e.get('name'))
1456 info.update({
1457 'episode': episode_name,
1458 'episode_number': int_or_none(e.get('episodeNumber')),
1459 'description': unescapeHTML(e.get('description')),
1460 })
1461 if not info.get('title') and episode_name:
1462 info['title'] = episode_name
1463 part_of_season = e.get('partOfSeason')
1464 if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1465 info.update({
1466 'season': unescapeHTML(part_of_season.get('name')),
1467 'season_number': int_or_none(part_of_season.get('seasonNumber')),
1468 })
1469 part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1470 if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1471 info['series'] = unescapeHTML(part_of_series.get('name'))
1472 elif item_type == 'Movie':
1473 info.update({
1474 'title': unescapeHTML(e.get('name')),
1475 'description': unescapeHTML(e.get('description')),
1476 'duration': parse_duration(e.get('duration')),
1477 'timestamp': unified_timestamp(e.get('dateCreated')),
1478 })
1479 elif item_type in ('Article', 'NewsArticle'):
1480 info.update({
1481 'timestamp': parse_iso8601(e.get('datePublished')),
1482 'title': unescapeHTML(e.get('headline')),
1483 'description': unescapeHTML(e.get('articleBody')),
1484 })
1485 elif item_type == 'VideoObject':
1486 extract_video_object(e)
1487 if expected_type is None:
1488 continue
1489 else:
1490 break
1491 video = e.get('video')
1492 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1493 extract_video_object(video)
1494 if expected_type is None:
1495 continue
1496 else:
1497 break
1498 return dict((k, v) for k, v in info.items() if v is not None)
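# Illustrative example (not from the original source): given the JSON-LD
#   {"@context": "https://schema.org", "@type": "VideoObject",
#    "name": "Test", "duration": "PT1M30S",
#    "interactionStatistic": [{"@type": "InteractionCounter",
#        "interactionType": "https://schema.org/WatchAction",
#        "userInteractionCount": "1,000"}]}
# the parser above would yield {'title': 'Test', 'duration': 90, 'view_count': 1000}:
# 'WatchAction' maps to 'view' via INTERACTION_TYPE_MAP, and the comma in
# "1,000" is tolerated by str_to_int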
1499
1500 def _search_nextjs_data(self, webpage, video_id, **kw):
1501 return self._parse_json(
1502 self._search_regex(
1503 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1504 webpage, 'next.js data', **kw),
1505 video_id, **kw)
1506
1507 @staticmethod
1508 def _hidden_inputs(html):
1509 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1510 hidden_inputs = {}
1511 for input in re.findall(r'(?i)(<input[^>]+>)', html):
1512 attrs = extract_attributes(input)
1513 if not attrs:
1514 continue
1515 if attrs.get('type') not in ('hidden', 'submit'):
1516 continue
1517 name = attrs.get('name') or attrs.get('id')
1518 value = attrs.get('value')
1519 if name and value is not None:
1520 hidden_inputs[name] = value
1521 return hidden_inputs
1522
1523 def _form_hidden_inputs(self, form_id, html):
1524 form = self._search_regex(
1525 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1526 html, '%s form' % form_id, group='form')
1527 return self._hidden_inputs(form)
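# Illustrative example (not from the original source): for HTML containing
#   <form id="login"><input type="hidden" name="csrf" value="abc"></form>
# _form_hidden_inputs('login', html) isolates the form body and returns
# {'csrf': 'abc'} via _hidden_inputs()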
1528
1529 class FormatSort:
1530 regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
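# Illustrative examples of strings matched by the regex above:
# 'res:1080' -> field='res', separator=':', limit='1080'
#   (prefer the largest resolution not above 1080);
# 'res~720'  -> separator='~' prefers the value closest to 720;
# '+res'     -> reverse='+' prefers the smallest value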
1531
1532 default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
1533 'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
1534 'proto', 'ext', 'hasaud', 'source', 'format_id') # These must not be aliases
1535 ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
1536 'height', 'width', 'proto', 'vext', 'abr', 'aext',
1537 'fps', 'fs_approx', 'source', 'format_id')
1538
1539 settings = {
1540 'vcodec': {'type': 'ordered', 'regex': True,
1541 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1542 'acodec': {'type': 'ordered', 'regex': True,
1543 'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
1544 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
1545 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
1546 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1547 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', '.*dash', 'ws|websocket', '', 'mms|rtsp', 'none', 'f4']},
1548 'vext': {'type': 'ordered', 'field': 'video_ext',
1549 'order': ('mp4', 'webm', 'flv', '', 'none'),
1550 'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1551 'aext': {'type': 'ordered', 'field': 'audio_ext',
1552 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1553 'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1554 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1555 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
1556 'field': ('vcodec', 'acodec'),
1557 'function': lambda it: int(any(v != 'none' for v in it))},
1558 'ie_pref': {'priority': True, 'type': 'extractor'},
1559 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1560 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1561 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
1562 'quality': {'convert': 'float', 'default': -1},
1563 'filesize': {'convert': 'bytes'},
1564 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1565 'id': {'convert': 'string', 'field': 'format_id'},
1566 'height': {'convert': 'float_none'},
1567 'width': {'convert': 'float_none'},
1568 'fps': {'convert': 'float_none'},
1569 'tbr': {'convert': 'float_none'},
1570 'vbr': {'convert': 'float_none'},
1571 'abr': {'convert': 'float_none'},
1572 'asr': {'convert': 'float_none'},
1573 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
1574
1575 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1576 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1577 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1578 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1579 'res': {'type': 'multiple', 'field': ('height', 'width'),
1580 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
1581
1582 # Most of these exist only for compatibility reasons
1583 'dimension': {'type': 'alias', 'field': 'res'},
1584 'resolution': {'type': 'alias', 'field': 'res'},
1585 'extension': {'type': 'alias', 'field': 'ext'},
1586 'bitrate': {'type': 'alias', 'field': 'br'},
1587 'total_bitrate': {'type': 'alias', 'field': 'tbr'},
1588 'video_bitrate': {'type': 'alias', 'field': 'vbr'},
1589 'audio_bitrate': {'type': 'alias', 'field': 'abr'},
1590 'framerate': {'type': 'alias', 'field': 'fps'},
1591 'language_preference': {'type': 'alias', 'field': 'lang'}, # not named as 'language' because such a field exists
1592 'protocol': {'type': 'alias', 'field': 'proto'},
1593 'source_preference': {'type': 'alias', 'field': 'source'},
1594 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1595 'filesize_estimate': {'type': 'alias', 'field': 'size'},
1596 'samplerate': {'type': 'alias', 'field': 'asr'},
1597 'video_ext': {'type': 'alias', 'field': 'vext'},
1598 'audio_ext': {'type': 'alias', 'field': 'aext'},
1599 'video_codec': {'type': 'alias', 'field': 'vcodec'},
1600 'audio_codec': {'type': 'alias', 'field': 'acodec'},
1601 'video': {'type': 'alias', 'field': 'hasvid'},
1602 'has_video': {'type': 'alias', 'field': 'hasvid'},
1603 'audio': {'type': 'alias', 'field': 'hasaud'},
1604 'has_audio': {'type': 'alias', 'field': 'hasaud'},
1605 'extractor': {'type': 'alias', 'field': 'ie_pref'},
1606 'preference': {'type': 'alias', 'field': 'ie_pref'},
1607 'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
1608 'format_id': {'type': 'alias', 'field': 'id'},
1609 }
1610
1611 _order = []
1612
1613 def _get_field_setting(self, field, key):
1614 if field not in self.settings:
1615 self.settings[field] = {}
1616 propObj = self.settings[field]
1617 if key not in propObj:
1618 type = propObj.get('type')
1619 if key == 'field':
1620 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1621 elif key == 'convert':
1622 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1623 else:
1624 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1625 propObj[key] = default
1626 return propObj[key]
1627
1628 def _resolve_field_value(self, field, value, convertNone=False):
1629 if value is None:
1630 if not convertNone:
1631 return None
1632 else:
1633 value = value.lower()
1634 conversion = self._get_field_setting(field, 'convert')
1635 if conversion == 'ignore':
1636 return None
1637 if conversion == 'string':
1638 return value
1639 elif conversion == 'float_none':
1640 return float_or_none(value)
1641 elif conversion == 'bytes':
1642 return FileDownloader.parse_bytes(value)
1643 elif conversion == 'order':
1644 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1645 use_regex = self._get_field_setting(field, 'regex')
1646 list_length = len(order_list)
1647 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1648 if use_regex and value is not None:
1649 for i, regex in enumerate(order_list):
1650 if regex and re.match(regex, value):
1651 return list_length - i
1652 return list_length - empty_pos # not in list
1653 else: # not regex or value is None
1654 return list_length - (order_list.index(value) if value in order_list else empty_pos)
1655 else:
1656 if value.isnumeric():
1657 return float(value)
1658 else:
1659 self.settings[field]['convert'] = 'string'
1660 return value
1661
1662 def evaluate_params(self, params, sort_extractor):
1663 self._use_free_order = params.get('prefer_free_formats', False)
1664 self._sort_user = params.get('format_sort', [])
1665 self._sort_extractor = sort_extractor
1666
1667 def add_item(field, reverse, closest, limit_text):
1668 field = field.lower()
1669 if field in self._order:
1670 return
1671 self._order.append(field)
1672 limit = self._resolve_field_value(field, limit_text)
1673 data = {
1674 'reverse': reverse,
1675 'closest': False if limit is None else closest,
1676 'limit_text': limit_text,
1677 'limit': limit}
1678 if field in self.settings:
1679 self.settings[field].update(data)
1680 else:
1681 self.settings[field] = data
1682
1683 sort_list = (
1684 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1685 + (tuple() if params.get('format_sort_force', False)
1686 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1687 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1688
1689 for item in sort_list:
1690 match = re.match(self.regex, item)
1691 if match is None:
1692 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1693 field = match.group('field')
1694 if field is None:
1695 continue
1696 if self._get_field_setting(field, 'type') == 'alias':
1697 field = self._get_field_setting(field, 'field')
1698 reverse = match.group('reverse') is not None
1699 closest = match.group('separator') == '~'
1700 limit_text = match.group('limit')
1701
1702 has_limit = limit_text is not None
1703 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1704 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1705
1706 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1707 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
1708 limit_count = len(limits)
1709 for (i, f) in enumerate(fields):
1710 add_item(f, reverse, closest,
1711 limits[i] if i < limit_count
1712 else limits[0] if has_limit and not has_multiple_limits
1713 else None)
1714
1715 def print_verbose_info(self, write_debug):
1716 if self._sort_user:
1717 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1718 if self._sort_extractor:
1719 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1720 write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1721 '+' if self._get_field_setting(field, 'reverse') else '', field,
1722 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1723 self._get_field_setting(field, 'limit_text'),
1724 self._get_field_setting(field, 'limit'))
1725 if self._get_field_setting(field, 'limit_text') is not None else '')
1726 for field in self._order if self._get_field_setting(field, 'visible')]))
1727
1728 def _calculate_field_preference_from_value(self, format, field, type, value):
1729 reverse = self._get_field_setting(field, 'reverse')
1730 closest = self._get_field_setting(field, 'closest')
1731 limit = self._get_field_setting(field, 'limit')
1732
1733 if type == 'extractor':
1734 maximum = self._get_field_setting(field, 'max')
1735 if value is None or (maximum is not None and value >= maximum):
1736 value = -1
1737 elif type == 'boolean':
1738 in_list = self._get_field_setting(field, 'in_list')
1739 not_in_list = self._get_field_setting(field, 'not_in_list')
1740 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1741 elif type == 'ordered':
1742 value = self._resolve_field_value(field, value, True)
1743
1744 # try to convert to number
1745 val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1746 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1747 if is_num:
1748 value = val_num
1749
1750 return ((-10, 0) if value is None
1751 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
1752 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1753 else (0, value, 0) if not reverse and (limit is None or value <= limit)
1754 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1755 else (-1, value, 0))
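# Illustrative example: with 'res:720' (limit 720, not reversed, not
# closest), a 720p format yields (0, 720.0, 0) while a 1080p one yields
# (0, -1080.0, 0); larger tuples rank better, so the 720p format wins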
1756
1757 def _calculate_field_preference(self, format, field):
1758 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
1759 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1760 if type == 'multiple':
1761 type = 'field' # Only 'field' is allowed in multiple for now
1762 actual_fields = self._get_field_setting(field, 'field')
1763
1764 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1765 else:
1766 value = get_value(field)
1767 return self._calculate_field_preference_from_value(format, field, type, value)
1768
1769 def calculate_preference(self, format):
1770 # Determine missing protocol
1771 if not format.get('protocol'):
1772 format['protocol'] = determine_protocol(format)
1773
1774 # Determine missing ext
1775 if not format.get('ext') and 'url' in format:
1776 format['ext'] = determine_ext(format['url'])
1777 if format.get('vcodec') == 'none':
1778 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1779 format['video_ext'] = 'none'
1780 else:
1781 format['video_ext'] = format['ext']
1782 format['audio_ext'] = 'none'
1783 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
1784 # format['preference'] = -1000
1785
1786 # Determine missing bitrates
1787 if format.get('tbr') is None:
1788 if format.get('vbr') is not None and format.get('abr') is not None:
1789 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1790 else:
1791 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1792 format['vbr'] = format.get('tbr') - format.get('abr', 0)
1793 if format.get('acodec') != 'none' and format.get('abr') is None:
1794 format['abr'] = format.get('tbr') - format.get('vbr', 0)
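# Illustrative example: vbr=2500 and abr=128 give tbr=2628; conversely,
# tbr=2628 with abr=128 and a missing vbr gives vbr=2500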
1795
1796 return tuple(self._calculate_field_preference(format, field) for field in self._order)
1797
1798 def _sort_formats(self, formats, field_preference=[]):
1799 if not formats:
1800 return
1801 format_sort = self.FormatSort() # params and to_screen are taken from the downloader
1802 format_sort.evaluate_params(self._downloader.params, field_preference)
1803 if self.get_param('verbose', False):
1804 format_sort.print_verbose_info(self._downloader.write_debug)
1805 formats.sort(key=lambda f: format_sort.calculate_preference(f))
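# Illustrative usage from an extractor, after collecting formats:
#   self._sort_formats(formats)
# or, giving certain fields priority over the defaults:
#   self._sort_formats(formats, ('res', 'br'))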
1806
1807 def _check_formats(self, formats, video_id):
1808 if formats:
1809 formats[:] = filter(
1810 lambda f: self._is_valid_url(
1811 f['url'], video_id,
1812 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1813 formats)
1814
1815 @staticmethod
1816 def _remove_duplicate_formats(formats):
1817 format_urls = set()
1818 unique_formats = []
1819 for f in formats:
1820 if f['url'] not in format_urls:
1821 format_urls.add(f['url'])
1822 unique_formats.append(f)
1823 formats[:] = unique_formats
1824
1825 def _is_valid_url(self, url, video_id, item='video', headers={}):
1826 url = self._proto_relative_url(url, scheme='http:')
1827 # For now, assume non-HTTP(S) URLs are always valid
1828 if not (url.startswith('http://') or url.startswith('https://')):
1829 return True
1830 try:
1831 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1832 return True
1833 except ExtractorError as e:
1834 self.to_screen(
1835 '%s: %s URL is invalid, skipping: %s'
1836 % (video_id, item, error_to_compat_str(e.cause)))
1837 return False
1838
1839 def http_scheme(self):
1840 """ Either "http:" or "https:", depending on the user's preferences """
1841 return (
1842 'http:'
1843 if self.get_param('prefer_insecure', False)
1844 else 'https:')
1845
1846 def _proto_relative_url(self, url, scheme=None):
1847 if url is None:
1848 return url
1849 if url.startswith('//'):
1850 if scheme is None:
1851 scheme = self.http_scheme()
1852 return scheme + url
1853 else:
1854 return url
1855
1856 def _sleep(self, timeout, video_id, msg_template=None):
1857 if msg_template is None:
1858 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1859 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1860 self.to_screen(msg)
1861 time.sleep(timeout)
1862
1863 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1864 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1865 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1866 manifest = self._download_xml(
1867 manifest_url, video_id, 'Downloading f4m manifest',
1868 'Unable to download f4m manifest',
1869 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1870 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1871 transform_source=transform_source,
1872 fatal=fatal, data=data, headers=headers, query=query)
1873
1874 if manifest is False:
1875 return []
1876
1877 return self._parse_f4m_formats(
1878 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1879 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1880
1881 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1882 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1883 fatal=True, m3u8_id=None):
1884 if not isinstance(manifest, compat_etree_Element) and not fatal:
1885 return []
1886
1887 # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1888 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1889 if akamai_pv is not None and ';' in akamai_pv.text:
1890 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1891 if playerVerificationChallenge.strip() != '':
1892 return []
1893
1894 formats = []
1895 manifest_version = '1.0'
1896 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1897 if not media_nodes:
1898 manifest_version = '2.0'
1899 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1900 # Remove unsupported DRM-protected media from the final format
1901 # renditions (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1902 media_nodes = remove_encrypted_media(media_nodes)
1903 if not media_nodes:
1904 return formats
1905
1906 manifest_base_url = get_base_url(manifest)
1907
1908 bootstrap_info = xpath_element(
1909 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1910 'bootstrap info', default=None)
1911
1912 vcodec = None
1913 mime_type = xpath_text(
1914 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1915 'mime type', default=None)
1916 if mime_type and mime_type.startswith('audio/'):
1917 vcodec = 'none'
1918
1919 for i, media_el in enumerate(media_nodes):
1920 tbr = int_or_none(media_el.attrib.get('bitrate'))
1921 width = int_or_none(media_el.attrib.get('width'))
1922 height = int_or_none(media_el.attrib.get('height'))
1923 format_id = join_nonempty(f4m_id, tbr or i)
1924 # If <bootstrapInfo> is present, the specified f4m is a
1925 # stream-level manifest, and only set-level manifests may refer to
1926 # external resources. See section 11.4 and section 4 of F4M spec
1927 if bootstrap_info is None:
1928 media_url = None
1929 # @href is introduced in 2.0, see section 11.6 of F4M spec
1930 if manifest_version == '2.0':
1931 media_url = media_el.attrib.get('href')
1932 if media_url is None:
1933 media_url = media_el.attrib.get('url')
1934 if not media_url:
1935 continue
1936 manifest_url = (
1937 media_url if media_url.startswith('http://') or media_url.startswith('https://')
1938 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1939 # If media_url is itself an f4m manifest, do the recursive extraction,
1940 # since bitrates in the parent manifest (this one) and in the media_url
1941 # manifest may differ, making it impossible to resolve the format by
1942 # the requested bitrate in the f4m downloader
1943 ext = determine_ext(manifest_url)
1944 if ext == 'f4m':
1945 f4m_formats = self._extract_f4m_formats(
1946 manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1947 transform_source=transform_source, fatal=fatal)
1948 # Sometimes a stream-level manifest contains a single media entry that
1949 # lacks any quality metadata (e.g. http://matchtv.ru/#live-player),
1950 # while the parent's media entry in the set-level manifest may
1951 # contain it. We will copy it from the parent in such cases.
1952 if len(f4m_formats) == 1:
1953 f = f4m_formats[0]
1954 f.update({
1955 'tbr': f.get('tbr') or tbr,
1956 'width': f.get('width') or width,
1957 'height': f.get('height') or height,
1958 'format_id': f.get('format_id') if not tbr else format_id,
1959 'vcodec': vcodec,
1960 })
1961 formats.extend(f4m_formats)
1962 continue
1963 elif ext == 'm3u8':
1964 formats.extend(self._extract_m3u8_formats(
1965 manifest_url, video_id, 'mp4', preference=preference,
1966 quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1967 continue
1968 formats.append({
1969 'format_id': format_id,
1970 'url': manifest_url,
1971 'manifest_url': manifest_url,
1972 'ext': 'flv' if bootstrap_info is not None else None,
1973 'protocol': 'f4m',
1974 'tbr': tbr,
1975 'width': width,
1976 'height': height,
1977 'vcodec': vcodec,
1978 'preference': preference,
1979 'quality': quality,
1980 })
1981 return formats
1982
1983 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1984 return {
1985 'format_id': join_nonempty(m3u8_id, 'meta'),
1986 'url': m3u8_url,
1987 'ext': ext,
1988 'protocol': 'm3u8',
1989 'preference': preference - 100 if preference else -100,
1990 'quality': quality,
1991 'resolution': 'multiple',
1992 'format_note': 'Quality selection URL',
1993 }
1994
1995 def _report_ignoring_subs(self, name):
1996 self.report_warning(bug_reports_message(
1997 f'Ignoring subtitle tracks found in the {name} manifest; '
1998 'if any subtitle tracks are missing,'
1999 ), only_once=True)
2000
2001 def _extract_m3u8_formats(self, *args, **kwargs):
2002 fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2003 if subs:
2004 self._report_ignoring_subs('HLS')
2005 return fmts
2006
2007 def _extract_m3u8_formats_and_subtitles(
2008 self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2009 preference=None, quality=None, m3u8_id=None, note=None,
2010 errnote=None, fatal=True, live=False, data=None, headers={},
2011 query={}):
2012
2013 res = self._download_webpage_handle(
2014 m3u8_url, video_id,
2015 note='Downloading m3u8 information' if note is None else note,
2016 errnote='Failed to download m3u8 information' if errnote is None else errnote,
2017 fatal=fatal, data=data, headers=headers, query=query)
2018
2019 if res is False:
2020 return [], {}
2021
2022 m3u8_doc, urlh = res
2023 m3u8_url = urlh.geturl()
2024
2025 return self._parse_m3u8_formats_and_subtitles(
2026 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2027 preference=preference, quality=quality, m3u8_id=m3u8_id,
2028 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2029 headers=headers, query=query, video_id=video_id)
2030
2031 def _parse_m3u8_formats_and_subtitles(
2032 self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8_native',
2033 preference=None, quality=None, m3u8_id=None, live=False, note=None,
2034 errnote=None, fatal=True, data=None, headers={}, query={},
2035 video_id=None):
2036 formats, subtitles = [], {}
2037
2038 if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
2039 return formats, subtitles
2040
2041 has_drm = re.search(r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', m3u8_doc)
2042
2043 def format_url(url):
2044 return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
2045
2046 if self.get_param('hls_split_discontinuity', False):
2047 def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2048 if not m3u8_doc:
2049 if not manifest_url:
2050 return []
2051 m3u8_doc = self._download_webpage(
2052 manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2053 note=False, errnote='Failed to download m3u8 playlist information')
2054 if m3u8_doc is False:
2055 return []
2056 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2057
2058 else:
2059 def _extract_m3u8_playlist_indices(*args, **kwargs):
2060 return [None]
2061
2062 # References:
2063 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2064 # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2065 # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2066
2067 # We should try extracting formats only from master playlists [1, 4.3.4],
2068 # i.e. playlists that describe the available qualities. On the other hand,
2069 # media playlists [1, 4.3.3] should be returned as is, since they contain
2070 # just the media without quality renditions.
2071 # Fortunately, a master playlist can easily be distinguished from a media
2072 # playlist based on the availability of particular tags. As per [1, 4.3.3, 4.3.4],
2073 # master playlist tags MUST NOT appear in a media playlist and vice versa.
2074 # As per [1, 4.3.3.1], the #EXT-X-TARGETDURATION tag is REQUIRED for every
2075 # media playlist and MUST NOT appear in a master playlist, so we can
2076 # reliably detect a media playlist with this criterion.
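# Illustrative example: a media playlist looks like
#   #EXTM3U
#   #EXT-X-TARGETDURATION:10
#   #EXTINF:9.009,
#   segment0.ts
# whereas a master playlist instead carries #EXT-X-STREAM-INF entries
# pointing at such media playlists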
2077
2078 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
2079 formats = [{
2080 'format_id': join_nonempty(m3u8_id, idx),
2081 'format_index': idx,
2082 'url': m3u8_url,
2083 'ext': ext,
2084 'protocol': entry_protocol,
2085 'preference': preference,
2086 'quality': quality,
2087 'has_drm': has_drm,
2088 } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2089
2090 return formats, subtitles
2091
2092 groups = {}
2093 last_stream_inf = {}
2094
2095 def extract_media(x_media_line):
2096 media = parse_m3u8_attributes(x_media_line)
2097 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2098 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2099 if not (media_type and group_id and name):
2100 return
2101 groups.setdefault(group_id, []).append(media)
2102 # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2103 if media_type == 'SUBTITLES':
2104 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2105 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2106 # However, lack of URI has been spotted in the wild.
2107 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2108 if not media.get('URI'):
2109 return
2110 url = format_url(media['URI'])
2111 sub_info = {
2112 'url': url,
2113 'ext': determine_ext(url),
2114 }
2115 if sub_info['ext'] == 'm3u8':
2116 # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2117 # files may contain is WebVTT:
2118 # <https://tools.ietf.org/html/rfc8216#section-3.1>
2119 sub_info['ext'] = 'vtt'
2120 sub_info['protocol'] = 'm3u8_native'
2121 lang = media.get('LANGUAGE') or 'und'
2122 subtitles.setdefault(lang, []).append(sub_info)
2123 if media_type not in ('VIDEO', 'AUDIO'):
2124 return
2125 media_url = media.get('URI')
2126 if media_url:
2127 manifest_url = format_url(media_url)
2128 formats.extend({
2129 'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2130 'format_note': name,
2131 'format_index': idx,
2132 'url': manifest_url,
2133 'manifest_url': m3u8_url,
2134 'language': media.get('LANGUAGE'),
2135 'ext': ext,
2136 'protocol': entry_protocol,
2137 'preference': preference,
2138 'quality': quality,
2139 'vcodec': 'none' if media_type == 'AUDIO' else None,
2140 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2141
2142 def build_stream_name():
2143 # Although the specification does not mention the NAME attribute for
2144 # the EXT-X-STREAM-INF tag, it may still sometimes be present (see [1]
2145 # or the vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2146 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2147 stream_name = last_stream_inf.get('NAME')
2148 if stream_name:
2149 return stream_name
2150 # If there is no NAME in EXT-X-STREAM-INF, it will be obtained
2151 # from the corresponding rendition group
2152 stream_group_id = last_stream_inf.get('VIDEO')
2153 if not stream_group_id:
2154 return
2155 stream_group = groups.get(stream_group_id)
2156 if not stream_group:
2157 return stream_group_id
2158 rendition = stream_group[0]
2159 return rendition.get('NAME') or stream_group_id
2160
2161 # Parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have a
2162 # chance to detect video-only formats when EXT-X-STREAM-INF tags
2163 # precede EXT-X-MEDIA tags in an HLS manifest such as [3].
2164 for line in m3u8_doc.splitlines():
2165 if line.startswith('#EXT-X-MEDIA:'):
2166 extract_media(line)
2167
2168 for line in m3u8_doc.splitlines():
2169 if line.startswith('#EXT-X-STREAM-INF:'):
2170 last_stream_inf = parse_m3u8_attributes(line)
2171 elif line.startswith('#') or not line.strip():
2172 continue
2173 else:
2174 tbr = float_or_none(
2175 last_stream_inf.get('AVERAGE-BANDWIDTH')
2176 or last_stream_inf.get('BANDWIDTH'), scale=1000)
2177 manifest_url = format_url(line.strip())
2178
2179 for idx in _extract_m3u8_playlist_indices(manifest_url):
2180 format_id = [m3u8_id, None, idx]
2181 # The bandwidth of live streams may differ over time, making
2182 # format_id unpredictable, so it's better to keep the provided
2183 # format_id intact.
2184 if not live:
2185 stream_name = build_stream_name()
2186 format_id[1] = stream_name or '%d' % (tbr or len(formats))
2187 f = {
2188 'format_id': join_nonempty(*format_id),
2189 'format_index': idx,
2190 'url': manifest_url,
2191 'manifest_url': m3u8_url,
2192 'tbr': tbr,
2193 'ext': ext,
2194 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2195 'protocol': entry_protocol,
2196 'preference': preference,
2197 'quality': quality,
2198 }
2199 resolution = last_stream_inf.get('RESOLUTION')
2200 if resolution:
2201 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2202 if mobj:
2203 f['width'] = int(mobj.group('width'))
2204 f['height'] = int(mobj.group('height'))
2205 # Unified Streaming Platform
2206 mobj = re.search(
2207 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2208 if mobj:
2209 abr, vbr = mobj.groups()
2210 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2211 f.update({
2212 'vbr': vbr,
2213 'abr': abr,
2214 })
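# Illustrative example: a URL containing 'audio=128000-video=2800000'
# yields abr=128.0 and vbr=2800.0 (bits/s scaled to kbit/s)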
2215 codecs = parse_codecs(last_stream_inf.get('CODECS'))
2216 f.update(codecs)
2217 audio_group_id = last_stream_inf.get('AUDIO')
2218 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2219 # references a rendition group MUST have a CODECS attribute.
2220 # However, this is not always respected, for example, [2]
2221 # contains EXT-X-STREAM-INF tag which references AUDIO
2222 # rendition group but does not have CODECS and despite
2223 # referencing an audio group it represents a complete
2224 # (with audio and video) format. So, for such cases we will
2225 # ignore references to rendition groups and treat them
2226 # as complete formats.
2227 if audio_group_id and codecs and f.get('vcodec') != 'none':
2228 audio_group = groups.get(audio_group_id)
2229 if audio_group and audio_group[0].get('URI'):
2230 # TODO: update acodec for audio only formats with
2231 # the same GROUP-ID
2232 f['acodec'] = 'none'
2233 if not f.get('ext'):
2234 f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2235 formats.append(f)
2236
2237 # for DailyMotion
2238 progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2239 if progressive_uri:
2240 http_f = f.copy()
2241 del http_f['manifest_url']
2242 http_f.update({
2243 'format_id': f['format_id'].replace('hls-', 'http-'),
2244 'protocol': 'http',
2245 'url': progressive_uri,
2246 })
2247 formats.append(http_f)
2248
2249 last_stream_inf = {}
2250 return formats, subtitles
2251
2252 def _extract_m3u8_vod_duration(
2253 self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2254
2255 m3u8_vod = self._download_webpage(
2256 m3u8_vod_url, video_id,
2257 note='Downloading m3u8 VOD manifest' if note is None else note,
2258 errnote='Failed to download VOD manifest' if errnote is None else errnote,
2259 fatal=False, data=data, headers=headers, query=query)
2260
2261 return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2262
2263 def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2264 if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2265 return None
2266
2267 return int(sum(
2268 float(line[len('#EXTINF:'):].split(',')[0])
2269 for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
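# Illustrative example: a playlist containing
#   #EXT-X-PLAYLIST-TYPE:VOD
#   #EXTINF:10.0,
#   #EXTINF:9.5,
# yields int(10.0 + 9.5) = 19 seconds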
2270
2271 @staticmethod
2272 def _xpath_ns(path, namespace=None):
2273 if not namespace:
2274 return path
2275 out = []
2276 for c in path.split('/'):
2277 if not c or c == '.':
2278 out.append(c)
2279 else:
2280 out.append('{%s}%s' % (namespace, c))
2281 return '/'.join(out)
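# Illustrative example:
#   _xpath_ns('./head/meta', 'http://www.w3.org/2001/SMIL20/Language')
# returns './{http://www.w3.org/2001/SMIL20/Language}head/{http://www.w3.org/2001/SMIL20/Language}meta'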
2282
2283 def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2284 smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2285
2286 if smil is False:
2287 assert not fatal
2288 return []
2289
2290 namespace = self._parse_smil_namespace(smil)
2291
2292 fmts = self._parse_smil_formats(
2293 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2294 subs = self._parse_smil_subtitles(
2295 smil, namespace=namespace)
2296
2297 return fmts, subs
2298
2299 def _extract_smil_formats(self, *args, **kwargs):
2300 fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2301 if subs:
2302 self._report_ignoring_subs('SMIL')
2303 return fmts
2304
2305 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2306 smil = self._download_smil(smil_url, video_id, fatal=fatal)
2307 if smil is False:
2308 return {}
2309 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2310
2311 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2312 return self._download_xml(
2313 smil_url, video_id, 'Downloading SMIL file',
2314 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2315
2316 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2317 namespace = self._parse_smil_namespace(smil)
2318
2319 formats = self._parse_smil_formats(
2320 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2321 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2322
2323 video_id = os.path.splitext(url_basename(smil_url))[0]
2324 title = None
2325 description = None
2326 upload_date = None
2327 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2328 name = meta.attrib.get('name')
2329 content = meta.attrib.get('content')
2330 if not name or not content:
2331 continue
2332 if not title and name == 'title':
2333 title = content
2334 elif not description and name in ('description', 'abstract'):
2335 description = content
2336 elif not upload_date and name == 'date':
2337 upload_date = unified_strdate(content)
2338
2339 thumbnails = [{
2340 'id': image.get('type'),
2341 'url': image.get('src'),
2342 'width': int_or_none(image.get('width')),
2343 'height': int_or_none(image.get('height')),
2344 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2345
2346 return {
2347 'id': video_id,
2348 'title': title or video_id,
2349 'description': description,
2350 'upload_date': upload_date,
2351 'thumbnails': thumbnails,
2352 'formats': formats,
2353 'subtitles': subtitles,
2354 }
2355
2356 def _parse_smil_namespace(self, smil):
2357 return self._search_regex(
2358 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2359
2360 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2361 base = smil_url
2362 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2363 b = meta.get('base') or meta.get('httpBase')
2364 if b:
2365 base = b
2366 break
2367
2368 formats = []
2369 rtmp_count = 0
2370 http_count = 0
2371 m3u8_count = 0
2372 imgs_count = 0
2373
2374 srcs = set()
2375 media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2376 for medium in media:
2377 src = medium.get('src')
2378 if not src or src in srcs:
2379 continue
2380 srcs.add(src)
2381
2382 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2383 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2384 width = int_or_none(medium.get('width'))
2385 height = int_or_none(medium.get('height'))
2386 proto = medium.get('proto')
2387 ext = medium.get('ext')
2388 src_ext = determine_ext(src)
2389 streamer = medium.get('streamer') or base
2390
2391 if proto == 'rtmp' or streamer.startswith('rtmp'):
2392 rtmp_count += 1
2393 formats.append({
2394 'url': streamer,
2395 'play_path': src,
2396 'ext': 'flv',
2397 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2398 'tbr': bitrate,
2399 'filesize': filesize,
2400 'width': width,
2401 'height': height,
2402 })
2403 if transform_rtmp_url:
2404 streamer, src = transform_rtmp_url(streamer, src)
2405 formats[-1].update({
2406 'url': streamer,
2407 'play_path': src,
2408 })
2409 continue
2410
2411 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2412 src_url = src_url.strip()
2413
2414 if proto == 'm3u8' or src_ext == 'm3u8':
2415 m3u8_formats = self._extract_m3u8_formats(
2416 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2417 if len(m3u8_formats) == 1:
2418 m3u8_count += 1
2419 m3u8_formats[0].update({
2420 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2421 'tbr': bitrate,
2422 'width': width,
2423 'height': height,
2424 })
2425 formats.extend(m3u8_formats)
2426 elif src_ext == 'f4m':
2427 f4m_url = src_url
2428 if not f4m_params:
2429 f4m_params = {
2430 'hdcore': '3.2.0',
2431 'plugin': 'flowplayer-3.2.0.1',
2432 }
2433 f4m_url += '&' if '?' in f4m_url else '?'
2434 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2435 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2436 elif src_ext == 'mpd':
2437 formats.extend(self._extract_mpd_formats(
2438 src_url, video_id, mpd_id='dash', fatal=False))
2439 elif re.search(r'\.ism/[Mm]anifest', src_url):
2440 formats.extend(self._extract_ism_formats(
2441 src_url, video_id, ism_id='mss', fatal=False))
2442 elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2443 http_count += 1
2444 formats.append({
2445 'url': src_url,
2446 'ext': ext or src_ext or 'flv',
2447 'format_id': 'http-%d' % (bitrate or http_count),
2448 'tbr': bitrate,
2449 'filesize': filesize,
2450 'width': width,
2451 'height': height,
2452 })
2453
2454 for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2455 src = medium.get('src')
2456 if not src or src in srcs:
2457 continue
2458 srcs.add(src)
2459
2460 imgs_count += 1
2461 formats.append({
2462 'format_id': 'imagestream-%d' % (imgs_count),
2463 'url': src,
2464 'ext': mimetype2ext(medium.get('type')),
2465 'acodec': 'none',
2466 'vcodec': 'none',
2467 'width': int_or_none(medium.get('width')),
2468 'height': int_or_none(medium.get('height')),
2469 'format_note': 'SMIL storyboards',
2470 })
2471
2472 return formats
2473
2474 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2475 urls = []
2476 subtitles = {}
2477 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2478 src = textstream.get('src')
2479 if not src or src in urls:
2480 continue
2481 urls.append(src)
2482 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2483 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2484 subtitles.setdefault(lang, []).append({
2485 'url': src,
2486 'ext': ext,
2487 })
2488 return subtitles
2489
2490 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2491 xspf = self._download_xml(
2492 xspf_url, playlist_id, 'Downloading xspf playlist',
2493 'Unable to download xspf manifest', fatal=fatal)
2494 if xspf is False:
2495 return []
2496 return self._parse_xspf(
2497 xspf, playlist_id, xspf_url=xspf_url,
2498 xspf_base_url=base_url(xspf_url))
2499
2500 def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2501 NS_MAP = {
2502 'xspf': 'http://xspf.org/ns/0/',
2503 's1': 'http://static.streamone.nl/player/ns/0',
2504 }
2505
2506 entries = []
2507 for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2508 title = xpath_text(
2509 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2510 description = xpath_text(
2511 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2512 thumbnail = xpath_text(
2513 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2514 duration = float_or_none(
2515 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2516
2517 formats = []
2518 for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2519 format_url = urljoin(xspf_base_url, location.text)
2520 if not format_url:
2521 continue
2522 formats.append({
2523 'url': format_url,
2524 'manifest_url': xspf_url,
2525 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2526 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2527 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2528 })
2529 self._sort_formats(formats)
2530
2531 entries.append({
2532 'id': playlist_id,
2533 'title': title,
2534 'description': description,
2535 'thumbnail': thumbnail,
2536 'duration': duration,
2537 'formats': formats,
2538 })
2539 return entries
2540
2541 def _extract_mpd_formats(self, *args, **kwargs):
2542 fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2543 if subs:
2544 self._report_ignoring_subs('DASH')
2545 return fmts
2546
2547 def _extract_mpd_formats_and_subtitles(
2548 self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2549 fatal=True, data=None, headers={}, query={}):
2550 res = self._download_xml_handle(
2551 mpd_url, video_id,
2552 note='Downloading MPD manifest' if note is None else note,
2553 errnote='Failed to download MPD manifest' if errnote is None else errnote,
2554 fatal=fatal, data=data, headers=headers, query=query)
2555 if res is False:
2556 return [], {}
2557 mpd_doc, urlh = res
2558 if mpd_doc is None:
2559 return [], {}
2560 mpd_base_url = base_url(urlh.geturl())
2561
2562 return self._parse_mpd_formats_and_subtitles(
2563 mpd_doc, mpd_id, mpd_base_url, mpd_url)
2564
2565 def _parse_mpd_formats(self, *args, **kwargs):
2566 fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2567 if subs:
2568 self._report_ignoring_subs('DASH')
2569 return fmts
2570
2571 def _parse_mpd_formats_and_subtitles(
2572 self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2573 """
2574 Parse formats from MPD manifest.
2575 References:
2576 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2577 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2578 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2579 """
2580 if not self.get_param('dynamic_mpd', True):
2581 if mpd_doc.get('type') == 'dynamic':
2582 return [], {}
2583
2584 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2585
2586 def _add_ns(path):
2587 return self._xpath_ns(path, namespace)
2588
2589 def is_drm_protected(element):
2590 return element.find(_add_ns('ContentProtection')) is not None
2591
2592 def extract_multisegment_info(element, ms_parent_info):
2593 ms_info = ms_parent_info.copy()
2594
2595 # As per [1, 5.3.9.2.2], SegmentList and SegmentTemplate share some
2596 # common attributes and elements; we will only extract the ones
2597 # relevant for us.
2598 def extract_common(source):
2599 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2600 if segment_timeline is not None:
2601 s_e = segment_timeline.findall(_add_ns('S'))
2602 if s_e:
2603 ms_info['total_number'] = 0
2604 ms_info['s'] = []
2605 for s in s_e:
2606 r = int(s.get('r', 0))
2607 ms_info['total_number'] += 1 + r
2608 ms_info['s'].append({
2609 't': int(s.get('t', 0)),
2610 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2611 'd': int(s.attrib['d']),
2612 'r': r,
2613 })
2614 start_number = source.get('startNumber')
2615 if start_number:
2616 ms_info['start_number'] = int(start_number)
2617 timescale = source.get('timescale')
2618 if timescale:
2619 ms_info['timescale'] = int(timescale)
2620 segment_duration = source.get('duration')
2621 if segment_duration:
2622 ms_info['segment_duration'] = float(segment_duration)
2623
2624 def extract_Initialization(source):
2625 initialization = source.find(_add_ns('Initialization'))
2626 if initialization is not None:
2627 ms_info['initialization_url'] = initialization.attrib['sourceURL']
2628
2629 segment_list = element.find(_add_ns('SegmentList'))
2630 if segment_list is not None:
2631 extract_common(segment_list)
2632 extract_Initialization(segment_list)
2633 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2634 if segment_urls_e:
2635 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2636 else:
2637 segment_template = element.find(_add_ns('SegmentTemplate'))
2638 if segment_template is not None:
2639 extract_common(segment_template)
2640 media = segment_template.get('media')
2641 if media:
2642 ms_info['media'] = media
2643 initialization = segment_template.get('initialization')
2644 if initialization:
2645 ms_info['initialization'] = initialization
2646 else:
2647 extract_Initialization(segment_template)
2648 return ms_info
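# Illustrative example: a SegmentTimeline of <S t="0" d="4000" r="2"/>
# with timescale="1000" yields total_number=3 and
# s=[{'t': 0, 'd': 4000, 'r': 2}], i.e. three 4-second segments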
2649
2650 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2651 formats, subtitles = [], {}
2652 stream_numbers = {'audio': 0, 'video': 0}
2653 for period in mpd_doc.findall(_add_ns('Period')):
2654 period_duration = parse_duration(period.get('duration')) or mpd_duration
2655 period_ms_info = extract_multisegment_info(period, {
2656 'start_number': 1,
2657 'timescale': 1,
2658 })
2659 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2660 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2661 for representation in adaptation_set.findall(_add_ns('Representation')):
2662 representation_attrib = adaptation_set.attrib.copy()
2663 representation_attrib.update(representation.attrib)
2664 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2665 mime_type = representation_attrib['mimeType']
2666 content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2667
2668 codecs = representation_attrib.get('codecs', '')
2669 if content_type not in ('video', 'audio', 'text'):
2670 if mime_type == 'image/jpeg':
2671 content_type = mime_type
2672 elif codecs.split('.')[0] == 'stpp':
2673 content_type = 'text'
2674 elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2675 content_type = 'text'
2676 else:
2677 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2678 continue
2679
2680 base_url = ''
2681 for element in (representation, adaptation_set, period, mpd_doc):
2682 base_url_e = element.find(_add_ns('BaseURL'))
2683 if base_url_e is not None:
2684 base_url = base_url_e.text + base_url
2685 if re.match(r'^https?://', base_url):
2686 break
2687 if mpd_base_url and base_url.startswith('/'):
2688 base_url = compat_urlparse.urljoin(mpd_base_url, base_url)
2689 elif mpd_base_url and not re.match(r'^https?://', base_url):
2690 if not mpd_base_url.endswith('/'):
2691 mpd_base_url += '/'
2692 base_url = mpd_base_url + base_url
2693 representation_id = representation_attrib.get('id')
2694 lang = representation_attrib.get('lang')
2695 url_el = representation.find(_add_ns('BaseURL'))
2696 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2697 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2698 if representation_id is not None:
2699 format_id = representation_id
2700 else:
2701 format_id = content_type
2702 if mpd_id:
2703 format_id = mpd_id + '-' + format_id
2704 if content_type in ('video', 'audio'):
2705 f = {
2706 'format_id': format_id,
2707 'manifest_url': mpd_url,
2708 'ext': mimetype2ext(mime_type),
2709 'width': int_or_none(representation_attrib.get('width')),
2710 'height': int_or_none(representation_attrib.get('height')),
2711 'tbr': float_or_none(bandwidth, 1000),
2712 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2713 'fps': int_or_none(representation_attrib.get('frameRate')),
2714 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2715 'format_note': 'DASH %s' % content_type,
2716 'filesize': filesize,
2717 'container': mimetype2ext(mime_type) + '_dash',
2718 'manifest_stream_number': stream_numbers[content_type]
2719 }
2720 f.update(parse_codecs(codecs))
2721 stream_numbers[content_type] += 1
2722 elif content_type == 'text':
2723 f = {
2724 'ext': mimetype2ext(mime_type),
2725 'manifest_url': mpd_url,
2726 'filesize': filesize,
2727 }
2728 elif content_type == 'image/jpeg':
2729 # See test case in VikiIE
2730 # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2731 f = {
2732 'format_id': format_id,
2733 'ext': 'mhtml',
2734 'manifest_url': mpd_url,
2735 'format_note': 'DASH storyboards (jpeg)',
2736 'acodec': 'none',
2737 'vcodec': 'none',
2738 }
2739 if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2740 f['has_drm'] = True
2741 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2742
2743 def prepare_template(template_name, identifiers):
2744 tmpl = representation_ms_info[template_name]
2745 # First off, % characters outside $...$ templates
2746 # must be escaped by doubling for proper processing
2747 # by the % operator string formatting used further on (see
2748 # https://github.com/ytdl-org/youtube-dl/issues/16867).
2749 t = ''
2750 in_template = False
2751 for c in tmpl:
2752 t += c
2753 if c == '$':
2754 in_template = not in_template
2755 elif c == '%' and not in_template:
2756 t += c
2757 # Next, $...$ templates are translated to their
2758 # %(...) counterparts to be used with % operator
2759 if representation_id is not None:
2760 t = t.replace('$RepresentationID$', representation_id)
2761 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2762 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2763 t = t.replace('$$', '$')
2764 return t
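# Illustrative example: with representation_id 'video=1', the template
# '$RepresentationID$/seg-$Number%05d$.m4s' becomes
# 'video=1/seg-%(Number)05d.m4s', which, applied with % {'Number': 3},
# expands to 'video=1/seg-00003.m4s'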
2765
2766 # @initialization is a regular template like @media one
2767 # so it should be handled just the same way (see
2768 # https://github.com/ytdl-org/youtube-dl/issues/11605)
2769 if 'initialization' in representation_ms_info:
2770 initialization_template = prepare_template(
2771 'initialization',
2772 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2773 # $Time$ shall not be included for @initialization thus
2774 # only $Bandwidth$ remains
2775 ('Bandwidth', ))
2776 representation_ms_info['initialization_url'] = initialization_template % {
2777 'Bandwidth': bandwidth,
2778 }
2779
2780 def location_key(location):
2781 return 'url' if re.match(r'^https?://', location) else 'path'
2782
2783 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2784
2785 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2786 media_location_key = location_key(media_template)
2787
2788 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2789 # can't be used at the same time
2790 if '%(Number' in media_template and 's' not in representation_ms_info:
2791 segment_duration = None
2792 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2793 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2794 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2795 representation_ms_info['fragments'] = [{
2796 media_location_key: media_template % {
2797 'Number': segment_number,
2798 'Bandwidth': bandwidth,
2799 },
2800 'duration': segment_duration,
2801 } for segment_number in range(
2802 representation_ms_info['start_number'],
2803 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2804 else:
2805 # $Number*$ or $Time$ in media template with S list available
2806 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2807 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2808 representation_ms_info['fragments'] = []
2809 segment_time = 0
2810 segment_d = None
2811 segment_number = representation_ms_info['start_number']
2812
2813 def add_segment_url():
2814 segment_url = media_template % {
2815 'Time': segment_time,
2816 'Bandwidth': bandwidth,
2817 'Number': segment_number,
2818 }
2819 representation_ms_info['fragments'].append({
2820 media_location_key: segment_url,
2821 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2822 })
2823
2824 for num, s in enumerate(representation_ms_info['s']):
2825 segment_time = s.get('t') or segment_time
2826 segment_d = s['d']
2827 add_segment_url()
2828 segment_number += 1
2829 for r in range(s.get('r', 0)):
2830 segment_time += segment_d
2831 add_segment_url()
2832 segment_number += 1
2833 segment_time += segment_d
2834 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2835 # No media template
2836 # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2837 # or any YouTube dashsegments video
2838 fragments = []
2839 segment_index = 0
2840 timescale = representation_ms_info['timescale']
2841 for s in representation_ms_info['s']:
2842 duration = float_or_none(s['d'], timescale)
2843 for r in range(s.get('r', 0) + 1):
2844 segment_uri = representation_ms_info['segment_urls'][segment_index]
2845 fragments.append({
2846 location_key(segment_uri): segment_uri,
2847 'duration': duration,
2848 })
2849 segment_index += 1
2850 representation_ms_info['fragments'] = fragments
2851 elif 'segment_urls' in representation_ms_info:
2852 # Segment URLs with no SegmentTimeline
2853 # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2854 # https://github.com/ytdl-org/youtube-dl/pull/14844
2855 fragments = []
2856 segment_duration = float_or_none(
2857 representation_ms_info['segment_duration'],
2858 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2859 for segment_url in representation_ms_info['segment_urls']:
2860 fragment = {
2861 location_key(segment_url): segment_url,
2862 }
2863 if segment_duration:
2864 fragment['duration'] = segment_duration
2865 fragments.append(fragment)
2866 representation_ms_info['fragments'] = fragments
2867 # If a 'fragments' key is available, then we correctly recognized fragmented media.
2868 # Otherwise we will assume unfragmented media with direct access. Technically, such
2869 # an assumption is not necessarily correct, since we may simply have no support for
2870 # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2871 if 'fragments' in representation_ms_info:
2872 f.update({
2873 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2874 'url': mpd_url or base_url,
2875 'fragment_base_url': base_url,
2876 'fragments': [],
2877 'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2878 })
2879 if 'initialization_url' in representation_ms_info:
2880 initialization_url = representation_ms_info['initialization_url']
2881 if not f.get('url'):
2882 f['url'] = initialization_url
2883 f['fragments'].append({location_key(initialization_url): initialization_url})
2884 f['fragments'].extend(representation_ms_info['fragments'])
2885 else:
2886 # Assuming direct URL to unfragmented media.
2887 f['url'] = base_url
2888 if content_type in ('video', 'audio') or mime_type == 'image/jpeg':
2889 formats.append(f)
2890 elif content_type == 'text':
2891 subtitles.setdefault(lang or 'und', []).append(f)
2892
2893 return formats, subtitles
2894
2895 def _extract_ism_formats(self, *args, **kwargs):
2896 fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2897 if subs:
2898 self._report_ignoring_subs('ISM')
2899 return fmts
2900
2901 def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2902 res = self._download_xml_handle(
2903 ism_url, video_id,
2904 note='Downloading ISM manifest' if note is None else note,
2905 errnote='Failed to download ISM manifest' if errnote is None else errnote,
2906 fatal=fatal, data=data, headers=headers, query=query)
2907 if res is False:
2908 return [], {}
2909 ism_doc, urlh = res
2910 if ism_doc is None:
2911 return [], {}
2912
2913 return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
2914
2915 def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
2916 """
2917 Parse formats from ISM manifest.
2918 References:
2919 1. [MS-SSTR]: Smooth Streaming Protocol,
2920 https://msdn.microsoft.com/en-us/library/ff469518.aspx
2921 """
2922 if ism_doc.get('IsLive') == 'TRUE':
2923 return [], {}
2924
2925 duration = int(ism_doc.attrib['Duration'])
2926 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2927
2928 formats = []
2929 subtitles = {}
2930 for stream in ism_doc.findall('StreamIndex'):
2931 stream_type = stream.get('Type')
2932 if stream_type not in ('video', 'audio', 'text'):
2933 continue
2934 url_pattern = stream.attrib['Url']
2935 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2936 stream_name = stream.get('Name')
2937 stream_language = stream.get('Language', 'und')
2938 for track in stream.findall('QualityLevel'):
2939 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
2940 # TODO: add support for WVC1 and WMAP
2941 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
2942 self.report_warning('%s is not a supported codec' % fourcc)
2943 continue
2944 tbr = int(track.attrib['Bitrate']) // 1000
2945 # [1] does not mention Width and Height attributes. However,
2946 # they're often present while MaxWidth and MaxHeight are
2947 # missing, so they should be used as fallbacks
2948 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2949 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2950 sampling_rate = int_or_none(track.get('SamplingRate'))
2951
2952 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2953 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2954
2955 fragments = []
2956 fragment_ctx = {
2957 'time': 0,
2958 }
2959 stream_fragments = stream.findall('c')
2960 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2961 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2962 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2963 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2964 if not fragment_ctx['duration']:
2965 try:
2966 next_fragment_time = int(stream_fragments[stream_fragment_index + 1].attrib['t'])
2967 except IndexError:
2968 next_fragment_time = duration
2969 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2970 for _ in range(fragment_repeat):
2971 fragments.append({
2972 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2973 'duration': fragment_ctx['duration'] / stream_timescale,
2974 })
2975 fragment_ctx['time'] += fragment_ctx['duration']
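# Sketch of the <c> (chunk) handling above: '<c t="0" d="20000000" r="2"/>'
# with stream_timescale 10000000 expands to two 2-second fragments at
# times 0 and 20000000 (here 'r' is read as the total chunk count)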
2976
2977 if stream_type == 'text':
2978 subtitles.setdefault(stream_language, []).append({
2979 'ext': 'ismt',
2980 'protocol': 'ism',
2981 'url': ism_url,
2982 'manifest_url': ism_url,
2983 'fragments': fragments,
2984 '_download_params': {
2985 'stream_type': stream_type,
2986 'duration': duration,
2987 'timescale': stream_timescale,
2988 'fourcc': fourcc,
2989 'language': stream_language,
2990 'codec_private_data': track.get('CodecPrivateData'),
2991 }
2992 })
2993 elif stream_type in ('video', 'audio'):
2994 formats.append({
2995 'format_id': join_nonempty(ism_id, stream_name, tbr),
2996 'url': ism_url,
2997 'manifest_url': ism_url,
2998 'ext': 'ismv' if stream_type == 'video' else 'isma',
2999 'width': width,
3000 'height': height,
3001 'tbr': tbr,
3002 'asr': sampling_rate,
3003 'vcodec': 'none' if stream_type == 'audio' else fourcc,
3004 'acodec': 'none' if stream_type == 'video' else fourcc,
3005 'protocol': 'ism',
3006 'fragments': fragments,
3007 'has_drm': ism_doc.find('Protection') is not None,
3008 '_download_params': {
3009 'stream_type': stream_type,
3010 'duration': duration,
3011 'timescale': stream_timescale,
3012 'width': width or 0,
3013 'height': height or 0,
3014 'fourcc': fourcc,
3015 'language': stream_language,
3016 'codec_private_data': track.get('CodecPrivateData'),
3017 'sampling_rate': sampling_rate,
3018 'channels': int_or_none(track.get('Channels', 2)),
3019 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3020 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3021 },
3022 })
3023 return formats, subtitles
3024
3025 def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
3026 def absolute_url(item_url):
3027 return urljoin(base_url, item_url)
3028
3029 def parse_content_type(content_type):
3030 if not content_type:
3031 return {}
3032 ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3033 if ctr:
3034 mimetype, codecs = ctr.groups()
3035 f = parse_codecs(codecs)
3036 f['ext'] = mimetype2ext(mimetype)
3037 return f
3038 return {}
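# Rough sketch of parse_content_type on a typical <source type=...> value:
#   parse_content_type('video/mp4; codecs="avc1.42E01E, mp4a.40.2"')
# should yield something like
#   {'vcodec': 'avc1.42E01E', 'acodec': 'mp4a.40.2', 'ext': 'mp4'}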
3039
3040 def _media_formats(src, cur_media_type, type_info={}):
3041 full_url = absolute_url(src)
3042 ext = type_info.get('ext') or determine_ext(full_url)
3043 if ext == 'm3u8':
3044 is_plain_url = False
3045 formats = self._extract_m3u8_formats(
3046 full_url, video_id, ext='mp4',
3047 entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3048 preference=preference, quality=quality, fatal=False)
3049 elif ext == 'mpd':
3050 is_plain_url = False
3051 formats = self._extract_mpd_formats(
3052 full_url, video_id, mpd_id=mpd_id, fatal=False)
3053 else:
3054 is_plain_url = True
3055 formats = [{
3056 'url': full_url,
3057 'vcodec': 'none' if cur_media_type == 'audio' else None,
3058 }]
3059 return is_plain_url, formats
3060
3061 entries = []
3062 # amp-video and amp-audio are very similar to their HTML5 counterparts
3063 # so we will include them right here (see
3064 # https://www.ampproject.org/docs/reference/components/amp-video)
3065 # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3066 _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3067 media_tags = [(media_tag, media_tag_name, media_type, '')
3068 for media_tag, media_tag_name, media_type
3069 in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3070 media_tags.extend(re.findall(
3071 # We only allow video|audio followed by a whitespace or '>'.
3072 # Allowing more characters may result in a significant slowdown (see
3073 # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
3074 # http://www.porntrex.com/maps/videositemap.xml).
3075 r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
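# e.g. both a self-closing '<video src="a.mp4"/>' and a paired
# '<amp-video ...>...</amp-video>' are picked up here; media_content
# is the empty string for the self-closing form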
3076 for media_tag, _, media_type, media_content in media_tags:
3077 media_info = {
3078 'formats': [],
3079 'subtitles': {},
3080 }
3081 media_attributes = extract_attributes(media_tag)
3082 src = strip_or_none(media_attributes.get('src'))
3083 if src:
3084 _, formats = _media_formats(src, media_type)
3085 media_info['formats'].extend(formats)
3086 media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3087 if media_content:
3088 for source_tag in re.findall(r'<source[^>]+>', media_content):
3089 s_attr = extract_attributes(source_tag)
3090 # data-video-src and data-src are non-standard but seen
3091 # several times in the wild
3092 src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
3093 if not src:
3094 continue
3095 f = parse_content_type(s_attr.get('type'))
3096 is_plain_url, formats = _media_formats(src, media_type, f)
3097 if is_plain_url:
3098 # width, height, res, label and title attributes are
3099 # all non-standard but seen several times in the wild
3100 labels = [
3101 s_attr.get(lbl)
3102 for lbl in ('label', 'title')
3103 if str_or_none(s_attr.get(lbl))
3104 ]
3105 width = int_or_none(s_attr.get('width'))
3106 height = (int_or_none(s_attr.get('height'))
3107 or int_or_none(s_attr.get('res')))
3108 if not width or not height:
3109 for lbl in labels:
3110 resolution = parse_resolution(lbl)
3111 if not resolution:
3112 continue
3113 width = width or resolution.get('width')
3114 height = height or resolution.get('height')
3115 for lbl in labels:
3116 tbr = parse_bitrate(lbl)
3117 if tbr:
3118 break
3119 else:
3120 tbr = None
3121 f.update({
3122 'width': width,
3123 'height': height,
3124 'tbr': tbr,
3125 'format_id': s_attr.get('label') or s_attr.get('title'),
3126 })
3127 f.update(formats[0])
3128 media_info['formats'].append(f)
3129 else:
3130 media_info['formats'].extend(formats)
3131 for track_tag in re.findall(r'<track[^>]+>', media_content):
3132 track_attributes = extract_attributes(track_tag)
3133 kind = track_attributes.get('kind')
3134 if not kind or kind in ('subtitles', 'captions'):
3135 src = strip_or_none(track_attributes.get('src'))
3136 if not src:
3137 continue
3138 lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3139 media_info['subtitles'].setdefault(lang, []).append({
3140 'url': absolute_url(src),
3141 })
3142 for f in media_info['formats']:
3143 f.setdefault('http_headers', {})['Referer'] = base_url
3144 if media_info['formats'] or media_info['subtitles']:
3145 entries.append(media_info)
3146 return entries
3147
3148 def _extract_akamai_formats(self, *args, **kwargs):
3149 fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3150 if subs:
3151 self._report_ignoring_subs('akamai')
3152 return fmts
3153
3154 def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3155 signed = 'hdnea=' in manifest_url
3156 if not signed:
3157 # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3158 manifest_url = re.sub(
3159 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3160 '', manifest_url).strip('?')
3161
3162 formats = []
3163 subtitles = {}
3164
3165 hdcore_sign = 'hdcore=3.7.0'
3166 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3167 hds_host = hosts.get('hds')
3168 if hds_host:
3169 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3170 if 'hdcore=' not in f4m_url:
3171 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
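# Sketch of the HDS URL derivation above for a hypothetical Akamai URL:
#   'https://host/i/video/,360,720,.mp4.csmil/master.m3u8'
# becomes
#   'https://host/z/video/,360,720,.mp4.csmil/manifest.f4m?hdcore=3.7.0'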
3172 f4m_formats = self._extract_f4m_formats(
3173 f4m_url, video_id, f4m_id='hds', fatal=False)
3174 for entry in f4m_formats:
3175 entry.update({'extra_param_to_segment_url': hdcore_sign})
3176 formats.extend(f4m_formats)
3177
3178 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3179 hls_host = hosts.get('hls')
3180 if hls_host:
3181 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3182 m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3183 m3u8_url, video_id, 'mp4', 'm3u8_native',
3184 m3u8_id='hls', fatal=False)
3185 formats.extend(m3u8_formats)
3186 subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3187
3188 http_host = hosts.get('http')
3189 if http_host and m3u8_formats and not signed:
3190 REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3191 qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3192 qualities_length = len(qualities)
3193 if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3194 i = 0
3195 for f in m3u8_formats:
3196 if f['vcodec'] != 'none':
3197 for protocol in ('http', 'https'):
3198 http_f = f.copy()
3199 del http_f['manifest_url']
3200 http_url = re.sub(
3201 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
3202 http_f.update({
3203 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3204 'url': http_url,
3205 'protocol': protocol,
3206 })
3207 formats.append(http_f)
3208 i += 1
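# Sketch of the rewrite above: for the hypothetical m3u8_url
# 'https://host/i/video/,360,720,.mp4.csmil/master.m3u8', qualities is
# ['360', '720'] and each video HLS format is duplicated into direct
# 'http(s)://<http_host>/video/360.mp4'-style URLs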
3209
3210 return formats, subtitles
3211
3212 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3213 query = compat_urlparse.urlparse(url).query
3214 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3215 mobj = re.search(
3216 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3217 url_base = mobj.group('url')
3218 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3219 formats = []
3220
3221 def manifest_url(manifest):
3222 m_url = '%s/%s' % (http_base_url, manifest)
3223 if query:
3224 m_url += '?%s' % query
3225 return m_url
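# e.g. with url_base '//example.com/vod/mp4:sample.mp4' and query 'token=abc',
# manifest_url('playlist.m3u8') gives
# 'http://example.com/vod/mp4:sample.mp4/playlist.m3u8?token=abc'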
3226
3227 if 'm3u8' not in skip_protocols:
3228 formats.extend(self._extract_m3u8_formats(
3229 manifest_url('playlist.m3u8'), video_id, 'mp4',
3230 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3231 if 'f4m' not in skip_protocols:
3232 formats.extend(self._extract_f4m_formats(
3233 manifest_url('manifest.f4m'),
3234 video_id, f4m_id='hds', fatal=False))
3235 if 'dash' not in skip_protocols:
3236 formats.extend(self._extract_mpd_formats(
3237 manifest_url('manifest.mpd'),
3238 video_id, mpd_id='dash', fatal=False))
3239 if re.search(r'(?:/smil:|\.smil)', url_base):
3240 if 'smil' not in skip_protocols:
3241 rtmp_formats = self._extract_smil_formats(
3242 manifest_url('jwplayer.smil'),
3243 video_id, fatal=False)
3244 for rtmp_format in rtmp_formats:
3245 rtsp_format = rtmp_format.copy()
3246 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3247 del rtsp_format['play_path']
3248 del rtsp_format['ext']
3249 rtsp_format.update({
3250 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3251 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3252 'protocol': 'rtsp',
3253 })
3254 formats.extend([rtmp_format, rtsp_format])
3255 else:
3256 for protocol in ('rtmp', 'rtsp'):
3257 if protocol not in skip_protocols:
3258 formats.append({
3259 'url': '%s:%s' % (protocol, url_base),
3260 'format_id': protocol,
3261 'protocol': protocol,
3262 })
3263 return formats
3264
3265 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
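# The pattern below is meant to match inline player setups such as
#   jwplayer("myplayer").setup({"file": "https://example.com/video.mp4"})
# (player id and options object are illustrative)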
3266 mobj = re.search(
3267 r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3268 webpage)
3269 if mobj:
3270 try:
3271 jwplayer_data = self._parse_json(mobj.group('options'),
3272 video_id=video_id,
3273 transform_source=transform_source)
3274 except ExtractorError:
3275 pass
3276 else:
3277 if isinstance(jwplayer_data, dict):
3278 return jwplayer_data
3279
3280 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3281 jwplayer_data = self._find_jwplayer_data(
3282 webpage, video_id, transform_source=js_to_json)
3283 return self._parse_jwplayer_data(
3284 jwplayer_data, video_id, *args, **kwargs)
3285
3286 def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3287 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3288 # JWPlayer backward compatibility: flattened playlists
3289 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3290 if 'playlist' not in jwplayer_data:
3291 jwplayer_data = {'playlist': [jwplayer_data]}
3292
3293 entries = []
3294
3295 # JWPlayer backward compatibility: single playlist item
3296 # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3297 if not isinstance(jwplayer_data['playlist'], list):
3298 jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3299
3300 for video_data in jwplayer_data['playlist']:
3301 # JWPlayer backward compatibility: flattened sources
3302 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3303 if 'sources' not in video_data:
3304 video_data['sources'] = [video_data]
3305
3306 this_video_id = video_id or video_data['mediaid']
3307
3308 formats = self._parse_jwplayer_formats(
3309 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3310 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3311
3312 subtitles = {}
3313 tracks = video_data.get('tracks')
3314 if tracks and isinstance(tracks, list):
3315 for track in tracks:
3316 if not isinstance(track, dict):
3317 continue
3318 track_kind = track.get('kind')
3319 if not track_kind or not isinstance(track_kind, compat_str):
3320 continue
3321 if track_kind.lower() not in ('captions', 'subtitles'):
3322 continue
3323 track_url = urljoin(base_url, track.get('file'))
3324 if not track_url:
3325 continue
3326 subtitles.setdefault(track.get('label') or 'en', []).append({
3327 'url': self._proto_relative_url(track_url)
3328 })
3329
3330 entry = {
3331 'id': this_video_id,
3332 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3333 'description': clean_html(video_data.get('description')),
3334 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3335 'timestamp': int_or_none(video_data.get('pubdate')),
3336 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3337 'subtitles': subtitles,
3338 }
3339 # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3340 if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3341 entry.update({
3342 '_type': 'url_transparent',
3343 'url': formats[0]['url'],
3344 })
3345 else:
3346 self._sort_formats(formats)
3347 entry['formats'] = formats
3348 entries.append(entry)
3349 if len(entries) == 1:
3350 return entries[0]
3351 else:
3352 return self.playlist_result(entries)
3353
3354 def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3355 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3356 urls = []
3357 formats = []
3358 for source in jwplayer_sources_data:
3359 if not isinstance(source, dict):
3360 continue
3361 source_url = urljoin(
3362 base_url, self._proto_relative_url(source.get('file')))
3363 if not source_url or source_url in urls:
3364 continue
3365 urls.append(source_url)
3366 source_type = source.get('type') or ''
3367 ext = mimetype2ext(source_type) or determine_ext(source_url)
3368 if source_type == 'hls' or ext == 'm3u8':
3369 formats.extend(self._extract_m3u8_formats(
3370 source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3371 m3u8_id=m3u8_id, fatal=False))
3372 elif source_type == 'dash' or ext == 'mpd':
3373 formats.extend(self._extract_mpd_formats(
3374 source_url, video_id, mpd_id=mpd_id, fatal=False))
3375 elif ext == 'smil':
3376 formats.extend(self._extract_smil_formats(
3377 source_url, video_id, fatal=False))
3378 # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3379 elif source_type.startswith('audio') or ext in (
3380 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3381 formats.append({
3382 'url': source_url,
3383 'vcodec': 'none',
3384 'ext': ext,
3385 })
3386 else:
3387 height = int_or_none(source.get('height'))
3388 if height is None:
3389 # Often no height is provided, but there is a label in
3390 # a format like "1080p", "720p SD", or 1080.
3391 height = int_or_none(self._search_regex(
3392 r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3393 'height', default=None))
3394 a_format = {
3395 'url': source_url,
3396 'width': int_or_none(source.get('width')),
3397 'height': height,
3398 'tbr': int_or_none(source.get('bitrate')),
3399 'ext': ext,
3400 }
3401 if source_url.startswith('rtmp'):
3402 a_format['ext'] = 'flv'
3403 # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3404 # of jwplayer.flash.swf
3405 rtmp_url_parts = re.split(
3406 r'((?:mp4|mp3|flv):)', source_url, 1)
3407 if len(rtmp_url_parts) == 3:
3408 rtmp_url, prefix, play_path = rtmp_url_parts
3409 a_format.update({
3410 'url': rtmp_url,
3411 'play_path': prefix + play_path,
3412 })
3413 if rtmp_params:
3414 a_format.update(rtmp_params)
3415 formats.append(a_format)
3416 return formats
3417
3418 def _live_title(self, name):
3419 """ Generate the title for a live video """
3420 now = datetime.datetime.now()
3421 now_str = now.strftime('%Y-%m-%d %H:%M')
3422 return name + ' ' + now_str
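# e.g. _live_title('Channel stream') -> 'Channel stream 2021-11-20 12:34'
# (for a hypothetical current local time)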
3423
3424 def _int(self, v, name, fatal=False, **kwargs):
3425 res = int_or_none(v, **kwargs)
3428 if res is None:
3429 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3430 if fatal:
3431 raise ExtractorError(msg)
3432 else:
3433 self.report_warning(msg)
3434 return res
3435
3436 def _float(self, v, name, fatal=False, **kwargs):
3437 res = float_or_none(v, **kwargs)
3438 if res is None:
3439 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3440 if fatal:
3441 raise ExtractorError(msg)
3442 else:
3443 self.report_warning(msg)
3444 return res
3445
3446 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3447 path='/', secure=False, discard=False, rest={}, **kwargs):
3448 cookie = compat_cookiejar_Cookie(
3449 0, name, value, port, port is not None, domain, True,
3450 domain.startswith('.'), path, True, secure, expire_time,
3451 discard, None, None, rest)
3452 self._downloader.cookiejar.set_cookie(cookie)
3453
3454 def _get_cookies(self, url):
3455 """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3456 req = sanitized_Request(url)
3457 self._downloader.cookiejar.add_cookie_header(req)
3458 return compat_cookies_SimpleCookie(req.get_header('Cookie'))
3459
3460 def _apply_first_set_cookie_header(self, url_handle, cookie):
3461 """
3462 Apply first Set-Cookie header instead of the last. Experimental.
3463
3464 Some sites (e.g. [1-3]) may serve two cookies under the same name
3465 in the Set-Cookie header and expect the first (old) one to be set rather
3466 than the second (new) one. However, per RFC 6265 the newer cookie
3467 takes precedence in the cookie store, which is what actually happens.
3468 We work around this issue by manually resetting the cookie to
3469 the first one.
3470 1. https://new.vk.com/
3471 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3472 3. https://learning.oreilly.com/
3473 """
3474 for header, cookies in url_handle.headers.items():
3475 if header.lower() != 'set-cookie':
3476 continue
3477 if sys.version_info[0] >= 3:
3478 cookies = cookies.encode('iso-8859-1')
3479 cookies = cookies.decode('utf-8')
3480 cookie_value = re.search(
3481 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3482 if cookie_value:
3483 value, domain = cookie_value.groups()
3484 self._set_cookie(domain, cookie, value)
3485 break
3486
3487 def get_testcases(self, include_onlymatching=False):
3488 t = getattr(self, '_TEST', None)
3489 if t:
3490 assert not hasattr(self, '_TESTS'), \
3491 '%s has _TEST and _TESTS' % type(self).__name__
3492 tests = [t]
3493 else:
3494 tests = getattr(self, '_TESTS', [])
3495 for t in tests:
3496 if not include_onlymatching and t.get('only_matching', False):
3497 continue
3498 t['name'] = type(self).__name__[:-len('IE')]
3499 yield t
3500
3501 def is_suitable(self, age_limit):
3502 """ Test whether the extractor is generally suitable for the given
3503 age limit (i.e. pornographic sites are not, all others usually are) """
3504
3505 any_restricted = False
3506 for tc in self.get_testcases(include_onlymatching=False):
3507 if tc.get('playlist', []):
3508 tc = tc['playlist'][0]
3509 is_restricted = age_restricted(
3510 tc.get('info_dict', {}).get('age_limit'), age_limit)
3511 if not is_restricted:
3512 return True
3513 any_restricted = any_restricted or is_restricted
3514 return not any_restricted
3515
3516 def extract_subtitles(self, *args, **kwargs):
3517 if (self.get_param('writesubtitles', False)
3518 or self.get_param('listsubtitles')):
3519 return self._get_subtitles(*args, **kwargs)
3520 return {}
3521
3522 def _get_subtitles(self, *args, **kwargs):
3523 raise NotImplementedError('This method must be implemented by subclasses')
3524
3525 def extract_comments(self, *args, **kwargs):
3526 if not self.get_param('getcomments'):
3527 return None
3528 generator = self._get_comments(*args, **kwargs)
3529
3530 def extractor():
3531 comments = []
3532 try:
3533 while True:
3534 comments.append(next(generator))
3535 except KeyboardInterrupt:
3536 interrupted = True
3537 self.to_screen('Interrupted by user')
3538 except StopIteration:
3539 interrupted = False
3540 comment_count = len(comments)
3541 self.to_screen(f'Extracted {comment_count} comments')
3542 return {
3543 'comments': comments,
3544 'comment_count': None if interrupted else comment_count
3545 }
3546 return extractor
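# NB: the returned callable is lazy - extractors can store it in the info
# dict so that comments are only fetched when actually needed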
3547
3548 def _get_comments(self, *args, **kwargs):
3549 raise NotImplementedError('This method must be implemented by subclasses')
3550
3551 @staticmethod
3552 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3553 """ Merge subtitle items for one language. Items with duplicated URLs
3554 will be dropped. """
3555 list1_urls = {item['url'] for item in subtitle_list1}
3556 ret = list(subtitle_list1)
3557 ret.extend(item for item in subtitle_list2 if item['url'] not in list1_urls)
3558 return ret
3559
3560 @classmethod
3561 def _merge_subtitles(cls, *dicts, target=None):
3562 """ Merge subtitle dictionaries, language by language. """
3563 if target is None:
3564 target = {}
3565 for d in dicts:
3566 for lang, subs in d.items():
3567 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3568 return target
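# e.g. _merge_subtitles({'en': [{'url': 'a'}]}, {'en': [{'url': 'b'}], 'de': [{'url': 'c'}]})
# == {'en': [{'url': 'a'}, {'url': 'b'}], 'de': [{'url': 'c'}]}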
3569
3570 def extract_automatic_captions(self, *args, **kwargs):
3571 if (self.get_param('writeautomaticsub', False)
3572 or self.get_param('listsubtitles')):
3573 return self._get_automatic_captions(*args, **kwargs)
3574 return {}
3575
3576 def _get_automatic_captions(self, *args, **kwargs):
3577 raise NotImplementedError('This method must be implemented by subclasses')
3578
3579 def mark_watched(self, *args, **kwargs):
3580 if not self.get_param('mark_watched', False):
3581 return
3582 if (self._get_login_info()[0] is not None
3583 or self.get_param('cookiefile')
3584 or self.get_param('cookiesfrombrowser')):
3585 self._mark_watched(*args, **kwargs)
3586
3587 def _mark_watched(self, *args, **kwargs):
3588 raise NotImplementedError('This method must be implemented by subclasses')
3589
3590 def geo_verification_headers(self):
3591 headers = {}
3592 geo_verification_proxy = self.get_param('geo_verification_proxy')
3593 if geo_verification_proxy:
3594 headers['Ytdl-request-proxy'] = geo_verification_proxy
3595 return headers
3596
3597 def _generic_id(self, url):
3598 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3599
3600 def _generic_title(self, url):
3601 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3602
3603 @staticmethod
3604 def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3605 all_known = all(map(
3606 lambda x: x is not None,
3607 (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3608 return (
3609 'private' if is_private
3610 else 'premium_only' if needs_premium
3611 else 'subscriber_only' if needs_subscription
3612 else 'needs_auth' if needs_auth
3613 else 'unlisted' if is_unlisted
3614 else 'public' if all_known
3615 else None)
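# Sketch of the precedence above:
#   _availability(is_private=True) == 'private' (most restrictive wins)
#   _availability(False, False, False, False, False) == 'public'
#   _availability() == None (not all flags known)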
3616
3617 def _configuration_arg(self, key, default=NO_DEFAULT, casesense=False):
3618 '''
3619 @returns A list of values for the extractor argument given by "key"
3620 or "default" if no such key is present
3621 @param default The default value to return when the key is not present (default: [])
3622 @param casesense When false, the values are converted to lower case
3623 '''
3624 val = traverse_obj(
3625 self._downloader.params, ('extractor_args', self.ie_key().lower(), key))
3626 if val is None:
3627 return [] if default is NO_DEFAULT else default
3628 return list(val) if casesense else [x.lower() for x in val]
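# e.g. with '--extractor-args "youtube:player_client=android"', a YouTube
# extractor calling self._configuration_arg('player_client') should get
# ['android'] (illustrative key and value)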
3629
3630
3631 class SearchInfoExtractor(InfoExtractor):
3632 """
3633 Base class for paged search queries extractors.
3634 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
3635 Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
3636 """
3637
3638 _MAX_RESULTS = float('inf')
3639
3640 @classmethod
3641 def _make_valid_url(cls):
3642 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
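# e.g. with _SEARCH_KEY = 'ytsearch' this matches 'ytsearch:cats',
# 'ytsearch5:cats' and 'ytsearchall:cats'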
3643
3644 @classmethod
3645 def suitable(cls, url):
3646 return re.match(cls._make_valid_url(), url) is not None
3647
3648 def _real_extract(self, query):
3649 mobj = re.match(self._make_valid_url(), query)
3650 if mobj is None:
3651 raise ExtractorError('Invalid search query "%s"' % query)
3652
3653 prefix = mobj.group('prefix')
3654 query = mobj.group('query')
3655 if prefix == '':
3656 return self._get_n_results(query, 1)
3657 elif prefix == 'all':
3658 return self._get_n_results(query, self._MAX_RESULTS)
3659 else:
3660 n = int(prefix)
3661 if n <= 0:
3662 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
3663 elif n > self._MAX_RESULTS:
3664 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3665 n = self._MAX_RESULTS
3666 return self._get_n_results(query, n)
3667
3668 def _get_n_results(self, query, n):
3669 """Get a specified number of results for a query.
3670 Either this function or _search_results must be overridden by subclasses """
3671 return self.playlist_result(
3672 itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
3673 query, query)
3674
3675 def _search_results(self, query):
3676 """Returns an iterator of search results"""
3677 raise NotImplementedError('This method must be implemented by subclasses')
3678
3679 @property
3680 def SEARCH_KEY(self):
3681 return self._SEARCH_KEY