yt_dlp / extractor / common.py
[extractor] Extract `average_rating` from JSON-LD
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import base64
5 import collections
6 import datetime
7 import hashlib
8 import itertools
9 import json
10 import netrc
11 import os
12 import random
13 import re
14 import sys
15 import time
16 import math
17
18 from ..compat import (
19 compat_cookiejar_Cookie,
20 compat_cookies_SimpleCookie,
21 compat_etree_Element,
22 compat_etree_fromstring,
23 compat_expanduser,
24 compat_getpass,
25 compat_http_client,
26 compat_os_name,
27 compat_str,
28 compat_urllib_error,
29 compat_urllib_parse_unquote,
30 compat_urllib_parse_urlencode,
31 compat_urllib_request,
32 compat_urlparse,
33 compat_xml_parse_error,
34 )
35 from ..downloader import FileDownloader
36 from ..downloader.f4m import (
37 get_base_url,
38 remove_encrypted_media,
39 )
40 from ..utils import (
41 age_restricted,
42 base_url,
43 bug_reports_message,
44 clean_html,
45 compiled_regex_type,
46 determine_ext,
47 determine_protocol,
48 dict_get,
49 error_to_compat_str,
50 extract_attributes,
51 ExtractorError,
52 fix_xml_ampersands,
53 float_or_none,
54 format_field,
55 GeoRestrictedError,
56 GeoUtils,
57 int_or_none,
58 join_nonempty,
59 js_to_json,
60 JSON_LD_RE,
61 mimetype2ext,
62 network_exceptions,
63 NO_DEFAULT,
64 orderedSet,
65 parse_bitrate,
66 parse_codecs,
67 parse_duration,
68 parse_iso8601,
69 parse_m3u8_attributes,
70 parse_resolution,
71 RegexNotFoundError,
72 sanitize_filename,
73 sanitized_Request,
74 str_or_none,
75 str_to_int,
76 strip_or_none,
77 traverse_obj,
78 unescapeHTML,
79 UnsupportedError,
80 unified_strdate,
81 unified_timestamp,
82 update_Request,
83 update_url_query,
84 url_basename,
85 url_or_none,
86 urljoin,
87 variadic,
88 xpath_element,
89 xpath_text,
90 xpath_with_ns,
91 )
92
93
94 class InfoExtractor(object):
95 """Information Extractor class.
96
97 Information extractors are the classes that, given a URL, extract
98 information about the video (or videos) the URL refers to. This
99 information includes the real video URL, the video title, author and
100 others. The information is stored in a dictionary which is then
101 passed to the YoutubeDL. The YoutubeDL processes this
102 information, possibly downloading the video to the file system, among
103 other possible outcomes.
104
105 The type field determines the type of the result.
106 By far the most common value (and the default if _type is missing) is
107 "video", which indicates a single video.
108
109 For a video, the dictionaries must include the following fields:
110
111 id: Video identifier.
112 title: Video title, unescaped.
113
114 Additionally, it must contain either a formats entry or a url one:
115
116 formats: A list of dictionaries for each format available, ordered
117 from worst to best quality.
118
119 Potential fields:
120 * url The mandatory URL representing the media:
121 for plain file media - HTTP URL of this file,
122 for RTMP - RTMP URL,
123 for HLS - URL of the M3U8 media playlist,
124 for HDS - URL of the F4M manifest,
125 for DASH
126 - HTTP URL to plain file media (in case of
127 unfragmented media)
128 - URL of the MPD manifest or base URL
129 representing the media if MPD manifest
130 is parsed from a string (in case of
131 fragmented media)
132 for MSS - URL of the ISM manifest.
133 * manifest_url
134 The URL of the manifest file in case of
135 fragmented media:
136 for HLS - URL of the M3U8 master playlist,
137 for HDS - URL of the F4M manifest,
138 for DASH - URL of the MPD manifest,
139 for MSS - URL of the ISM manifest.
140 * ext Will be calculated from URL if missing
141 * format A human-readable description of the format
142 ("mp4 container with h264/opus").
143 Calculated from the format_id, width, height,
144 and format_note fields if missing.
145 * format_id A short description of the format
146 ("mp4_h264_opus" or "19").
147 Technically optional, but strongly recommended.
148 * format_note Additional info about the format
149 ("3D" or "DASH video")
150 * width Width of the video, if known
151 * height Height of the video, if known
152 * resolution Textual description of width and height
153 * dynamic_range The dynamic range of the video. One of:
154 "SDR" (None), "HDR10", "HDR10+, "HDR12", "HLG, "DV"
155 * tbr Average bitrate of audio and video in KBit/s
156 * abr Average audio bitrate in KBit/s
157 * acodec Name of the audio codec in use
158 * asr Audio sampling rate in Hertz
159 * vbr Average video bitrate in KBit/s
160 * fps Frame rate
161 * vcodec Name of the video codec in use
162 * container Name of the container format
163 * filesize The number of bytes, if known in advance
164 * filesize_approx An estimate for the number of bytes
165 * player_url SWF Player URL (used for rtmpdump).
166 * protocol The protocol that will be used for the actual
167 download, lower-case.
168 "http", "https", "rtsp", "rtmp", "rtmp_ffmpeg", "rtmpe",
169 "m3u8", "m3u8_native" or "http_dash_segments".
170 * fragment_base_url
171 Base URL for fragments. Each fragment's path
172 value (if present) will be relative to
173 this URL.
174 * fragments A list of fragments of a fragmented media.
175 Each fragment entry must contain either a url
176 or a path. If a url is present, it should be
177 used by the client. Otherwise both path and
178 fragment_base_url must be present. Here is
179 the list of all potential fields:
180 * "url" - fragment's URL
181 * "path" - fragment's path relative to
182 fragment_base_url
183 * "duration" (optional, int or float)
184 * "filesize" (optional, int)
185 * preference Order number of this format. If this field is
186 present and not None, the formats get sorted
187 by this field, regardless of all other values.
188 -1 for default (order by other properties),
189 -2 or smaller for less than default.
190 < -1000 to hide the format (if there is
191 another one which is strictly better)
192 * language Language code, e.g. "de" or "en-US".
193 * language_preference Is this in the language mentioned in
194 the URL?
195 10 if it's what the URL is about,
196 -1 for default (don't know),
197 -10 otherwise, other values reserved for now.
198 * quality Order number of the video quality of this
199 format, irrespective of the file format.
200 -1 for default (order by other properties),
201 -2 or smaller for less than default.
202 * source_preference Order number for this video source
203 (quality takes higher priority)
204 -1 for default (order by other properties),
205 -2 or smaller for less than default.
206 * http_headers A dictionary of additional HTTP headers
207 to add to the request.
208 * stretched_ratio If given and not 1, indicates that the
209 video's pixels are not square.
210 width : height ratio as float.
211 * no_resume The server does not support resuming the
212 (HTTP or RTMP) download. Boolean.
213 * has_drm The format has DRM and cannot be downloaded. Boolean
214 * downloader_options A dictionary of downloader options as
215 described in FileDownloader
216 RTMP formats can also have the additional fields: page_url,
217 app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
218 rtmp_protocol, rtmp_real_time
219
220 url: Final video URL.
221 ext: Video filename extension.
222 format: The video format, defaults to ext (used for --get-format)
223 player_url: SWF Player URL (used for rtmpdump).
224
225 The following fields are optional:
226
227 alt_title: A secondary title of the video.
228 display_id: An alternative identifier for the video, not necessarily
229 unique, but available before title. Typically, id is
230 something like "4234987", title "Dancing naked mole rats",
231 and display_id "dancing-naked-mole-rats"
232 thumbnails: A list of dictionaries, with the following entries:
233 * "id" (optional, string) - Thumbnail format ID
234 * "url"
235 * "preference" (optional, int) - quality of the image
236 * "width" (optional, int)
237 * "height" (optional, int)
238 * "resolution" (optional, string "{width}x{height}",
239 deprecated)
240 * "filesize" (optional, int)
241 thumbnail: Full URL to a video thumbnail image.
242 description: Full video description.
243 uploader: Full name of the video uploader.
244 license: License name the video is licensed under.
245 creator: The creator of the video.
246 release_timestamp: UNIX timestamp of the moment the video was released.
247 release_date: The date (YYYYMMDD) when the video was released.
248 timestamp: UNIX timestamp of the moment the video was uploaded
249 upload_date: Video upload date (YYYYMMDD).
250 If not explicitly set, calculated from timestamp.
251 uploader_id: Nickname or id of the video uploader.
252 uploader_url: Full URL to a personal webpage of the video uploader.
253 channel: Full name of the channel the video is uploaded on.
254 Note that channel fields may or may not repeat uploader
255 fields. This depends on a particular extractor.
256 channel_id: Id of the channel.
257 channel_url: Full URL to a channel webpage.
258 location: Physical location where the video was filmed.
259 subtitles: The available subtitles as a dictionary in the format
260 {tag: subformats}. "tag" is usually a language code, and
261 "subformats" is a list sorted from lower to higher
262 preference, each element is a dictionary with the "ext"
263 entry and one of:
264 * "data": The subtitles file contents
265 * "url": A URL pointing to the subtitles file
266 It can optionally also have:
267 * "name": Name or description of the subtitles
268 "ext" will be calculated from URL if missing
269 automatic_captions: Like 'subtitles'; contains automatically generated
270 captions instead of normal subtitles
271 duration: Length of the video in seconds, as an integer or float.
272 view_count: How many users have watched the video on the platform.
273 like_count: Number of positive ratings of the video
274 dislike_count: Number of negative ratings of the video
275 repost_count: Number of reposts of the video
276 average_rating: Average rating given by users; the scale depends on the webpage
277 comment_count: Number of comments on the video
278 comments: A list of comments, each with one or more of the following
279 properties (all optional, except that at least one of "text" or "html" must be present):
280 * "author" - human-readable name of the comment author
281 * "author_id" - user ID of the comment author
282 * "author_thumbnail" - The thumbnail of the comment author
283 * "id" - Comment ID
284 * "html" - Comment as HTML
285 * "text" - Plain text of the comment
286 * "timestamp" - UNIX timestamp of comment
287 * "parent" - ID of the comment this one is replying to.
288 Set to "root" to indicate that this is a
289 comment to the original video.
290 * "like_count" - Number of positive ratings of the comment
291 * "dislike_count" - Number of negative ratings of the comment
292 * "is_favorited" - Whether the comment is marked as
293 favorite by the video uploader
294 * "author_is_uploader" - Whether the comment is made by
295 the video uploader
296 age_limit: Age restriction for the video, as an integer (years)
297 webpage_url: The URL to the video webpage, if given to yt-dlp it
298 should allow getting the same result again. (It will be set
299 by YoutubeDL if it's missing)
300 categories: A list of categories that the video falls in, for example
301 ["Sports", "Berlin"]
302 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
303 cast: A list of the video cast
304 is_live: True, False, or None (=unknown). Whether this video is a
305 live stream that goes on instead of a fixed-length video.
306 was_live: True, False, or None (=unknown). Whether this video was
307 originally a live stream.
308 live_status: 'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
309 If absent, automatically set from is_live, was_live
310 start_time: Time in seconds where the reproduction should start, as
311 specified in the URL.
312 end_time: Time in seconds where the reproduction should end, as
313 specified in the URL.
314 chapters: A list of dictionaries, with the following entries:
315 * "start_time" - The start time of the chapter in seconds
316 * "end_time" - The end time of the chapter in seconds
317 * "title" (optional, string)
318 playable_in_embed: Whether this video is allowed to play in embedded
319 players on other sites. Can be True (=always allowed),
320 False (=never allowed), None (=unknown), or a string
321 specifying the criteria for embeddability (e.g. 'whitelist')
322 availability: Under what condition the video is available. One of
323 'private', 'premium_only', 'subscriber_only', 'needs_auth',
324 'unlisted' or 'public'. Use 'InfoExtractor._availability'
325 to set it
326 __post_extractor: A function to be called just before the metadata is
327 written to either disk, logger or console. The function
328 must return a dict which will be added to the info_dict.
329 This is useful for additional information that is
330 time-consuming to extract. Note that the fields thus
331 extracted will not be available to output template and
332 match_filter. So, only "comments" and "comment_count" are
333 currently allowed to be extracted via this method.
334
335 The following fields should only be used when the video belongs to some logical
336 chapter or section:
337
338 chapter: Name or title of the chapter the video belongs to.
339 chapter_number: Number of the chapter the video belongs to, as an integer.
340 chapter_id: Id of the chapter the video belongs to, as a unicode string.
341
342 The following fields should only be used when the video is an episode of some
343 series, programme or podcast:
344
345 series: Title of the series or programme the video episode belongs to.
346 series_id: Id of the series or programme the video episode belongs to, as a unicode string.
347 season: Title of the season the video episode belongs to.
348 season_number: Number of the season the video episode belongs to, as an integer.
349 season_id: Id of the season the video episode belongs to, as a unicode string.
350 episode: Title of the video episode. Unlike mandatory video title field,
351 this field should denote the exact title of the video episode
352 without any kind of decoration.
353 episode_number: Number of the video episode within a season, as an integer.
354 episode_id: Id of the video episode, as a unicode string.
355
356 The following fields should only be used when the media is a track or a part of
357 a music album:
358
359 track: Title of the track.
360 track_number: Number of the track within an album or a disc, as an integer.
361 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
362 as a unicode string.
363 artist: Artist(s) of the track.
364 genre: Genre(s) of the track.
365 album: Title of the album the track belongs to.
366 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
367 album_artist: List of all artists appeared on the album (e.g.
368 "Ash Borer / Fell Voices" or "Various Artists", useful for splits
369 and compilations).
370 disc_number: Number of the disc or other physical medium the track belongs to,
371 as an integer.
372 release_year: Year (YYYY) when the album was released.
373
374 Unless mentioned otherwise, the fields should be Unicode strings.
375
376 Unless mentioned otherwise, None is equivalent to absence of information.
377
378
379 _type "playlist" indicates multiple videos.
380 There must be a key "entries", which is a list, an iterable, or a PagedList
381 object, each element of which is a valid dictionary by this specification.
382
383 Additionally, playlists can have "id", "title", and any other relevant
384 attributes with the same semantics as videos (see above).
385
386
387 _type "multi_video" indicates that there are multiple videos that
388 form a single show, for example, multiple acts of an opera or TV episode.
389 It must have an entries key like a playlist and contain all the keys
390 required for a video at the same time.
391
392
393 _type "url" indicates that the video must be extracted from another
394 location, possibly by a different extractor. Its only required key is:
395 "url" - the next URL to extract.
396 The key "ie_key" can be set to the class name (minus the trailing "IE",
397 e.g. "Youtube") if the extractor class is known in advance.
398 Additionally, the dictionary may have any properties of the resolved entity
399 known in advance, for example "title" if the title of the referred video is
400 known ahead of time.
401
402
403 _type "url_transparent" entities have the same specification as "url", but
404 indicate that the given additional information is more precise than the one
405 associated with the resolved URL.
406 This is useful when a site employs a video service that hosts the video and
407 its technical metadata, but that video service does not embed a useful
408 title, description etc.
409
410
411 Subclasses of this one should re-define the _real_initialize() and
412 _real_extract() methods and define a _VALID_URL regexp.
413 Probably, they should also be added to the list of extractors.
414
415 Subclasses may also override suitable() if necessary, but ensure the function
416 signature is preserved and that this function imports everything it needs
417 (except other extractors), so that lazy_extractors works correctly
418
419 _GEO_BYPASS attribute may be set to False in order to disable
420 geo restriction bypass mechanisms for a particular extractor.
421 Though it won't disable explicit geo restriction bypass based on
422 country code provided with geo_bypass_country.
423
424 _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
425 countries for this extractor. One of these countries will be used by
426 geo restriction bypass mechanism right away in order to bypass
427 geo restriction, of course, if the mechanism is not disabled.
428
429 _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
430 IP blocks in CIDR notation for this extractor. One of these IP blocks
431 will be used by geo restriction bypass mechanism similarly
432 to _GEO_COUNTRIES.
433
434 The _WORKING attribute should be set to False for broken IEs
435 in order to warn the users and skip the tests.
436 """
437
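# For illustration only -- a minimal, hypothetical info_dict of the shape the
# docstring above describes. The values are made up (reusing the docstring's
# own "dancing naked mole rats" example), not taken from any real extractor:
#
#     info_dict = {
#         'id': '4234987',
#         'title': 'Dancing naked mole rats',
#         'display_id': 'dancing-naked-mole-rats',
#         'formats': [{
#             'url': 'https://example.com/video_360p.mp4',
#             'format_id': 'mp4-360p',
#             'ext': 'mp4',
#             'width': 640,
#             'height': 360,
#             'vcodec': 'avc1.42001e',
#             'acodec': 'mp4a.40.2',
#         }],
#         'subtitles': {
#             'en': [{'url': 'https://example.com/subs_en.vtt', 'ext': 'vtt'}],
#         },
#         'duration': 123.0,
#     }
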
438 _ready = False
439 _downloader = None
440 _x_forwarded_for_ip = None
441 _GEO_BYPASS = True
442 _GEO_COUNTRIES = None
443 _GEO_IP_BLOCKS = None
444 _WORKING = True
445
446 _LOGIN_HINTS = {
447 'any': 'Use --cookies, --username and --password, or --netrc to provide account credentials',
448 'cookies': (
449 'Use --cookies-from-browser or --cookies for the authentication. '
450 'See https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl for how to manually pass cookies'),
451 'password': 'Use --username and --password, or --netrc to provide account credentials',
452 }
453
454 def __init__(self, downloader=None):
455 """Constructor. Receives an optional downloader (a YoutubeDL instance).
456 If a downloader is not passed during initialization,
457 it must be set using "set_downloader()" before "extract()" is called"""
458 self._ready = False
459 self._x_forwarded_for_ip = None
460 self._printed_messages = set()
461 self.set_downloader(downloader)
462
463 @classmethod
464 def _match_valid_url(cls, url):
465 # This does not use has/getattr intentionally - we want to know whether
466 # we have cached the regexp for *this* class, whereas getattr would also
467 # match the superclass
468 if '_VALID_URL_RE' not in cls.__dict__:
469 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
470 return cls._VALID_URL_RE.match(url)
471
472 @classmethod
473 def suitable(cls, url):
474 """Receives a URL and returns True if suitable for this IE."""
475 # This function must import everything it needs (except other extractors),
476 # so that lazy_extractors works correctly
477 return cls._match_valid_url(url) is not None
478
479 @classmethod
480 def _match_id(cls, url):
481 return cls._match_valid_url(url).group('id')
482
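# Illustrative sketch (not part of this module): how a subclass typically
# pairs _VALID_URL with _match_id. The class name and URL pattern here are
# hypothetical; _match_id requires a named group called 'id':
#
#     class ExampleIE(InfoExtractor):
#         _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>[0-9]+)'
#
#         def _real_extract(self, url):
#             video_id = self._match_id(url)  # '1234' for .../watch/1234
#             ...
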
483 @classmethod
484 def get_temp_id(cls, url):
485 try:
486 return cls._match_id(url)
487 except (IndexError, AttributeError):
488 return None
489
490 @classmethod
491 def working(cls):
492 """Getter method for _WORKING."""
493 return cls._WORKING
494
495 def initialize(self):
496 """Initializes an instance (authentication, etc)."""
497 self._printed_messages = set()
498 self._initialize_geo_bypass({
499 'countries': self._GEO_COUNTRIES,
500 'ip_blocks': self._GEO_IP_BLOCKS,
501 })
502 if not self._ready:
503 self._real_initialize()
504 self._ready = True
505
506 def _initialize_geo_bypass(self, geo_bypass_context):
507 """
508 Initialize geo restriction bypass mechanism.
509
510 This method is used to initialize geo bypass mechanism based on faking
511 X-Forwarded-For HTTP header. A random country from provided country list
512 is selected and a random IP belonging to this country is generated. This
513 IP will be passed as X-Forwarded-For HTTP header in all subsequent
514 HTTP requests.
515
516 This method will be used for initial geo bypass mechanism initialization
517 during the instance initialization with _GEO_COUNTRIES and
518 _GEO_IP_BLOCKS.
519
520 You may also manually call it from extractor's code if geo bypass
521 information is not available beforehand (e.g. obtained during
522 extraction) or due to some other reason. In this case you should pass
523 this information in geo bypass context passed as first argument. It may
524 contain following fields:
525
526 countries: List of geo unrestricted countries (similar
527 to _GEO_COUNTRIES)
528 ip_blocks: List of geo unrestricted IP blocks in CIDR notation
529 (similar to _GEO_IP_BLOCKS)
530
531 """
532 if not self._x_forwarded_for_ip:
533
534 # Geo bypass mechanism is explicitly disabled by user
535 if not self.get_param('geo_bypass', True):
536 return
537
538 if not geo_bypass_context:
539 geo_bypass_context = {}
540
541 # Backward compatibility: previously _initialize_geo_bypass
542 # expected a list of countries, some 3rd party code may still use
543 # it this way
544 if isinstance(geo_bypass_context, (list, tuple)):
545 geo_bypass_context = {
546 'countries': geo_bypass_context,
547 }
548
549 # The whole point of geo bypass mechanism is to fake IP
550 # as X-Forwarded-For HTTP header based on some IP block or
551 # country code.
552
553 # Path 1: bypassing based on IP block in CIDR notation
554
555 # Explicit IP block specified by user, use it right away
556 # regardless of whether extractor is geo bypassable or not
557 ip_block = self.get_param('geo_bypass_ip_block', None)
558
559 # Otherwise use random IP block from geo bypass context but only
560 # if extractor is known as geo bypassable
561 if not ip_block:
562 ip_blocks = geo_bypass_context.get('ip_blocks')
563 if self._GEO_BYPASS and ip_blocks:
564 ip_block = random.choice(ip_blocks)
565
566 if ip_block:
567 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
568 self._downloader.write_debug(
569 'Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
570 return
571
572 # Path 2: bypassing based on country code
573
574 # Explicit country code specified by user, use it right away
575 # regardless of whether extractor is geo bypassable or not
576 country = self.get_param('geo_bypass_country', None)
577
578 # Otherwise use random country code from geo bypass context but
579 # only if extractor is known as geo bypassable
580 if not country:
581 countries = geo_bypass_context.get('countries')
582 if self._GEO_BYPASS and countries:
583 country = random.choice(countries)
584
585 if country:
586 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
587 self._downloader.write_debug(
588 'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))
589
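# Illustrative sketch: an extractor that only learns its geo-bypass data
# during extraction can re-initialize the mechanism manually, as the
# docstring above describes. The country codes and IP block are hypothetical:
#
#     self._initialize_geo_bypass({
#         'countries': ['DE', 'FR'],
#         'ip_blocks': ['203.0.113.0/24'],
#     })
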
590 def extract(self, url):
591 """Extracts URL information and returns it in list of dicts."""
592 try:
593 for _ in range(2):
594 try:
595 self.initialize()
596 self.write_debug('Extracting URL: %s' % url)
597 ie_result = self._real_extract(url)
598 if ie_result is None:
599 return None
600 if self._x_forwarded_for_ip:
601 ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
602 subtitles = ie_result.get('subtitles')
603 if (subtitles and 'live_chat' in subtitles
604 and 'no-live-chat' in self.get_param('compat_opts', [])):
605 del subtitles['live_chat']
606 return ie_result
607 except GeoRestrictedError as e:
608 if self.__maybe_fake_ip_and_retry(e.countries):
609 continue
610 raise
611 except UnsupportedError:
612 raise
613 except ExtractorError as e:
614 kwargs = {
615 'video_id': e.video_id or self.get_temp_id(url),
616 'ie': self.IE_NAME,
617 'tb': e.traceback,
618 'expected': e.expected,
619 'cause': e.cause
620 }
621 if hasattr(e, 'countries'):
622 kwargs['countries'] = e.countries
623 raise type(e)(e.msg, **kwargs)
624 except compat_http_client.IncompleteRead as e:
625 raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
626 except (KeyError, StopIteration) as e:
627 raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
628
629 def __maybe_fake_ip_and_retry(self, countries):
630 if (not self.get_param('geo_bypass_country', None)
631 and self._GEO_BYPASS
632 and self.get_param('geo_bypass', True)
633 and not self._x_forwarded_for_ip
634 and countries):
635 country_code = random.choice(countries)
636 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
637 if self._x_forwarded_for_ip:
638 self.report_warning(
639 'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
640 % (self._x_forwarded_for_ip, country_code.upper()))
641 return True
642 return False
643
644 def set_downloader(self, downloader):
645 """Sets the downloader for this IE."""
646 self._downloader = downloader
647
648 def _real_initialize(self):
649 """Real initialization process. Redefine in subclasses."""
650 pass
651
652 def _real_extract(self, url):
653 """Real extraction process. Redefine in subclasses."""
654 pass
655
656 @classmethod
657 def ie_key(cls):
658 """A string for getting the InfoExtractor with get_info_extractor"""
659 return cls.__name__[:-2]
660
661 @property
662 def IE_NAME(self):
663 return compat_str(type(self).__name__[:-2])
664
665 @staticmethod
666 def __can_accept_status_code(err, expected_status):
667 assert isinstance(err, compat_urllib_error.HTTPError)
668 if expected_status is None:
669 return False
670 elif callable(expected_status):
671 return expected_status(err.code) is True
672 else:
673 return err.code in variadic(expected_status)
674
675 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
676 """
677 Return the response handle.
678
679 See _download_webpage docstring for arguments specification.
680 """
681 if not self._downloader._first_webpage_request:
682 sleep_interval = self.get_param('sleep_interval_requests') or 0
683 if sleep_interval > 0:
684 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
685 time.sleep(sleep_interval)
686 else:
687 self._downloader._first_webpage_request = False
688
689 if note is None:
690 self.report_download_webpage(video_id)
691 elif note is not False:
692 if video_id is None:
693 self.to_screen('%s' % (note,))
694 else:
695 self.to_screen('%s: %s' % (video_id, note))
696
697 # Some sites check X-Forwarded-For HTTP header in order to figure out
698 # the origin of the client behind a proxy. This allows bypassing geo
699 # restriction by faking this header's value to an IP that belongs to some
700 # geo unrestricted country. We will do so once we encounter any
701 # geo restriction error.
702 if self._x_forwarded_for_ip:
703 if 'X-Forwarded-For' not in headers:
704 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
705
706 if isinstance(url_or_request, compat_urllib_request.Request):
707 url_or_request = update_Request(
708 url_or_request, data=data, headers=headers, query=query)
709 else:
710 if query:
711 url_or_request = update_url_query(url_or_request, query)
712 if data is not None or headers:
713 url_or_request = sanitized_Request(url_or_request, data, headers)
714 try:
715 return self._downloader.urlopen(url_or_request)
716 except network_exceptions as err:
717 if isinstance(err, compat_urllib_error.HTTPError):
718 if self.__can_accept_status_code(err, expected_status):
719 # Retain reference to error to prevent file object from
720 # being closed before it can be read. Works around the
721 # effects of <https://bugs.python.org/issue15002>
722 # introduced in Python 3.4.1.
723 err.fp._error = err
724 return err.fp
725
726 if errnote is False:
727 return False
728 if errnote is None:
729 errnote = 'Unable to download webpage'
730
731 errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
732 if fatal:
733 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
734 else:
735 self.report_warning(errmsg)
736 return False
737
738 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
739 """
740 Return a tuple (page content as string, URL handle).
741
742 See _download_webpage docstring for arguments specification.
743 """
744 # Strip hashes from the URL (#1038)
745 if isinstance(url_or_request, (compat_str, str)):
746 url_or_request = url_or_request.partition('#')[0]
747
748 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
749 if urlh is False:
750 assert not fatal
751 return False
752 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
753 return (content, urlh)
754
755 @staticmethod
756 def _guess_encoding_from_content(content_type, webpage_bytes):
757 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
758 if m:
759 encoding = m.group(1)
760 else:
761 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
762 webpage_bytes[:1024])
763 if m:
764 encoding = m.group(1).decode('ascii')
765 elif webpage_bytes.startswith(b'\xff\xfe'):
766 encoding = 'utf-16'
767 else:
768 encoding = 'utf-8'
769
770 return encoding
771
772 def __check_blocked(self, content):
773 first_block = content[:512]
774 if ('<title>Access to this site is blocked</title>' in content
775 and 'Websense' in first_block):
776 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
777 blocked_iframe = self._html_search_regex(
778 r'<iframe src="([^"]+)"', content,
779 'Websense information URL', default=None)
780 if blocked_iframe:
781 msg += ' Visit %s for more details' % blocked_iframe
782 raise ExtractorError(msg, expected=True)
783 if '<title>The URL you requested has been blocked</title>' in first_block:
784 msg = (
785 'Access to this webpage has been blocked by Indian censorship. '
786 'Use a VPN or proxy server (with --proxy) to route around it.')
787 block_msg = self._html_search_regex(
788 r'</h1><p>(.*?)</p>',
789 content, 'block message', default=None)
790 if block_msg:
791 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
792 raise ExtractorError(msg, expected=True)
793 if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
794 and 'blocklist.rkn.gov.ru' in content):
795 raise ExtractorError(
796 'Access to this webpage has been blocked by decision of the Russian government. '
797 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
798 expected=True)
799
800 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
801 content_type = urlh.headers.get('Content-Type', '')
802 webpage_bytes = urlh.read()
803 if prefix is not None:
804 webpage_bytes = prefix + webpage_bytes
805 if not encoding:
806 encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
807 if self.get_param('dump_intermediate_pages', False):
808 self.to_screen('Dumping request to ' + urlh.geturl())
809 dump = base64.b64encode(webpage_bytes).decode('ascii')
810 self._downloader.to_screen(dump)
811 if self.get_param('write_pages', False):
812 basen = '%s_%s' % (video_id, urlh.geturl())
813 trim_length = self.get_param('trim_file_name') or 240
814 if len(basen) > trim_length:
815 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
816 basen = basen[:trim_length - len(h)] + h
817 raw_filename = basen + '.dump'
818 filename = sanitize_filename(raw_filename, restricted=True)
819 self.to_screen('Saving request to ' + filename)
820 # Working around MAX_PATH limitation on Windows (see
821 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
822 if compat_os_name == 'nt':
823 absfilepath = os.path.abspath(filename)
824 if len(absfilepath) > 259:
825 filename = '\\\\?\\' + absfilepath
826 with open(filename, 'wb') as outf:
827 outf.write(webpage_bytes)
828
829 try:
830 content = webpage_bytes.decode(encoding, 'replace')
831 except LookupError:
832 content = webpage_bytes.decode('utf-8', 'replace')
833
834 self.__check_blocked(content)
835
836 return content
837
838 def _download_webpage(
839 self, url_or_request, video_id, note=None, errnote=None,
840 fatal=True, tries=1, timeout=5, encoding=None, data=None,
841 headers={}, query={}, expected_status=None):
842 """
843 Return the data of the page as a string.
844
845 Arguments:
846 url_or_request -- plain text URL as a string or
847 a compat_urllib_request.Request object
848 video_id -- Video/playlist/item identifier (string)
849
850 Keyword arguments:
851 note -- note printed before downloading (string)
852 errnote -- note printed in case of an error (string)
853 fatal -- flag denoting whether errors should be considered fatal,
854 i.e. whether they should cause ExtractorError to be raised;
855 otherwise a warning is reported and extraction continues
856 tries -- number of tries
857 timeout -- sleep interval between tries
858 encoding -- encoding for a page content decoding, guessed automatically
859 when not explicitly specified
860 data -- POST data (bytes)
861 headers -- HTTP headers (dict)
862 query -- URL query (dict)
863 expected_status -- allows accepting failed HTTP requests (non-2xx
864 status code) by explicitly specifying a set of accepted status
865 codes. Can be any of the following entities:
866 - an integer type specifying an exact failed status code to
867 accept
868 - a list or a tuple of integer types specifying a list of
869 failed status codes to accept
870 - a callable accepting an actual failed status code and
871 returning True if it should be accepted
872 Note that this argument does not affect success status codes (2xx)
873 which are always accepted.
874 """
875
876 success = False
877 try_count = 0
878 while success is False:
879 try:
880 res = self._download_webpage_handle(
881 url_or_request, video_id, note, errnote, fatal,
882 encoding=encoding, data=data, headers=headers, query=query,
883 expected_status=expected_status)
884 success = True
885 except compat_http_client.IncompleteRead as e:
886 try_count += 1
887 if try_count >= tries:
888 raise e
889 self._sleep(timeout, video_id)
890 if res is False:
891 return res
892 else:
893 content, _ = res
894 return content
895
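# Illustrative sketch: accepting a non-2xx response via expected_status, as
# documented above. The URL and note are hypothetical:
#
#     webpage = self._download_webpage(
#         'https://example.com/video/%s' % video_id, video_id,
#         note='Downloading video page',
#         # accept 404 so the error page body can still be parsed
#         expected_status=404)
#
# expected_status may also be a list/tuple of codes, or a callable, e.g.
# expected_status=lambda status: 400 <= status < 500.
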
896 def _download_xml_handle(
897 self, url_or_request, video_id, note='Downloading XML',
898 errnote='Unable to download XML', transform_source=None,
899 fatal=True, encoding=None, data=None, headers={}, query={},
900 expected_status=None):
901 """
902 Return a tuple (XML as a compat_etree_Element, URL handle).
903
904 See _download_webpage docstring for arguments specification.
905 """
906 res = self._download_webpage_handle(
907 url_or_request, video_id, note, errnote, fatal=fatal,
908 encoding=encoding, data=data, headers=headers, query=query,
909 expected_status=expected_status)
910 if res is False:
911 return res
912 xml_string, urlh = res
913 return self._parse_xml(
914 xml_string, video_id, transform_source=transform_source,
915 fatal=fatal), urlh
916
917 def _download_xml(
918 self, url_or_request, video_id,
919 note='Downloading XML', errnote='Unable to download XML',
920 transform_source=None, fatal=True, encoding=None,
921 data=None, headers={}, query={}, expected_status=None):
922 """
923 Return the XML as a compat_etree_Element.
924
925 See _download_webpage docstring for arguments specification.
926 """
927 res = self._download_xml_handle(
928 url_or_request, video_id, note=note, errnote=errnote,
929 transform_source=transform_source, fatal=fatal, encoding=encoding,
930 data=data, headers=headers, query=query,
931 expected_status=expected_status)
932 return res if res is False else res[0]
933
934 def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
935 if transform_source:
936 xml_string = transform_source(xml_string)
937 try:
938 return compat_etree_fromstring(xml_string.encode('utf-8'))
939 except compat_xml_parse_error as ve:
940 errmsg = '%s: Failed to parse XML ' % video_id
941 if fatal:
942 raise ExtractorError(errmsg, cause=ve)
943 else:
944 self.report_warning(errmsg + str(ve))
945
946 def _download_json_handle(
947 self, url_or_request, video_id, note='Downloading JSON metadata',
948 errnote='Unable to download JSON metadata', transform_source=None,
949 fatal=True, encoding=None, data=None, headers={}, query={},
950 expected_status=None):
951 """
952 Return a tuple (JSON object, URL handle).
953
954 See _download_webpage docstring for arguments specification.
955 """
956 res = self._download_webpage_handle(
957 url_or_request, video_id, note, errnote, fatal=fatal,
958 encoding=encoding, data=data, headers=headers, query=query,
959 expected_status=expected_status)
960 if res is False:
961 return res
962 json_string, urlh = res
963 return self._parse_json(
964 json_string, video_id, transform_source=transform_source,
965 fatal=fatal), urlh
966
967 def _download_json(
968 self, url_or_request, video_id, note='Downloading JSON metadata',
969 errnote='Unable to download JSON metadata', transform_source=None,
970 fatal=True, encoding=None, data=None, headers={}, query={},
971 expected_status=None):
972 """
973 Return the JSON object as a dict.
974
975 See _download_webpage docstring for arguments specification.
976 """
977 res = self._download_json_handle(
978 url_or_request, video_id, note=note, errnote=errnote,
979 transform_source=transform_source, fatal=fatal, encoding=encoding,
980 data=data, headers=headers, query=query,
981 expected_status=expected_status)
982 return res if res is False else res[0]
983
984 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
985 if transform_source:
986 json_string = transform_source(json_string)
987 try:
988 return json.loads(json_string)
989 except ValueError as ve:
990 errmsg = '%s: Failed to parse JSON ' % video_id
991 if fatal:
992 raise ExtractorError(errmsg, cause=ve)
993 else:
994 self.report_warning(errmsg + str(ve))
995
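# Illustrative sketch: downloading JSON that is wrapped in a JavaScript
# callback, using transform_source to strip the wrapper before parsing
# (the URL and wrapper format are hypothetical):
#
#     data = self._download_json(
#         'https://example.com/api/video/%s?callback=cb' % video_id, video_id,
#         transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1])
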
996 def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
997 return self._parse_json(
998 data[data.find('{'):data.rfind('}') + 1],
999 video_id, transform_source, fatal)
1000
1001 def _download_socket_json_handle(
1002 self, url_or_request, video_id, note='Polling socket',
1003 errnote='Unable to poll socket', transform_source=None,
1004 fatal=True, encoding=None, data=None, headers={}, query={},
1005 expected_status=None):
1006 """
1007 Return a tuple (JSON object, URL handle).
1008
1009 See _download_webpage docstring for arguments specification.
1010 """
1011 res = self._download_webpage_handle(
1012 url_or_request, video_id, note, errnote, fatal=fatal,
1013 encoding=encoding, data=data, headers=headers, query=query,
1014 expected_status=expected_status)
1015 if res is False:
1016 return res
1017 webpage, urlh = res
1018 return self._parse_socket_response_as_json(
1019 webpage, video_id, transform_source=transform_source,
1020 fatal=fatal), urlh
1021
1022 def _download_socket_json(
1023 self, url_or_request, video_id, note='Polling socket',
1024 errnote='Unable to poll socket', transform_source=None,
1025 fatal=True, encoding=None, data=None, headers={}, query={},
1026 expected_status=None):
1027 """
1028 Return the JSON object as a dict.
1029
1030 See _download_webpage docstring for arguments specification.
1031 """
1032 res = self._download_socket_json_handle(
1033 url_or_request, video_id, note=note, errnote=errnote,
1034 transform_source=transform_source, fatal=fatal, encoding=encoding,
1035 data=data, headers=headers, query=query,
1036 expected_status=expected_status)
1037 return res if res is False else res[0]
1038
1039 def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1040 idstr = format_field(video_id, template='%s: ')
1041 msg = f'[{self.IE_NAME}] {idstr}{msg}'
1042 if only_once:
1043 if f'WARNING: {msg}' in self._printed_messages:
1044 return
1045 self._printed_messages.add(f'WARNING: {msg}')
1046 self._downloader.report_warning(msg, *args, **kwargs)
1047
1048 def to_screen(self, msg, *args, **kwargs):
1049 """Print msg to screen, prefixing it with '[ie_name]'"""
1050 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1051
1052 def write_debug(self, msg, *args, **kwargs):
1053 self._downloader.write_debug('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1054
1055 def get_param(self, name, default=None, *args, **kwargs):
1056 if self._downloader:
1057 return self._downloader.params.get(name, default, *args, **kwargs)
1058 return default
1059
1060 def report_drm(self, video_id, partial=False):
1061 self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1062
1063 def report_extraction(self, id_or_name):
1064 """Report information extraction."""
1065 self.to_screen('%s: Extracting information' % id_or_name)
1066
1067 def report_download_webpage(self, video_id):
1068 """Report webpage download."""
1069 self.to_screen('%s: Downloading webpage' % video_id)
1070
1071 def report_age_confirmation(self):
1072 """Report attempt to confirm age."""
1073 self.to_screen('Confirming age')
1074
1075 def report_login(self):
1076 """Report attempt to log in."""
1077 self.to_screen('Logging in')
1078
1079 def raise_login_required(
1080 self, msg='This video is only available for registered users',
1081 metadata_available=False, method='any'):
1082 if metadata_available and self.get_param('ignore_no_formats_error'):
1083 self.report_warning(msg)
1084 if method is not None:
1085 msg = '%s. %s' % (msg, self._LOGIN_HINTS[method])
1086 raise ExtractorError(msg, expected=True)
1087
1088 def raise_geo_restricted(
1089 self, msg='This video is not available from your location due to geo restriction',
1090 countries=None, metadata_available=False):
1091 if metadata_available and self.get_param('ignore_no_formats_error'):
1092 self.report_warning(msg)
1093 else:
1094 raise GeoRestrictedError(msg, countries=countries)
1095
1096 def raise_no_formats(self, msg, expected=False, video_id=None):
1097 if expected and self.get_param('ignore_no_formats_error'):
1098 self.report_warning(msg, video_id)
1099 elif isinstance(msg, ExtractorError):
1100 raise msg
1101 else:
1102 raise ExtractorError(msg, expected=expected, video_id=video_id)
1103
1104 # Methods for following #608
1105 @staticmethod
1106 def url_result(url, ie=None, video_id=None, video_title=None, **kwargs):
1107 """Returns a URL that points to a page that should be processed"""
1108 # TODO: ie should be the class used for getting the info
1109 video_info = {'_type': 'url',
1110 'url': url,
1111 'ie_key': ie}
1112 video_info.update(kwargs)
1113 if video_id is not None:
1114 video_info['id'] = video_id
1115 if video_title is not None:
1116 video_info['title'] = video_title
1117 return video_info
1118
1119 def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
1120 urls = orderedSet(
1121 self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
1122 for m in matches)
1123 return self.playlist_result(
1124 urls, playlist_id=playlist_id, playlist_title=playlist_title)
1125
1126 @staticmethod
1127 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
1128 """Returns a playlist"""
1129 video_info = {'_type': 'playlist',
1130 'entries': entries}
1131 video_info.update(kwargs)
1132 if playlist_id:
1133 video_info['id'] = playlist_id
1134 if playlist_title:
1135 video_info['title'] = playlist_title
1136 if playlist_description is not None:
1137 video_info['description'] = playlist_description
1138 return video_info
1139
1140 def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1141 """
1142 Perform a regex search on the given string, using a single pattern or a
1143 list of patterns, returning the first matching group.
1144 In case of failure, return a default value, report a warning, or raise a
1145 RegexNotFoundError, depending on fatal, specifying the field name.
1146 """
1147 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
1148 mobj = re.search(pattern, string, flags)
1149 else:
1150 for p in pattern:
1151 mobj = re.search(p, string, flags)
1152 if mobj:
1153 break
1154
1155 _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1156
1157 if mobj:
1158 if group is None:
1159 # return the first matching group
1160 return next(g for g in mobj.groups() if g is not None)
1161 elif isinstance(group, (list, tuple)):
1162 return tuple(mobj.group(g) for g in group)
1163 else:
1164 return mobj.group(group)
1165 elif default is not NO_DEFAULT:
1166 return default
1167 elif fatal:
1168 raise RegexNotFoundError('Unable to extract %s' % _name)
1169 else:
1170 self.report_warning('unable to extract %s' % _name + bug_reports_message())
1171 return None
1172
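# Illustrative sketch of _search_regex: extracting a named group with a
# default, so failure is non-fatal. The pattern and input are hypothetical:
#
#     video_id = self._search_regex(
#         r'data-video-id=["\'](?P<id>\d+)', webpage, 'video id',
#         group='id', default=None)
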
1173 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1174 """
1175 Like _search_regex, but strips HTML tags and unescapes entities.
1176 """
1177 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1178 if res:
1179 return clean_html(res).strip()
1180 else:
1181 return res
1182
1183 def _get_netrc_login_info(self, netrc_machine=None):
1184 username = None
1185 password = None
1186 netrc_machine = netrc_machine or self._NETRC_MACHINE
1187
1188 if self.get_param('usenetrc', False):
1189 try:
1190 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1191 if os.path.isdir(netrc_file):
1192 netrc_file = os.path.join(netrc_file, '.netrc')
1193 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1194 if info is not None:
1195 username = info[0]
1196 password = info[2]
1197 else:
1198 raise netrc.NetrcParseError(
1199 'No authenticators for %s' % netrc_machine)
1200 except (IOError, netrc.NetrcParseError) as err:
1201 self.report_warning(
1202 'parsing .netrc: %s' % error_to_compat_str(err))
1203
1204 return username, password
1205
1206 def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1207 """
1208 Get the login info as (username, password)
1209 First look for the manually specified credentials using username_option
1210 and password_option as keys in params dictionary. If no such credentials
1211 are available, look in the netrc file using the netrc_machine or _NETRC_MACHINE
1212 value.
1213 If there's no info available, return (None, None)
1214 """
1215
1216 # Attempt to use provided username and password or .netrc data
1217 username = self.get_param(username_option)
1218 if username is not None:
1219 password = self.get_param(password_option)
1220 else:
1221 username, password = self._get_netrc_login_info(netrc_machine)
1222
1223 return username, password
1224
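# Illustrative sketch: with --netrc (usenetrc), credentials for a
# hypothetical _NETRC_MACHINE of 'example' would be read from a netrc line
# such as
#
#     machine example login myuser password mypass
#
# and _get_login_info() would then return ('myuser', 'mypass').
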
1225 def _get_tfa_info(self, note='two-factor verification code'):
1226 """
1227 Get the two-factor authentication info
1228 TODO - asking the user will be required for sms/phone verify
1229 currently just uses the command line option
1230 If there's no info available, return None
1231 """
1232
1233 tfa = self.get_param('twofactor')
1234 if tfa is not None:
1235 return tfa
1236
1237 return compat_getpass('Type %s and press [Return]: ' % note)
1238
1239 # Helper functions for extracting OpenGraph info
1240 @staticmethod
1241 def _og_regexes(prop):
1242 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1243 property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
1244 % {'prop': re.escape(prop)})
1245 template = r'<meta[^>]+?%s[^>]+?%s'
1246 return [
1247 template % (property_re, content_re),
1248 template % (content_re, property_re),
1249 ]
1250
1251 @staticmethod
1252 def _meta_regex(prop):
1253 return r'''(?isx)<meta
1254 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1255 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1256
1257 def _og_search_property(self, prop, html, name=None, **kargs):
1258 prop = variadic(prop)
1259 if name is None:
1260 name = 'OpenGraph %s' % prop[0]
1261 og_regexes = []
1262 for p in prop:
1263 og_regexes.extend(self._og_regexes(p))
1264 escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1265 if escaped is None:
1266 return None
1267 return unescapeHTML(escaped)
1268
1269 def _og_search_thumbnail(self, html, **kargs):
1270 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1271
1272 def _og_search_description(self, html, **kargs):
1273 return self._og_search_property('description', html, fatal=False, **kargs)
1274
1275 def _og_search_title(self, html, **kargs):
1276 return self._og_search_property('title', html, **kargs)
1277
1278 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1279 regexes = self._og_regexes('video') + self._og_regexes('video:url')
1280 if secure:
1281 regexes = self._og_regexes('video:secure_url') + regexes
1282 return self._html_search_regex(regexes, html, name, **kargs)
1283
1284 def _og_search_url(self, html, **kargs):
1285 return self._og_search_property('url', html, **kargs)
1286
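# Illustrative sketch: the OpenGraph helpers above match standard og: meta
# tags. For this hypothetical markup
#
#     <meta property="og:title" content="Some title" />
#
# self._og_search_title(webpage) would return 'Some title' (HTML-unescaped).
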
1287 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1288 name = variadic(name)
1289 if display_name is None:
1290 display_name = name[0]
1291 return self._html_search_regex(
1292 [self._meta_regex(n) for n in name],
1293 html, display_name, fatal=fatal, group='content', **kwargs)
1294
1295 def _dc_search_uploader(self, html):
1296 return self._html_search_meta('dc.creator', html, 'uploader')
1297
1298 def _rta_search(self, html):
1299 # See http://www.rtalabel.org/index.php?content=howtofaq#single
1300 if re.search(r'(?ix)<meta\s+name="rating"\s+'
1301 r' content="RTA-5042-1996-1400-1577-RTA"',
1302 html):
1303 return 18
1304 return 0
1305
1306 def _media_rating_search(self, html):
1307 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1308 rating = self._html_search_meta('rating', html)
1309
1310 if not rating:
1311 return None
1312
1313 RATING_TABLE = {
1314 'safe for kids': 0,
1315 'general': 8,
1316 '14 years': 14,
1317 'mature': 17,
1318 'restricted': 19,
1319 }
1320 return RATING_TABLE.get(rating.lower())
1321
1322 def _family_friendly_search(self, html):
1323 # See http://schema.org/VideoObject
1324 family_friendly = self._html_search_meta(
1325 'isFamilyFriendly', html, default=None)
1326
1327 if not family_friendly:
1328 return None
1329
1330 RATING_TABLE = {
1331 '1': 0,
1332 'true': 0,
1333 '0': 18,
1334 'false': 18,
1335 }
1336 return RATING_TABLE.get(family_friendly.lower())
1337
1338 def _twitter_search_player(self, html):
1339 return self._html_search_meta('twitter:player', html,
1340 'twitter card player')
1341
1342 def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1343 json_ld_list = list(re.finditer(JSON_LD_RE, html))
1344 default = kwargs.get('default', NO_DEFAULT)
1345 # JSON-LD may be malformed and thus `fatal` should be respected.
1346 # At the same time, `default` may be passed, which assumes `fatal=False`
1347 # for _search_regex. Let's simulate the same behavior here as well.
1348 fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
1349 json_ld = []
1350 for mobj in json_ld_list:
1351 json_ld_item = self._parse_json(
1352 mobj.group('json_ld'), video_id, fatal=fatal)
1353 if not json_ld_item:
1354 continue
1355 if isinstance(json_ld_item, dict):
1356 json_ld.append(json_ld_item)
1357 elif isinstance(json_ld_item, (list, tuple)):
1358 json_ld.extend(json_ld_item)
1359 if json_ld:
1360 json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1361 if json_ld:
1362 return json_ld
1363 if default is not NO_DEFAULT:
1364 return default
1365 elif fatal:
1366 raise RegexNotFoundError('Unable to extract JSON-LD')
1367 else:
1368 self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1369 return {}
1370
1371 def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1372 if isinstance(json_ld, compat_str):
1373 json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1374 if not json_ld:
1375 return {}
1376 info = {}
1377 if not isinstance(json_ld, (list, tuple, dict)):
1378 return info
1379 if isinstance(json_ld, dict):
1380 json_ld = [json_ld]
1381
1382 INTERACTION_TYPE_MAP = {
1383 'CommentAction': 'comment',
1384 'AgreeAction': 'like',
1385 'DisagreeAction': 'dislike',
1386 'LikeAction': 'like',
1387 'DislikeAction': 'dislike',
1388 'ListenAction': 'view',
1389 'WatchAction': 'view',
1390 'ViewAction': 'view',
1391 }
1392
1393 def extract_interaction_type(e):
1394 interaction_type = e.get('interactionType')
1395 if isinstance(interaction_type, dict):
1396 interaction_type = interaction_type.get('@type')
1397 return str_or_none(interaction_type)
1398
1399 def extract_interaction_statistic(e):
1400 interaction_statistic = e.get('interactionStatistic')
1401 if isinstance(interaction_statistic, dict):
1402 interaction_statistic = [interaction_statistic]
1403 if not isinstance(interaction_statistic, list):
1404 return
1405 for is_e in interaction_statistic:
1406 if not isinstance(is_e, dict):
1407 continue
1408 if is_e.get('@type') != 'InteractionCounter':
1409 continue
1410 interaction_type = extract_interaction_type(is_e)
1411 if not interaction_type:
1412 continue
1413 # For interaction count some sites provide a string instead of
1414 # an integer (as per spec) with non-digit characters (e.g. ","),
1415 # so extract the count with the more relaxed str_to_int
1416 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1417 if interaction_count is None:
1418 continue
1419 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1420 if not count_kind:
1421 continue
1422 count_key = '%s_count' % count_kind
1423 if info.get(count_key) is not None:
1424 continue
1425 info[count_key] = interaction_count
1426
1427 def extract_video_object(e):
1428 assert e['@type'] == 'VideoObject'
1429 author = e.get('author')
1430 info.update({
1431 'url': url_or_none(e.get('contentUrl')),
1432 'title': unescapeHTML(e.get('name')),
1433 'description': unescapeHTML(e.get('description')),
1434 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
1435 'duration': parse_duration(e.get('duration')),
1436 'timestamp': unified_timestamp(e.get('uploadDate')),
1437 # author can be an instance of 'Organization' or 'Person' types.
1438 # both types can have 'name' property (inherited from 'Thing' type). [1]
1439 # However, some websites use 'Text' type instead.
1440 # 1. https://schema.org/VideoObject
1441 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
1442 'filesize': float_or_none(e.get('contentSize')),
1443 'tbr': int_or_none(e.get('bitrate')),
1444 'width': int_or_none(e.get('width')),
1445 'height': int_or_none(e.get('height')),
1446 'view_count': int_or_none(e.get('interactionCount')),
1447 })
1448 extract_interaction_statistic(e)
1449
1450 for e in json_ld:
1451 if '@context' in e:
1452 item_type = e.get('@type')
1453 if expected_type is not None and expected_type != item_type:
1454 continue
1455 rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
1456 if rating is not None:
1457 info['average_rating'] = rating
1458 if item_type in ('TVEpisode', 'Episode'):
1459 episode_name = unescapeHTML(e.get('name'))
1460 info.update({
1461 'episode': episode_name,
1462 'episode_number': int_or_none(e.get('episodeNumber')),
1463 'description': unescapeHTML(e.get('description')),
1464 })
1465 if not info.get('title') and episode_name:
1466 info['title'] = episode_name
1467 part_of_season = e.get('partOfSeason')
1468 if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1469 info.update({
1470 'season': unescapeHTML(part_of_season.get('name')),
1471 'season_number': int_or_none(part_of_season.get('seasonNumber')),
1472 })
1473 part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1474 if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1475 info['series'] = unescapeHTML(part_of_series.get('name'))
1476 elif item_type == 'Movie':
1477 info.update({
1478 'title': unescapeHTML(e.get('name')),
1479 'description': unescapeHTML(e.get('description')),
1480 'duration': parse_duration(e.get('duration')),
1481 'timestamp': unified_timestamp(e.get('dateCreated')),
1482 })
1483 elif item_type in ('Article', 'NewsArticle'):
1484 info.update({
1485 'timestamp': parse_iso8601(e.get('datePublished')),
1486 'title': unescapeHTML(e.get('headline')),
1487 'description': unescapeHTML(e.get('articleBody')),
1488 })
1489 elif item_type == 'VideoObject':
1490 extract_video_object(e)
1491 if expected_type is None:
1492 continue
1493 else:
1494 break
1495 video = e.get('video')
1496 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1497 extract_video_object(video)
1498 if expected_type is None:
1499 continue
1500 else:
1501 break
1502 return {k: v for k, v in info.items() if v is not None}
1503
1504 def _search_nextjs_data(self, webpage, video_id, **kw):
1505 return self._parse_json(
1506 self._search_regex(
1507 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1508 webpage, 'next.js data', **kw),
1509 video_id, **kw)
1510
1511 @staticmethod
1512 def _hidden_inputs(html):
1513 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1514 hidden_inputs = {}
1515 for input in re.findall(r'(?i)(<input[^>]+>)', html):
1516 attrs = extract_attributes(input)
1517 if not attrs:
1518 continue
1519 if attrs.get('type') not in ('hidden', 'submit'):
1520 continue
1521 name = attrs.get('name') or attrs.get('id')
1522 value = attrs.get('value')
1523 if name and value is not None:
1524 hidden_inputs[name] = value
1525 return hidden_inputs
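# Usage sketch (hypothetical markup):
#     _hidden_inputs('<input type="hidden" name="token" value="abc123">')
#     returns {'token': 'abc123'}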
1526
1527 def _form_hidden_inputs(self, form_id, html):
1528 form = self._search_regex(
1529 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1530 html, '%s form' % form_id, group='form')
1531 return self._hidden_inputs(form)
1532
1533 class FormatSort:
1534 regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
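# e.g. 'res:720' parses as field='res', separator=':', limit='720',
# while '+size' parses as reverse='+', field='size'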
1535
1536 default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
1537 'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
1538 'proto', 'ext', 'hasaud', 'source', 'format_id') # These must not be aliases
1539 ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
1540 'height', 'width', 'proto', 'vext', 'abr', 'aext',
1541 'fps', 'fs_approx', 'source', 'format_id')
1542
1543 settings = {
1544 'vcodec': {'type': 'ordered', 'regex': True,
1545 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1546 'acodec': {'type': 'ordered', 'regex': True,
1547 'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
1548 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
1549 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
1550 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1551 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', '.*dash', 'ws|websocket', '', 'mms|rtsp', 'none', 'f4']},
1552 'vext': {'type': 'ordered', 'field': 'video_ext',
1553 'order': ('mp4', 'webm', 'flv', '', 'none'),
1554 'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1555 'aext': {'type': 'ordered', 'field': 'audio_ext',
1556 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1557 'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1558 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1559 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
1560 'field': ('vcodec', 'acodec'),
1561 'function': lambda it: int(any(v != 'none' for v in it))},
1562 'ie_pref': {'priority': True, 'type': 'extractor'},
1563 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1564 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1565 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
1566 'quality': {'convert': 'float', 'default': -1},
1567 'filesize': {'convert': 'bytes'},
1568 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1569 'id': {'convert': 'string', 'field': 'format_id'},
1570 'height': {'convert': 'float_none'},
1571 'width': {'convert': 'float_none'},
1572 'fps': {'convert': 'float_none'},
1573 'tbr': {'convert': 'float_none'},
1574 'vbr': {'convert': 'float_none'},
1575 'abr': {'convert': 'float_none'},
1576 'asr': {'convert': 'float_none'},
1577 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
1578
1579 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1580 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1581 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1582 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1583 'res': {'type': 'multiple', 'field': ('height', 'width'),
1584 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
1585
1586 # Most of these exist only for compatibility reasons
1587 'dimension': {'type': 'alias', 'field': 'res'},
1588 'resolution': {'type': 'alias', 'field': 'res'},
1589 'extension': {'type': 'alias', 'field': 'ext'},
1590 'bitrate': {'type': 'alias', 'field': 'br'},
1591 'total_bitrate': {'type': 'alias', 'field': 'tbr'},
1592 'video_bitrate': {'type': 'alias', 'field': 'vbr'},
1593 'audio_bitrate': {'type': 'alias', 'field': 'abr'},
1594 'framerate': {'type': 'alias', 'field': 'fps'},
1595 'language_preference': {'type': 'alias', 'field': 'lang'}, # not named as 'language' because such a field exists
1596 'protocol': {'type': 'alias', 'field': 'proto'},
1597 'source_preference': {'type': 'alias', 'field': 'source'},
1598 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1599 'filesize_estimate': {'type': 'alias', 'field': 'size'},
1600 'samplerate': {'type': 'alias', 'field': 'asr'},
1601 'video_ext': {'type': 'alias', 'field': 'vext'},
1602 'audio_ext': {'type': 'alias', 'field': 'aext'},
1603 'video_codec': {'type': 'alias', 'field': 'vcodec'},
1604 'audio_codec': {'type': 'alias', 'field': 'acodec'},
1605 'video': {'type': 'alias', 'field': 'hasvid'},
1606 'has_video': {'type': 'alias', 'field': 'hasvid'},
1607 'audio': {'type': 'alias', 'field': 'hasaud'},
1608 'has_audio': {'type': 'alias', 'field': 'hasaud'},
1609 'extractor': {'type': 'alias', 'field': 'ie_pref'},
1610 'preference': {'type': 'alias', 'field': 'ie_pref'},
1611 'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
1612 'format_id': {'type': 'alias', 'field': 'id'},
1613 }
1614
1615 _order = []
1616
1617 def _get_field_setting(self, field, key):
1618 if field not in self.settings:
1619 self.settings[field] = {}
1620 propObj = self.settings[field]
1621 if key not in propObj:
1622 type = propObj.get('type')
1623 if key == 'field':
1624 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1625 elif key == 'convert':
1626 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1627 else:
1628 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1629 propObj[key] = default
1630 return propObj[key]
1631
1632 def _resolve_field_value(self, field, value, convertNone=False):
1633 if value is None:
1634 if not convertNone:
1635 return None
1636 else:
1637 value = value.lower()
1638 conversion = self._get_field_setting(field, 'convert')
1639 if conversion == 'ignore':
1640 return None
1641 if conversion == 'string':
1642 return value
1643 elif conversion == 'float_none':
1644 return float_or_none(value)
1645 elif conversion == 'bytes':
1646 return FileDownloader.parse_bytes(value)
1647 elif conversion == 'order':
1648 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1649 use_regex = self._get_field_setting(field, 'regex')
1650 list_length = len(order_list)
1651 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1652 if use_regex and value is not None:
1653 for i, regex in enumerate(order_list):
1654 if regex and re.match(regex, value):
1655 return list_length - i
1656 return list_length - empty_pos # not in list
1657 else: # not regex or value = None
1658 return list_length - (order_list.index(value) if value in order_list else empty_pos)
1659 else:
1660 if value.isnumeric():
1661 return float(value)
1662 else:
1663 self.settings[field]['convert'] = 'string'
1664 return value
1665
1666 def evaluate_params(self, params, sort_extractor):
1667 self._use_free_order = params.get('prefer_free_formats', False)
1668 self._sort_user = params.get('format_sort', [])
1669 self._sort_extractor = sort_extractor
1670
1671 def add_item(field, reverse, closest, limit_text):
1672 field = field.lower()
1673 if field in self._order:
1674 return
1675 self._order.append(field)
1676 limit = self._resolve_field_value(field, limit_text)
1677 data = {
1678 'reverse': reverse,
1679 'closest': False if limit is None else closest,
1680 'limit_text': limit_text,
1681 'limit': limit}
1682 if field in self.settings:
1683 self.settings[field].update(data)
1684 else:
1685 self.settings[field] = data
1686
1687 sort_list = (
1688 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1689 + (tuple() if params.get('format_sort_force', False)
1690 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1691 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1692
1693 for item in sort_list:
1694 match = re.match(self.regex, item)
1695 if match is None:
1696 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1697 field = match.group('field')
1698 if field is None:
1699 continue
1700 if self._get_field_setting(field, 'type') == 'alias':
1701 field = self._get_field_setting(field, 'field')
1702 reverse = match.group('reverse') is not None
1703 closest = match.group('separator') == '~'
1704 limit_text = match.group('limit')
1705
1706 has_limit = limit_text is not None
1707 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1708 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1709
1710 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1711 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
1712 limit_count = len(limits)
1713 for (i, f) in enumerate(fields):
1714 add_item(f, reverse, closest,
1715 limits[i] if i < limit_count
1716 else limits[0] if has_limit and not has_multiple_limits
1717 else None)
1718
1719 def print_verbose_info(self, write_debug):
1720 if self._sort_user:
1721 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1722 if self._sort_extractor:
1723 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1724 write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1725 '+' if self._get_field_setting(field, 'reverse') else '', field,
1726 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1727 self._get_field_setting(field, 'limit_text'),
1728 self._get_field_setting(field, 'limit'))
1729 if self._get_field_setting(field, 'limit_text') is not None else '')
1730 for field in self._order if self._get_field_setting(field, 'visible')]))
1731
1732 def _calculate_field_preference_from_value(self, format, field, type, value):
1733 reverse = self._get_field_setting(field, 'reverse')
1734 closest = self._get_field_setting(field, 'closest')
1735 limit = self._get_field_setting(field, 'limit')
1736
1737 if type == 'extractor':
1738 maximum = self._get_field_setting(field, 'max')
1739 if value is None or (maximum is not None and value >= maximum):
1740 value = -1
1741 elif type == 'boolean':
1742 in_list = self._get_field_setting(field, 'in_list')
1743 not_in_list = self._get_field_setting(field, 'not_in_list')
1744 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1745 elif type == 'ordered':
1746 value = self._resolve_field_value(field, value, True)
1747
1748 # try to convert to number
1749 val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1750 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1751 if is_num:
1752 value = val_num
1753
1754 return ((-10, 0) if value is None
1755 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
1756 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1757 else (0, value, 0) if not reverse and (limit is None or value <= limit)
1758 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1759 else (-1, value, 0))
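# For instance, a plain numeric field such as 'height' with no limit maps
# 720 to (0, 720, 0) and 1080 to (0, 1080, 0); since formats are kept
# sorted ascending (worst to best), the larger tuple ends up last, i.e. preferred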
1760
1761 def _calculate_field_preference(self, format, field):
1762 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
1763 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1764 if type == 'multiple':
1765 type = 'field' # Only 'field' is allowed in multiple for now
1766 actual_fields = self._get_field_setting(field, 'field')
1767
1768 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1769 else:
1770 value = get_value(field)
1771 return self._calculate_field_preference_from_value(format, field, type, value)
1772
1773 def calculate_preference(self, format):
1774 # Determine missing protocol
1775 if not format.get('protocol'):
1776 format['protocol'] = determine_protocol(format)
1777
1778 # Determine missing ext
1779 if not format.get('ext') and 'url' in format:
1780 format['ext'] = determine_ext(format['url'])
1781 if format.get('vcodec') == 'none':
1782 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1783 format['video_ext'] = 'none'
1784 else:
1785 format['video_ext'] = format['ext']
1786 format['audio_ext'] = 'none'
1787 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
1788 # format['preference'] = -1000
1789
1790 # Determine missing bitrates
1791 if format.get('tbr') is None:
1792 if format.get('vbr') is not None and format.get('abr') is not None:
1793 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1794 else:
1795 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1796 format['vbr'] = format.get('tbr') - format.get('abr', 0)
1797 if format.get('acodec') != 'none' and format.get('abr') is None:
1798 format['abr'] = format.get('tbr') - format.get('vbr', 0)
1799
1800 return tuple(self._calculate_field_preference(format, field) for field in self._order)
1801
1802 def _sort_formats(self, formats, field_preference=[]):
1803 if not formats:
1804 return
1805 format_sort = self.FormatSort() # params and to_screen are taken from the downloader
1806 format_sort.evaluate_params(self._downloader.params, field_preference)
1807 if self.get_param('verbose', False):
1808 format_sort.print_verbose_info(self._downloader.write_debug)
1809 formats.sort(key=lambda f: format_sort.calculate_preference(f))
1810
1811 def _check_formats(self, formats, video_id):
1812 if formats:
1813 formats[:] = filter(
1814 lambda f: self._is_valid_url(
1815 f['url'], video_id,
1816 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1817 formats)
1818
1819 @staticmethod
1820 def _remove_duplicate_formats(formats):
1821 format_urls = set()
1822 unique_formats = []
1823 for f in formats:
1824 if f['url'] not in format_urls:
1825 format_urls.add(f['url'])
1826 unique_formats.append(f)
1827 formats[:] = unique_formats
1828
1829 def _is_valid_url(self, url, video_id, item='video', headers={}):
1830 url = self._proto_relative_url(url, scheme='http:')
1831 # For now, assume non-HTTP(S) URLs are always valid
1832 if not (url.startswith('http://') or url.startswith('https://')):
1833 return True
1834 try:
1835 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1836 return True
1837 except ExtractorError as e:
1838 self.to_screen(
1839 '%s: %s URL is invalid, skipping: %s'
1840 % (video_id, item, error_to_compat_str(e.cause)))
1841 return False
1842
1843 def http_scheme(self):
1844 """ Either "http:" or "https:", depending on the user's preferences """
1845 return (
1846 'http:'
1847 if self.get_param('prefer_insecure', False)
1848 else 'https:')
1849
1850 def _proto_relative_url(self, url, scheme=None):
1851 if url is None:
1852 return url
1853 if url.startswith('//'):
1854 if scheme is None:
1855 scheme = self.http_scheme()
1856 return scheme + url
1857 else:
1858 return url
1859
1860 def _sleep(self, timeout, video_id, msg_template=None):
1861 if msg_template is None:
1862 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1863 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1864 self.to_screen(msg)
1865 time.sleep(timeout)
1866
1867 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1868 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1869 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1870 manifest = self._download_xml(
1871 manifest_url, video_id, 'Downloading f4m manifest',
1872 'Unable to download f4m manifest',
1873 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1874 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1875 transform_source=transform_source,
1876 fatal=fatal, data=data, headers=headers, query=query)
1877
1878 if manifest is False:
1879 return []
1880
1881 return self._parse_f4m_formats(
1882 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1883 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1884
1885 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1886 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1887 fatal=True, m3u8_id=None):
1888 if not isinstance(manifest, compat_etree_Element) and not fatal:
1889 return []
1890
1891 # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1892 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1893 if akamai_pv is not None and ';' in akamai_pv.text:
1894 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1895 if playerVerificationChallenge.strip() != '':
1896 return []
1897
1898 formats = []
1899 manifest_version = '1.0'
1900 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1901 if not media_nodes:
1902 manifest_version = '2.0'
1903 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1904 # Remove unsupported DRM protected media from final formats
1905 # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1906 media_nodes = remove_encrypted_media(media_nodes)
1907 if not media_nodes:
1908 return formats
1909
1910 manifest_base_url = get_base_url(manifest)
1911
1912 bootstrap_info = xpath_element(
1913 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1914 'bootstrap info', default=None)
1915
1916 vcodec = None
1917 mime_type = xpath_text(
1918 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1919 'mime type', default=None)
1920 if mime_type and mime_type.startswith('audio/'):
1921 vcodec = 'none'
1922
1923 for i, media_el in enumerate(media_nodes):
1924 tbr = int_or_none(media_el.attrib.get('bitrate'))
1925 width = int_or_none(media_el.attrib.get('width'))
1926 height = int_or_none(media_el.attrib.get('height'))
1927 format_id = join_nonempty(f4m_id, tbr or i)
1928 # If <bootstrapInfo> is present, the specified f4m is a
1929 # stream-level manifest, and only set-level manifests may refer to
1930 # external resources. See section 11.4 and section 4 of F4M spec
1931 if bootstrap_info is None:
1932 media_url = None
1933 # @href is introduced in 2.0, see section 11.6 of F4M spec
1934 if manifest_version == '2.0':
1935 media_url = media_el.attrib.get('href')
1936 if media_url is None:
1937 media_url = media_el.attrib.get('url')
1938 if not media_url:
1939 continue
1940 manifest_url = (
1941 media_url if media_url.startswith('http://') or media_url.startswith('https://')
1942 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1943 # If media_url is itself an f4m manifest, do the recursive extraction,
1944 # since bitrates in the parent manifest (this one) and in the media_url
1945 # manifest may differ, making it impossible to resolve the format by the
1946 # requested bitrate in the f4m downloader
1947 ext = determine_ext(manifest_url)
1948 if ext == 'f4m':
1949 f4m_formats = self._extract_f4m_formats(
1950 manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1951 transform_source=transform_source, fatal=fatal)
1952 # Sometimes a stream-level manifest contains a single media entry that
1953 # does not carry any quality metadata (e.g. http://matchtv.ru/#live-player).
1954 # At the same time, the parent's media entry in the set-level manifest may
1955 # contain it. We copy it from the parent in such cases.
1956 if len(f4m_formats) == 1:
1957 f = f4m_formats[0]
1958 f.update({
1959 'tbr': f.get('tbr') or tbr,
1960 'width': f.get('width') or width,
1961 'height': f.get('height') or height,
1962 'format_id': f.get('format_id') if not tbr else format_id,
1963 'vcodec': vcodec,
1964 })
1965 formats.extend(f4m_formats)
1966 continue
1967 elif ext == 'm3u8':
1968 formats.extend(self._extract_m3u8_formats(
1969 manifest_url, video_id, 'mp4', preference=preference,
1970 quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1971 continue
1972 formats.append({
1973 'format_id': format_id,
1974 'url': manifest_url,
1975 'manifest_url': manifest_url,
1976 'ext': 'flv' if bootstrap_info is not None else None,
1977 'protocol': 'f4m',
1978 'tbr': tbr,
1979 'width': width,
1980 'height': height,
1981 'vcodec': vcodec,
1982 'preference': preference,
1983 'quality': quality,
1984 })
1985 return formats
1986
1987 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1988 return {
1989 'format_id': join_nonempty(m3u8_id, 'meta'),
1990 'url': m3u8_url,
1991 'ext': ext,
1992 'protocol': 'm3u8',
1993 'preference': preference - 100 if preference else -100,
1994 'quality': quality,
1995 'resolution': 'multiple',
1996 'format_note': 'Quality selection URL',
1997 }
1998
1999 def _report_ignoring_subs(self, name):
2000 self.report_warning(bug_reports_message(
2001 f'Ignoring subtitle tracks found in the {name} manifest; '
2002 'if any subtitle tracks are missing,'
2003 ), only_once=True)
2004
2005 def _extract_m3u8_formats(self, *args, **kwargs):
2006 fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2007 if subs:
2008 self._report_ignoring_subs('HLS')
2009 return fmts
2010
2011 def _extract_m3u8_formats_and_subtitles(
2012 self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2013 preference=None, quality=None, m3u8_id=None, note=None,
2014 errnote=None, fatal=True, live=False, data=None, headers={},
2015 query={}):
2016
2017 res = self._download_webpage_handle(
2018 m3u8_url, video_id,
2019 note='Downloading m3u8 information' if note is None else note,
2020 errnote='Failed to download m3u8 information' if errnote is None else errnote,
2021 fatal=fatal, data=data, headers=headers, query=query)
2022
2023 if res is False:
2024 return [], {}
2025
2026 m3u8_doc, urlh = res
2027 m3u8_url = urlh.geturl()
2028
2029 return self._parse_m3u8_formats_and_subtitles(
2030 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2031 preference=preference, quality=quality, m3u8_id=m3u8_id,
2032 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2033 headers=headers, query=query, video_id=video_id)
2034
2035 def _parse_m3u8_formats_and_subtitles(
2036 self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8_native',
2037 preference=None, quality=None, m3u8_id=None, live=False, note=None,
2038 errnote=None, fatal=True, data=None, headers={}, query={},
2039 video_id=None):
2040 formats, subtitles = [], {}
2041
2042 has_drm = re.search('|'.join([
2043 r'#EXT-X-FAXS-CM:', # Adobe Flash Access
2044 r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay
2045 ]), m3u8_doc)
2046
2047 def format_url(url):
2048 return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
2049
2050 if self.get_param('hls_split_discontinuity', False):
2051 def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2052 if not m3u8_doc:
2053 if not manifest_url:
2054 return []
2055 m3u8_doc = self._download_webpage(
2056 manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2057 note=False, errnote='Failed to download m3u8 playlist information')
2058 if m3u8_doc is False:
2059 return []
2060 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2061
2062 else:
2063 def _extract_m3u8_playlist_indices(*args, **kwargs):
2064 return [None]
2065
2066 # References:
2067 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2068 # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2069 # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2070
2071 # We should try extracting formats only from master playlists [1, 4.3.4],
2072 # i.e. playlists that describe the available qualities. On the other hand,
2073 # media playlists [1, 4.3.3] should be returned as is since they contain
2074 # just the media, without quality renditions.
2075 # Fortunately, a master playlist can easily be distinguished from a media
2076 # playlist based on the availability of particular tags. As per [1, 4.3.3, 4.3.4]
2077 # master playlist tags MUST NOT appear in a media playlist and vice versa.
2078 # As per [1, 4.3.3.1] the #EXT-X-TARGETDURATION tag is REQUIRED for every
2079 # media playlist and MUST NOT appear in a master playlist, so we can
2080 # reliably detect a media playlist with this criterion.
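# A minimal illustration (synthetic, not from a real site): a master playlist
#     #EXTM3U
#     #EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="aud",NAME="English",URI="audio/en.m3u8"
#     #EXT-X-STREAM-INF:BANDWIDTH=1280000,RESOLUTION=640x360,AUDIO="aud"
#     video/360p.m3u8
# vs. a media playlist, which carries the segments themselves:
#     #EXTM3U
#     #EXT-X-TARGETDURATION:10
#     #EXTINF:9.009,
#     segment0.ts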
2081
2082 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
2083 formats = [{
2084 'format_id': join_nonempty(m3u8_id, idx),
2085 'format_index': idx,
2086 'url': m3u8_url,
2087 'ext': ext,
2088 'protocol': entry_protocol,
2089 'preference': preference,
2090 'quality': quality,
2091 'has_drm': has_drm,
2092 } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2093
2094 return formats, subtitles
2095
2096 groups = {}
2097 last_stream_inf = {}
2098
2099 def extract_media(x_media_line):
2100 media = parse_m3u8_attributes(x_media_line)
2101 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2102 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2103 if not (media_type and group_id and name):
2104 return
2105 groups.setdefault(group_id, []).append(media)
2106 # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2107 if media_type == 'SUBTITLES':
2108 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2109 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2110 # However, lack of URI has been spotted in the wild.
2111 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2112 if not media.get('URI'):
2113 return
2114 url = format_url(media['URI'])
2115 sub_info = {
2116 'url': url,
2117 'ext': determine_ext(url),
2118 }
2119 if sub_info['ext'] == 'm3u8':
2120 # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2121 # files may contain is WebVTT:
2122 # <https://tools.ietf.org/html/rfc8216#section-3.1>
2123 sub_info['ext'] = 'vtt'
2124 sub_info['protocol'] = 'm3u8_native'
2125 lang = media.get('LANGUAGE') or 'und'
2126 subtitles.setdefault(lang, []).append(sub_info)
2127 if media_type not in ('VIDEO', 'AUDIO'):
2128 return
2129 media_url = media.get('URI')
2130 if media_url:
2131 manifest_url = format_url(media_url)
2132 formats.extend({
2133 'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2134 'format_note': name,
2135 'format_index': idx,
2136 'url': manifest_url,
2137 'manifest_url': m3u8_url,
2138 'language': media.get('LANGUAGE'),
2139 'ext': ext,
2140 'protocol': entry_protocol,
2141 'preference': preference,
2142 'quality': quality,
2143 'vcodec': 'none' if media_type == 'AUDIO' else None,
2144 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2145
2146 def build_stream_name():
2147 # Although the specification does not mention a NAME attribute for
2148 # the EXT-X-STREAM-INF tag, it may still sometimes be present (see [1]
2149 # or the vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2150 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2151 stream_name = last_stream_inf.get('NAME')
2152 if stream_name:
2153 return stream_name
2154 # If there is no NAME in EXT-X-STREAM-INF, it will be obtained
2155 # from the corresponding rendition group
2156 stream_group_id = last_stream_inf.get('VIDEO')
2157 if not stream_group_id:
2158 return
2159 stream_group = groups.get(stream_group_id)
2160 if not stream_group:
2161 return stream_group_id
2162 rendition = stream_group[0]
2163 return rendition.get('NAME') or stream_group_id
2164
2165 # Parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have a
2166 # chance to detect video-only formats when EXT-X-STREAM-INF tags
2167 # precede EXT-X-MEDIA tags in the HLS manifest, as in [3].
2168 for line in m3u8_doc.splitlines():
2169 if line.startswith('#EXT-X-MEDIA:'):
2170 extract_media(line)
2171
2172 for line in m3u8_doc.splitlines():
2173 if line.startswith('#EXT-X-STREAM-INF:'):
2174 last_stream_inf = parse_m3u8_attributes(line)
2175 elif line.startswith('#') or not line.strip():
2176 continue
2177 else:
2178 tbr = float_or_none(
2179 last_stream_inf.get('AVERAGE-BANDWIDTH')
2180 or last_stream_inf.get('BANDWIDTH'), scale=1000)
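# e.g. BANDWIDTH=1280000 (bits/s) yields tbr = 1280.0 (kbit/s)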
2181 manifest_url = format_url(line.strip())
2182
2183 for idx in _extract_m3u8_playlist_indices(manifest_url):
2184 format_id = [m3u8_id, None, idx]
2185 # The bandwidth of live streams may differ over time, thus making
2186 # format_id unpredictable. So it's better to keep the provided
2187 # format_id intact.
2188 if not live:
2189 stream_name = build_stream_name()
2190 format_id[1] = stream_name or '%d' % (tbr or len(formats))
2191 f = {
2192 'format_id': join_nonempty(*format_id),
2193 'format_index': idx,
2194 'url': manifest_url,
2195 'manifest_url': m3u8_url,
2196 'tbr': tbr,
2197 'ext': ext,
2198 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2199 'protocol': entry_protocol,
2200 'preference': preference,
2201 'quality': quality,
2202 }
2203 resolution = last_stream_inf.get('RESOLUTION')
2204 if resolution:
2205 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2206 if mobj:
2207 f['width'] = int(mobj.group('width'))
2208 f['height'] = int(mobj.group('height'))
2209 # Unified Streaming Platform
2210 mobj = re.search(
2211 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2212 if mobj:
2213 abr, vbr = mobj.groups()
2214 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2215 f.update({
2216 'vbr': vbr,
2217 'abr': abr,
2218 })
2219 codecs = parse_codecs(last_stream_inf.get('CODECS'))
2220 f.update(codecs)
2221 audio_group_id = last_stream_inf.get('AUDIO')
2222 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2223 # references a rendition group MUST have a CODECS attribute.
2224 # However, this is not always respected; for example, [2]
2225 # contains an EXT-X-STREAM-INF tag which references an AUDIO
2226 # rendition group but does not have CODECS and, despite
2227 # referencing an audio group, represents a complete
2228 # (audio and video) format. So, for such cases we will
2229 # ignore references to rendition groups and treat them
2230 # as complete formats.
2231 if audio_group_id and codecs and f.get('vcodec') != 'none':
2232 audio_group = groups.get(audio_group_id)
2233 if audio_group and audio_group[0].get('URI'):
2234 # TODO: update acodec for audio only formats with
2235 # the same GROUP-ID
2236 f['acodec'] = 'none'
2237 if not f.get('ext'):
2238 f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2239 formats.append(f)
2240
2241 # for DailyMotion
2242 progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2243 if progressive_uri:
2244 http_f = f.copy()
2245 del http_f['manifest_url']
2246 http_f.update({
2247 'format_id': f['format_id'].replace('hls-', 'http-'),
2248 'protocol': 'http',
2249 'url': progressive_uri,
2250 })
2251 formats.append(http_f)
2252
2253 last_stream_inf = {}
2254 return formats, subtitles
2255
2256 def _extract_m3u8_vod_duration(
2257 self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2258
2259 m3u8_vod = self._download_webpage(
2260 m3u8_vod_url, video_id,
2261 note='Downloading m3u8 VOD manifest' if note is None else note,
2262 errnote='Failed to download VOD manifest' if errnote is None else errnote,
2263 fatal=False, data=data, headers=headers, query=query)
2264
2265 return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2266
2267 def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2268 if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2269 return None
2270
2271 return int(sum(
2272 float(line[len('#EXTINF:'):].split(',')[0])
2273 for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
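# e.g. a VOD playlist containing '#EXTINF:9.009,' and '#EXTINF:3.003,'
# yields int(9.009 + 3.003) = 12 (seconds)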
2274
2275 @staticmethod
2276 def _xpath_ns(path, namespace=None):
2277 if not namespace:
2278 return path
2279 out = []
2280 for c in path.split('/'):
2281 if not c or c == '.':
2282 out.append(c)
2283 else:
2284 out.append('{%s}%s' % (namespace, c))
2285 return '/'.join(out)
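# e.g. _xpath_ns('./head/meta', 'urn:x') == './{urn:x}head/{urn:x}meta'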
2286
2287 def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2288 smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2289
2290 if smil is False:
2291 assert not fatal
2292 return [], {}
2293
2294 namespace = self._parse_smil_namespace(smil)
2295
2296 fmts = self._parse_smil_formats(
2297 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2298 subs = self._parse_smil_subtitles(
2299 smil, namespace=namespace)
2300
2301 return fmts, subs
2302
2303 def _extract_smil_formats(self, *args, **kwargs):
2304 fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2305 if subs:
2306 self._report_ignoring_subs('SMIL')
2307 return fmts
2308
2309 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2310 smil = self._download_smil(smil_url, video_id, fatal=fatal)
2311 if smil is False:
2312 return {}
2313 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2314
2315 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2316 return self._download_xml(
2317 smil_url, video_id, 'Downloading SMIL file',
2318 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2319
2320 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2321 namespace = self._parse_smil_namespace(smil)
2322
2323 formats = self._parse_smil_formats(
2324 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2325 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2326
2327 video_id = os.path.splitext(url_basename(smil_url))[0]
2328 title = None
2329 description = None
2330 upload_date = None
2331 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2332 name = meta.attrib.get('name')
2333 content = meta.attrib.get('content')
2334 if not name or not content:
2335 continue
2336 if not title and name == 'title':
2337 title = content
2338 elif not description and name in ('description', 'abstract'):
2339 description = content
2340 elif not upload_date and name == 'date':
2341 upload_date = unified_strdate(content)
2342
2343 thumbnails = [{
2344 'id': image.get('type'),
2345 'url': image.get('src'),
2346 'width': int_or_none(image.get('width')),
2347 'height': int_or_none(image.get('height')),
2348 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2349
2350 return {
2351 'id': video_id,
2352 'title': title or video_id,
2353 'description': description,
2354 'upload_date': upload_date,
2355 'thumbnails': thumbnails,
2356 'formats': formats,
2357 'subtitles': subtitles,
2358 }
2359
2360 def _parse_smil_namespace(self, smil):
2361 return self._search_regex(
2362 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2363
2364 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2365 base = smil_url
2366 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2367 b = meta.get('base') or meta.get('httpBase')
2368 if b:
2369 base = b
2370 break
2371
2372 formats = []
2373 rtmp_count = 0
2374 http_count = 0
2375 m3u8_count = 0
2376 imgs_count = 0
2377
2378 srcs = set()
2379 media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2380 for medium in media:
2381 src = medium.get('src')
2382 if not src or src in srcs:
2383 continue
2384 srcs.add(src)
2385
2386 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2387 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2388 width = int_or_none(medium.get('width'))
2389 height = int_or_none(medium.get('height'))
2390 proto = medium.get('proto')
2391 ext = medium.get('ext')
2392 src_ext = determine_ext(src)
2393 streamer = medium.get('streamer') or base
2394
2395 if proto == 'rtmp' or streamer.startswith('rtmp'):
2396 rtmp_count += 1
2397 formats.append({
2398 'url': streamer,
2399 'play_path': src,
2400 'ext': 'flv',
2401 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2402 'tbr': bitrate,
2403 'filesize': filesize,
2404 'width': width,
2405 'height': height,
2406 })
2407 if transform_rtmp_url:
2408 streamer, src = transform_rtmp_url(streamer, src)
2409 formats[-1].update({
2410 'url': streamer,
2411 'play_path': src,
2412 })
2413 continue
2414
2415 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2416 src_url = src_url.strip()
2417
2418 if proto == 'm3u8' or src_ext == 'm3u8':
2419 m3u8_formats = self._extract_m3u8_formats(
2420 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2421 if len(m3u8_formats) == 1:
2422 m3u8_count += 1
2423 m3u8_formats[0].update({
2424 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2425 'tbr': bitrate,
2426 'width': width,
2427 'height': height,
2428 })
2429 formats.extend(m3u8_formats)
2430 elif src_ext == 'f4m':
2431 f4m_url = src_url
2432 if not f4m_params:
2433 f4m_params = {
2434 'hdcore': '3.2.0',
2435 'plugin': 'flowplayer-3.2.0.1',
2436 }
2437 f4m_url += '&' if '?' in f4m_url else '?'
2438 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2439 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2440 elif src_ext == 'mpd':
2441 formats.extend(self._extract_mpd_formats(
2442 src_url, video_id, mpd_id='dash', fatal=False))
2443 elif re.search(r'\.ism/[Mm]anifest', src_url):
2444 formats.extend(self._extract_ism_formats(
2445 src_url, video_id, ism_id='mss', fatal=False))
2446 elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2447 http_count += 1
2448 formats.append({
2449 'url': src_url,
2450 'ext': ext or src_ext or 'flv',
2451 'format_id': 'http-%d' % (bitrate or http_count),
2452 'tbr': bitrate,
2453 'filesize': filesize,
2454 'width': width,
2455 'height': height,
2456 })
2457
2458 for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2459 src = medium.get('src')
2460 if not src or src in srcs:
2461 continue
2462 srcs.add(src)
2463
2464 imgs_count += 1
2465 formats.append({
2466 'format_id': 'imagestream-%d' % (imgs_count),
2467 'url': src,
2468 'ext': mimetype2ext(medium.get('type')),
2469 'acodec': 'none',
2470 'vcodec': 'none',
2471 'width': int_or_none(medium.get('width')),
2472 'height': int_or_none(medium.get('height')),
2473 'format_note': 'SMIL storyboards',
2474 })
2475
2476 return formats
2477
2478 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2479 urls = []
2480 subtitles = {}
2481 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2482 src = textstream.get('src')
2483 if not src or src in urls:
2484 continue
2485 urls.append(src)
2486 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2487 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2488 subtitles.setdefault(lang, []).append({
2489 'url': src,
2490 'ext': ext,
2491 })
2492 return subtitles
2493
2494 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2495 xspf = self._download_xml(
2496 xspf_url, playlist_id, 'Downloading xspf playlist',
2497 'Unable to download xspf manifest', fatal=fatal)
2498 if xspf is False:
2499 return []
2500 return self._parse_xspf(
2501 xspf, playlist_id, xspf_url=xspf_url,
2502 xspf_base_url=base_url(xspf_url))
2503
2504 def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2505 NS_MAP = {
2506 'xspf': 'http://xspf.org/ns/0/',
2507 's1': 'http://static.streamone.nl/player/ns/0',
2508 }
2509
2510 entries = []
2511 for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2512 title = xpath_text(
2513 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2514 description = xpath_text(
2515 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2516 thumbnail = xpath_text(
2517 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2518 duration = float_or_none(
2519 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2520
2521 formats = []
2522 for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2523 format_url = urljoin(xspf_base_url, location.text)
2524 if not format_url:
2525 continue
2526 formats.append({
2527 'url': format_url,
2528 'manifest_url': xspf_url,
2529 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2530 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2531 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2532 })
2533 self._sort_formats(formats)
2534
2535 entries.append({
2536 'id': playlist_id,
2537 'title': title,
2538 'description': description,
2539 'thumbnail': thumbnail,
2540 'duration': duration,
2541 'formats': formats,
2542 })
2543 return entries
2544
2545 def _extract_mpd_formats(self, *args, **kwargs):
2546 fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2547 if subs:
2548 self._report_ignoring_subs('DASH')
2549 return fmts
2550
2551 def _extract_mpd_formats_and_subtitles(
2552 self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2553 fatal=True, data=None, headers={}, query={}):
2554 res = self._download_xml_handle(
2555 mpd_url, video_id,
2556 note='Downloading MPD manifest' if note is None else note,
2557 errnote='Failed to download MPD manifest' if errnote is None else errnote,
2558 fatal=fatal, data=data, headers=headers, query=query)
2559 if res is False:
2560 return [], {}
2561 mpd_doc, urlh = res
2562 if mpd_doc is None:
2563 return [], {}
2564 mpd_base_url = base_url(urlh.geturl())
2565
2566 return self._parse_mpd_formats_and_subtitles(
2567 mpd_doc, mpd_id, mpd_base_url, mpd_url)
2568
2569 def _parse_mpd_formats(self, *args, **kwargs):
2570 fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2571 if subs:
2572 self._report_ignoring_subs('DASH')
2573 return fmts
2574
2575 def _parse_mpd_formats_and_subtitles(
2576 self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2577 """
2578 Parse formats from MPD manifest.
2579 References:
2580 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2581 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2582 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2583 """
2584 if not self.get_param('dynamic_mpd', True):
2585 if mpd_doc.get('type') == 'dynamic':
2586 return [], {}
2587
2588 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2589
2590 def _add_ns(path):
2591 return self._xpath_ns(path, namespace)
2592
2593 def is_drm_protected(element):
2594 return element.find(_add_ns('ContentProtection')) is not None
2595
2596 def extract_multisegment_info(element, ms_parent_info):
2597 ms_info = ms_parent_info.copy()
2598
2599 # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2600 # common attributes and elements. We will only extract what is
2601 # relevant for us.
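# For illustration, a synthetic element such as
#     <SegmentTemplate timescale="1000" duration="4000" startNumber="1"
#                      initialization="init.m4s" media="seg-$Number$.m4s"/>
# contributes timescale=1000, segment_duration=4000.0 and start_number=1
# via extract_common below; @media and @initialization are picked up later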
2602 def extract_common(source):
2603 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2604 if segment_timeline is not None:
2605 s_e = segment_timeline.findall(_add_ns('S'))
2606 if s_e:
2607 ms_info['total_number'] = 0
2608 ms_info['s'] = []
2609 for s in s_e:
2610 r = int(s.get('r', 0))
2611 ms_info['total_number'] += 1 + r
2612 ms_info['s'].append({
2613 't': int(s.get('t', 0)),
2614 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2615 'd': int(s.attrib['d']),
2616 'r': r,
2617 })
2618 start_number = source.get('startNumber')
2619 if start_number:
2620 ms_info['start_number'] = int(start_number)
2621 timescale = source.get('timescale')
2622 if timescale:
2623 ms_info['timescale'] = int(timescale)
2624 segment_duration = source.get('duration')
2625 if segment_duration:
2626 ms_info['segment_duration'] = float(segment_duration)
2627
2628 def extract_Initialization(source):
2629 initialization = source.find(_add_ns('Initialization'))
2630 if initialization is not None:
2631 ms_info['initialization_url'] = initialization.attrib['sourceURL']
2632
2633 segment_list = element.find(_add_ns('SegmentList'))
2634 if segment_list is not None:
2635 extract_common(segment_list)
2636 extract_Initialization(segment_list)
2637 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2638 if segment_urls_e:
2639 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2640 else:
2641 segment_template = element.find(_add_ns('SegmentTemplate'))
2642 if segment_template is not None:
2643 extract_common(segment_template)
2644 media = segment_template.get('media')
2645 if media:
2646 ms_info['media'] = media
2647 initialization = segment_template.get('initialization')
2648 if initialization:
2649 ms_info['initialization'] = initialization
2650 else:
2651 extract_Initialization(segment_template)
2652 return ms_info
2653
2654 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2655 formats, subtitles = [], {}
2656 stream_numbers = collections.defaultdict(int)
2657 for period in mpd_doc.findall(_add_ns('Period')):
2658 period_duration = parse_duration(period.get('duration')) or mpd_duration
2659 period_ms_info = extract_multisegment_info(period, {
2660 'start_number': 1,
2661 'timescale': 1,
2662 })
2663 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2664 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2665 for representation in adaptation_set.findall(_add_ns('Representation')):
2666 representation_attrib = adaptation_set.attrib.copy()
2667 representation_attrib.update(representation.attrib)
2668 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2669 mime_type = representation_attrib['mimeType']
2670 content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2671
2672 codecs = representation_attrib.get('codecs', '')
2673 if content_type not in ('video', 'audio', 'text'):
2674 if mime_type == 'image/jpeg':
2675 content_type = mime_type
2676 elif codecs.split('.')[0] == 'stpp':
2677 content_type = 'text'
2678 elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2679 content_type = 'text'
2680 else:
2681 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2682 continue
2683
2684 base_url = ''
2685 for element in (representation, adaptation_set, period, mpd_doc):
2686 base_url_e = element.find(_add_ns('BaseURL'))
2687 if base_url_e is not None:
2688 base_url = base_url_e.text + base_url
2689 if re.match(r'^https?://', base_url):
2690 break
2691 if mpd_base_url and base_url.startswith('/'):
2692 base_url = compat_urlparse.urljoin(mpd_base_url, base_url)
2693 elif mpd_base_url and not re.match(r'^https?://', base_url):
2694 if not mpd_base_url.endswith('/'):
2695 mpd_base_url += '/'
2696 base_url = mpd_base_url + base_url
2697 representation_id = representation_attrib.get('id')
2698 lang = representation_attrib.get('lang')
2699 url_el = representation.find(_add_ns('BaseURL'))
2700 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2701 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2702 if representation_id is not None:
2703 format_id = representation_id
2704 else:
2705 format_id = content_type
2706 if mpd_id:
2707 format_id = mpd_id + '-' + format_id
2708 if content_type in ('video', 'audio'):
2709 f = {
2710 'format_id': format_id,
2711 'manifest_url': mpd_url,
2712 'ext': mimetype2ext(mime_type),
2713 'width': int_or_none(representation_attrib.get('width')),
2714 'height': int_or_none(representation_attrib.get('height')),
2715 'tbr': float_or_none(bandwidth, 1000),
2716 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2717 'fps': int_or_none(representation_attrib.get('frameRate')),
2718 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2719 'format_note': 'DASH %s' % content_type,
2720 'filesize': filesize,
2721 'container': mimetype2ext(mime_type) + '_dash',
2722 }
2723 f.update(parse_codecs(codecs))
2724 elif content_type == 'text':
2725 f = {
2726 'ext': mimetype2ext(mime_type),
2727 'manifest_url': mpd_url,
2728 'filesize': filesize,
2729 }
2730 elif content_type == 'image/jpeg':
2731 # See test case in VikiIE
2732 # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2733 f = {
2734 'format_id': format_id,
2735 'ext': 'mhtml',
2736 'manifest_url': mpd_url,
2737 'format_note': 'DASH storyboards (jpeg)',
2738 'acodec': 'none',
2739 'vcodec': 'none',
2740 }
2741 if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2742 f['has_drm'] = True
2743 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2744
2745 def prepare_template(template_name, identifiers):
2746 tmpl = representation_ms_info[template_name]
2747 # First of all, % characters outside $...$ templates
2748 # must be escaped by doubling for proper processing
2749 # by the % string-formatting operator used further on (see
2750 # https://github.com/ytdl-org/youtube-dl/issues/16867).
2751 t = ''
2752 in_template = False
2753 for c in tmpl:
2754 t += c
2755 if c == '$':
2756 in_template = not in_template
2757 elif c == '%' and not in_template:
2758 t += c
2759 # Next, $...$ templates are translated to their
2760 # %(...) counterparts to be used with the % operator
2761 if representation_id is not None:
2762 t = t.replace('$RepresentationID$', representation_id)
2763 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2764 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2765 t = t.replace('$$', '$')  # unescape literal '$$' to '$'
2766 return t
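# Worked examples (with a hypothetical representation_id 'video1'):
#     'seg-$Number%05d$.m4s'          -> 'seg-%(Number)05d.m4s'
#     '$RepresentationID$/$Time$.m4s' -> 'video1/%(Time)d.m4s'
# so that 'seg-%(Number)05d.m4s' % {'Number': 42} == 'seg-00042.m4s'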
2767
2768 # @initialization is a regular template like the @media one,
2769 # so it should be handled in just the same way (see
2770 # https://github.com/ytdl-org/youtube-dl/issues/11605)
2771 if 'initialization' in representation_ms_info:
2772 initialization_template = prepare_template(
2773 'initialization',
2774 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2775 # $Time$ shall not be included for @initialization thus
2776 # only $Bandwidth$ remains
2777 ('Bandwidth', ))
2778 representation_ms_info['initialization_url'] = initialization_template % {
2779 'Bandwidth': bandwidth,
2780 }
2781
2782 def location_key(location):
2783 return 'url' if re.match(r'^https?://', location) else 'path'
2784
2785 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2786
2787 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2788 media_location_key = location_key(media_template)
2789
2790 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2791 # can't be used at the same time
2792 if '%(Number' in media_template and 's' not in representation_ms_info:
2793 segment_duration = None
2794 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2795 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2796 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
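# e.g. a 60 s period split into 4 s segments gives
# total_number = ceil(60 / 4) = 15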
2797 representation_ms_info['fragments'] = [{
2798 media_location_key: media_template % {
2799 'Number': segment_number,
2800 'Bandwidth': bandwidth,
2801 },
2802 'duration': segment_duration,
2803 } for segment_number in range(
2804 representation_ms_info['start_number'],
2805 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2806 else:
2807 # $Number*$ or $Time$ in media template with S list available
2808 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2809 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
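# e.g. <S t="0" d="4000" r="2"/> with timescale 1000 expands below into
# three 4-second fragments starting at t=0, 4000 and 8000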
2810 representation_ms_info['fragments'] = []
2811 segment_time = 0
2812 segment_d = None
2813 segment_number = representation_ms_info['start_number']
2814
2815 def add_segment_url():
2816 segment_url = media_template % {
2817 'Time': segment_time,
2818 'Bandwidth': bandwidth,
2819 'Number': segment_number,
2820 }
2821 representation_ms_info['fragments'].append({
2822 media_location_key: segment_url,
2823 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2824 })
2825
2826 for num, s in enumerate(representation_ms_info['s']):
2827 segment_time = s.get('t') or segment_time
2828 segment_d = s['d']
2829 add_segment_url()
2830 segment_number += 1
2831 for r in range(s.get('r', 0)):
2832 segment_time += segment_d
2833 add_segment_url()
2834 segment_number += 1
2835 segment_time += segment_d
2836 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2837 # No media template
2838 # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2839 # or any YouTube dashsegments video
2840 fragments = []
2841 segment_index = 0
2842 timescale = representation_ms_info['timescale']
2843 for s in representation_ms_info['s']:
2844 duration = float_or_none(s['d'], timescale)
2845 for r in range(s.get('r', 0) + 1):
2846 segment_uri = representation_ms_info['segment_urls'][segment_index]
2847 fragments.append({
2848 location_key(segment_uri): segment_uri,
2849 'duration': duration,
2850 })
2851 segment_index += 1
2852 representation_ms_info['fragments'] = fragments
2853 elif 'segment_urls' in representation_ms_info:
2854 # Segment URLs with no SegmentTimeline
2855 # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2856 # https://github.com/ytdl-org/youtube-dl/pull/14844
2857 fragments = []
2858 segment_duration = float_or_none(
2859 representation_ms_info['segment_duration'],
2860 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2861 for segment_url in representation_ms_info['segment_urls']:
2862 fragment = {
2863 location_key(segment_url): segment_url,
2864 }
2865 if segment_duration:
2866 fragment['duration'] = segment_duration
2867 fragments.append(fragment)
2868 representation_ms_info['fragments'] = fragments
2869 # If there is a fragments key available then we correctly recognized fragmented media.
2870 # Otherwise we will assume unfragmented media with direct access. Technically,
2871 # such an assumption is not necessarily correct since we may simply have no support
2872 # for some forms of fragmented media renditions yet, but for now we'll use this fallback.
2873 if 'fragments' in representation_ms_info:
2874 f.update({
2875 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2876 'url': mpd_url or base_url,
2877 'fragment_base_url': base_url,
2878 'fragments': [],
2879 'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2880 })
2881 if 'initialization_url' in representation_ms_info:
2882 initialization_url = representation_ms_info['initialization_url']
2883 if not f.get('url'):
2884 f['url'] = initialization_url
2885 f['fragments'].append({location_key(initialization_url): initialization_url})
2886 f['fragments'].extend(representation_ms_info['fragments'])
2887 else:
2888 # Assuming direct URL to unfragmented media.
2889 f['url'] = base_url
2890 if content_type in ('video', 'audio', 'image/jpeg'):
2891 f['manifest_stream_number'] = stream_numbers[f['url']]
2892 stream_numbers[f['url']] += 1
2893 formats.append(f)
2894 elif content_type == 'text':
2895 subtitles.setdefault(lang or 'und', []).append(f)
2896
2897 return formats, subtitles
2898
2899 def _extract_ism_formats(self, *args, **kwargs):
2900 fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2901 if subs:
2902 self._report_ignoring_subs('ISM')
2903 return fmts
2904
2905 def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2906 res = self._download_xml_handle(
2907 ism_url, video_id,
2908 note='Downloading ISM manifest' if note is None else note,
2909 errnote='Failed to download ISM manifest' if errnote is None else errnote,
2910 fatal=fatal, data=data, headers=headers, query=query)
2911 if res is False:
2912 return [], {}
2913 ism_doc, urlh = res
2914 if ism_doc is None:
2915 return [], {}
2916
2917 return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
2918
2919 def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
2920 """
2921 Parse formats from ISM manifest.
2922 References:
2923 1. [MS-SSTR]: Smooth Streaming Protocol,
2924 https://msdn.microsoft.com/en-us/library/ff469518.aspx
2925 """
2926 if ism_doc.get('IsLive') == 'TRUE':
2927 return [], {}
2928
2929 duration = int(ism_doc.attrib['Duration'])
2930 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2931
2932 formats = []
2933 subtitles = {}
2934 for stream in ism_doc.findall('StreamIndex'):
2935 stream_type = stream.get('Type')
2936 if stream_type not in ('video', 'audio', 'text'):
2937 continue
2938 url_pattern = stream.attrib['Url']
2939 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2940 stream_name = stream.get('Name')
2941 stream_language = stream.get('Language', 'und')
2942 for track in stream.findall('QualityLevel'):
2943 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
2944 # TODO: add support for WVC1 and WMAP
2945 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
2946 self.report_warning('%s is not a supported codec' % fourcc)
2947 continue
2948 tbr = int(track.attrib['Bitrate']) // 1000
2949 # [1] does not mention Width and Height attributes. However,
2950 # they're often present while MaxWidth and MaxHeight are
2951 # missing, so they should be used as fallbacks
2952 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2953 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2954 sampling_rate = int_or_none(track.get('SamplingRate'))
2955
2956 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2957 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2958
2959 fragments = []
2960 fragment_ctx = {
2961 'time': 0,
2962 }
2963 stream_fragments = stream.findall('c')
2964 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2965 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2966 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2967 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2968 if not fragment_ctx['duration']:
2969 try:
2970 next_fragment_time = int(stream_fragments[stream_fragment_index + 1].attrib['t'])
2971 except IndexError:
2972 next_fragment_time = duration
2973 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2974 for _ in range(fragment_repeat):
2975 fragments.append({
2976 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2977 'duration': fragment_ctx['duration'] / stream_timescale,
2978 })
2979 fragment_ctx['time'] += fragment_ctx['duration']
2980
2981 if stream_type == 'text':
2982 subtitles.setdefault(stream_language, []).append({
2983 'ext': 'ismt',
2984 'protocol': 'ism',
2985 'url': ism_url,
2986 'manifest_url': ism_url,
2987 'fragments': fragments,
2988 '_download_params': {
2989 'stream_type': stream_type,
2990 'duration': duration,
2991 'timescale': stream_timescale,
2992 'fourcc': fourcc,
2993 'language': stream_language,
2994 'codec_private_data': track.get('CodecPrivateData'),
2995 }
2996 })
2997 elif stream_type in ('video', 'audio'):
2998 formats.append({
2999 'format_id': join_nonempty(ism_id, stream_name, tbr),
3000 'url': ism_url,
3001 'manifest_url': ism_url,
3002 'ext': 'ismv' if stream_type == 'video' else 'isma',
3003 'width': width,
3004 'height': height,
3005 'tbr': tbr,
3006 'asr': sampling_rate,
3007 'vcodec': 'none' if stream_type == 'audio' else fourcc,
3008 'acodec': 'none' if stream_type == 'video' else fourcc,
3009 'protocol': 'ism',
3010 'fragments': fragments,
3011 'has_drm': ism_doc.find('Protection') is not None,
3012 '_download_params': {
3013 'stream_type': stream_type,
3014 'duration': duration,
3015 'timescale': stream_timescale,
3016 'width': width or 0,
3017 'height': height or 0,
3018 'fourcc': fourcc,
3019 'language': stream_language,
3020 'codec_private_data': track.get('CodecPrivateData'),
3021 'sampling_rate': sampling_rate,
3022 'channels': int_or_none(track.get('Channels', 2)),
3023 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3024 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3025 },
3026 })
3027 return formats, subtitles
3028
3029 def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
3030 def absolute_url(item_url):
3031 return urljoin(base_url, item_url)
3032
3033 def parse_content_type(content_type):
3034 if not content_type:
3035 return {}
3036 ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3037 if ctr:
3038 mimetype, codecs = ctr.groups()
3039 f = parse_codecs(codecs)
3040 f['ext'] = mimetype2ext(mimetype)
3041 return f
3042 return {}
3043
3044 def _media_formats(src, cur_media_type, type_info={}):
3045 full_url = absolute_url(src)
3046 ext = type_info.get('ext') or determine_ext(full_url)
3047 if ext == 'm3u8':
3048 is_plain_url = False
3049 formats = self._extract_m3u8_formats(
3050 full_url, video_id, ext='mp4',
3051 entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3052 preference=preference, quality=quality, fatal=False)
3053 elif ext == 'mpd':
3054 is_plain_url = False
3055 formats = self._extract_mpd_formats(
3056 full_url, video_id, mpd_id=mpd_id, fatal=False)
3057 else:
3058 is_plain_url = True
3059 formats = [{
3060 'url': full_url,
3061 'vcodec': 'none' if cur_media_type == 'audio' else None,
3062 }]
3063 return is_plain_url, formats
3064
3065 entries = []
3066 # amp-video and amp-audio are very similar to their HTML5 counterparts
3067 # so we will include them right here (see
3068 # https://www.ampproject.org/docs/reference/components/amp-video)
3069 # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3070 _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
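# With the optional prefixes, this pattern matches plain <video>/<audio>
# tags as well as their <amp-video>, <amp-audio>, <dl8-video>,
# <dl8-audio>, <dl8-live-video> and <dl8-live-audio> variants.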
3071 media_tags = [(media_tag, media_tag_name, media_type, '')
3072 for media_tag, media_tag_name, media_type
3073 in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3074 media_tags.extend(re.findall(
3075 # We only allow video|audio followed by a whitespace or '>'.
3076 # Allowing more characters may end up in significant slow down (see
3077 # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
3078 # http://www.porntrex.com/maps/videositemap.xml).
3079 r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3080 for media_tag, _, media_type, media_content in media_tags:
3081 media_info = {
3082 'formats': [],
3083 'subtitles': {},
3084 }
3085 media_attributes = extract_attributes(media_tag)
3086 src = strip_or_none(media_attributes.get('src'))
3087 if src:
3088 _, formats = _media_formats(src, media_type)
3089 media_info['formats'].extend(formats)
3090 media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3091 if media_content:
3092 for source_tag in re.findall(r'<source[^>]+>', media_content):
3093 s_attr = extract_attributes(source_tag)
3094 # data-video-src and data-src are non-standard but seen
3095 # several times in the wild
3096 src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
3097 if not src:
3098 continue
3099 f = parse_content_type(s_attr.get('type'))
3100 is_plain_url, formats = _media_formats(src, media_type, f)
3101 if is_plain_url:
3102 # width, height, res, label and title attributes are
3103 # all non-standard but seen several times in the wild
3104 labels = [
3105 s_attr.get(lbl)
3106 for lbl in ('label', 'title')
3107 if str_or_none(s_attr.get(lbl))
3108 ]
3109 width = int_or_none(s_attr.get('width'))
3110 height = (int_or_none(s_attr.get('height'))
3111 or int_or_none(s_attr.get('res')))
3112 if not width or not height:
3113 for lbl in labels:
3114 resolution = parse_resolution(lbl)
3115 if not resolution:
3116 continue
3117 width = width or resolution.get('width')
3118 height = height or resolution.get('height')
3119 for lbl in labels:
3120 tbr = parse_bitrate(lbl)
3121 if tbr:
3122 break
3123 else:
3124 tbr = None
3125 f.update({
3126 'width': width,
3127 'height': height,
3128 'tbr': tbr,
3129 'format_id': s_attr.get('label') or s_attr.get('title'),
3130 })
3131 f.update(formats[0])
3132 media_info['formats'].append(f)
3133 else:
3134 media_info['formats'].extend(formats)
3135 for track_tag in re.findall(r'<track[^>]+>', media_content):
3136 track_attributes = extract_attributes(track_tag)
3137 kind = track_attributes.get('kind')
3138 if not kind or kind in ('subtitles', 'captions'):
3139 src = strip_or_none(track_attributes.get('src'))
3140 if not src:
3141 continue
3142 lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3143 media_info['subtitles'].setdefault(lang, []).append({
3144 'url': absolute_url(src),
3145 })
3146 for f in media_info['formats']:
3147 f.setdefault('http_headers', {})['Referer'] = base_url
3148 if media_info['formats'] or media_info['subtitles']:
3149 entries.append(media_info)
3150 return entries
3151
3152 def _extract_akamai_formats(self, *args, **kwargs):
3153 fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3154 if subs:
3155 self._report_ignoring_subs('akamai')
3156 return fmts
3157
3158 def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3159 signed = 'hdnea=' in manifest_url
3160 if not signed:
3161 # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3162 manifest_url = re.sub(
3163 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3164 '', manifest_url).strip('?')
3165
3166 formats = []
3167 subtitles = {}
3168
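# Akamai HDS and HLS manifests conventionally share the same path,
# differing only in the /z/ vs /i/ prefix and the manifest name, so
# each URL can be derived from the other; e.g. a hypothetical
#   https://example-host/i/video_,400,800,.mp4.csmil/master.m3u8
# corresponds to
#   https://example-host/z/video_,400,800,.mp4.csmil/manifest.f4m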
3169 hdcore_sign = 'hdcore=3.7.0'
3170 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3171 hds_host = hosts.get('hds')
3172 if hds_host:
3173 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3174 if 'hdcore=' not in f4m_url:
3175 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3176 f4m_formats = self._extract_f4m_formats(
3177 f4m_url, video_id, f4m_id='hds', fatal=False)
3178 for entry in f4m_formats:
3179 entry.update({'extra_param_to_segment_url': hdcore_sign})
3180 formats.extend(f4m_formats)
3181
3182 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3183 hls_host = hosts.get('hls')
3184 if hls_host:
3185 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3186 m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3187 m3u8_url, video_id, 'mp4', 'm3u8_native',
3188 m3u8_id='hls', fatal=False)
3189 formats.extend(m3u8_formats)
3190 subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3191
3192 http_host = hosts.get('http')
3193 if http_host and m3u8_formats and not signed:
3194 REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3195 qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3196 qualities_length = len(qualities)
3197 if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3198 i = 0
3199 for f in m3u8_formats:
3200 if f['vcodec'] != 'none':
3201 for protocol in ('http', 'https'):
3202 http_f = f.copy()
3203 del http_f['manifest_url']
3204 http_url = re.sub(
3205 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
3206 http_f.update({
3207 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3208 'url': http_url,
3209 'protocol': protocol,
3210 })
3211 formats.append(http_f)
3212 i += 1
3213
3214 return formats, subtitles
3215
3216 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3217 query = compat_urlparse.urlparse(url).query
3218 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3219 mobj = re.search(
3220 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3221 url_base = mobj.group('url')
3222 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3223 formats = []
3224
3225 def manifest_url(manifest):
3226 m_url = '%s/%s' % (http_base_url, manifest)
3227 if query:
3228 m_url += '?%s' % query
3229 return m_url
3230
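# Wowza exposes the same stream over several protocols under a common
# base URL; e.g. for a hypothetical
#   http://example-host/app/mp4:video.mp4
# we probe playlist.m3u8 (HLS), manifest.f4m (HDS) and manifest.mpd
# (DASH) below, plus rtmp/rtsp variants of the bare base URL.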
3231 if 'm3u8' not in skip_protocols:
3232 formats.extend(self._extract_m3u8_formats(
3233 manifest_url('playlist.m3u8'), video_id, 'mp4',
3234 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3235 if 'f4m' not in skip_protocols:
3236 formats.extend(self._extract_f4m_formats(
3237 manifest_url('manifest.f4m'),
3238 video_id, f4m_id='hds', fatal=False))
3239 if 'dash' not in skip_protocols:
3240 formats.extend(self._extract_mpd_formats(
3241 manifest_url('manifest.mpd'),
3242 video_id, mpd_id='dash', fatal=False))
3243 if re.search(r'(?:/smil:|\.smil)', url_base):
3244 if 'smil' not in skip_protocols:
3245 rtmp_formats = self._extract_smil_formats(
3246 manifest_url('jwplayer.smil'),
3247 video_id, fatal=False)
3248 for rtmp_format in rtmp_formats:
3249 rtsp_format = rtmp_format.copy()
3250 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3251 del rtsp_format['play_path']
3252 del rtsp_format['ext']
3253 rtsp_format.update({
3254 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3255 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3256 'protocol': 'rtsp',
3257 })
3258 formats.extend([rtmp_format, rtsp_format])
3259 else:
3260 for protocol in ('rtmp', 'rtsp'):
3261 if protocol not in skip_protocols:
3262 formats.append({
3263 'url': '%s:%s' % (protocol, url_base),
3264 'format_id': protocol,
3265 'protocol': protocol,
3266 })
3267 return formats
3268
3269 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3270 mobj = re.search(
3271 r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3272 webpage)
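# e.g. this captures the options object in a hypothetical embed like:
#   jwplayer("player").setup({"playlist": [{"sources": [...]}]});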
3273 if mobj:
3274 try:
3275 jwplayer_data = self._parse_json(mobj.group('options'),
3276 video_id=video_id,
3277 transform_source=transform_source)
3278 except ExtractorError:
3279 pass
3280 else:
3281 if isinstance(jwplayer_data, dict):
3282 return jwplayer_data
3283
3284 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3285 jwplayer_data = self._find_jwplayer_data(
3286 webpage, video_id, transform_source=js_to_json)
3287 return self._parse_jwplayer_data(
3288 jwplayer_data, video_id, *args, **kwargs)
3289
3290 def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3291 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3292 # JWPlayer backward compatibility: flattened playlists
3293 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3294 if 'playlist' not in jwplayer_data:
3295 jwplayer_data = {'playlist': [jwplayer_data]}
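# e.g. a hypothetical legacy config like
#   {'file': 'video.mp4', 'title': 'Example'}
# is normalized here to
#   {'playlist': [{'file': 'video.mp4', 'title': 'Example'}]}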
3296
3297 entries = []
3298
3299 # JWPlayer backward compatibility: single playlist item
3300 # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3301 if not isinstance(jwplayer_data['playlist'], list):
3302 jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3303
3304 for video_data in jwplayer_data['playlist']:
3305 # JWPlayer backward compatibility: flattened sources
3306 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3307 if 'sources' not in video_data:
3308 video_data['sources'] = [video_data]
3309
3310 this_video_id = video_id or video_data['mediaid']
3311
3312 formats = self._parse_jwplayer_formats(
3313 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3314 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3315
3316 subtitles = {}
3317 tracks = video_data.get('tracks')
3318 if tracks and isinstance(tracks, list):
3319 for track in tracks:
3320 if not isinstance(track, dict):
3321 continue
3322 track_kind = track.get('kind')
3323 if not track_kind or not isinstance(track_kind, compat_str):
3324 continue
3325 if track_kind.lower() not in ('captions', 'subtitles'):
3326 continue
3327 track_url = urljoin(base_url, track.get('file'))
3328 if not track_url:
3329 continue
3330 subtitles.setdefault(track.get('label') or 'en', []).append({
3331 'url': self._proto_relative_url(track_url)
3332 })
3333
3334 entry = {
3335 'id': this_video_id,
3336 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3337 'description': clean_html(video_data.get('description')),
3338 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3339 'timestamp': int_or_none(video_data.get('pubdate')),
3340 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3341 'subtitles': subtitles,
3342 }
3343 # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3344 if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3345 entry.update({
3346 '_type': 'url_transparent',
3347 'url': formats[0]['url'],
3348 })
3349 else:
3350 self._sort_formats(formats)
3351 entry['formats'] = formats
3352 entries.append(entry)
3353 if len(entries) == 1:
3354 return entries[0]
3355 else:
3356 return self.playlist_result(entries)
3357
3358 def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3359 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3360 urls = []
3361 formats = []
3362 for source in jwplayer_sources_data:
3363 if not isinstance(source, dict):
3364 continue
3365 source_url = urljoin(
3366 base_url, self._proto_relative_url(source.get('file')))
3367 if not source_url or source_url in urls:
3368 continue
3369 urls.append(source_url)
3370 source_type = source.get('type') or ''
3371 ext = mimetype2ext(source_type) or determine_ext(source_url)
3372 if source_type == 'hls' or ext == 'm3u8':
3373 formats.extend(self._extract_m3u8_formats(
3374 source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3375 m3u8_id=m3u8_id, fatal=False))
3376 elif source_type == 'dash' or ext == 'mpd':
3377 formats.extend(self._extract_mpd_formats(
3378 source_url, video_id, mpd_id=mpd_id, fatal=False))
3379 elif ext == 'smil':
3380 formats.extend(self._extract_smil_formats(
3381 source_url, video_id, fatal=False))
3382 # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3383 elif source_type.startswith('audio') or ext in (
3384 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3385 formats.append({
3386 'url': source_url,
3387 'vcodec': 'none',
3388 'ext': ext,
3389 })
3390 else:
3391 height = int_or_none(source.get('height'))
3392 if height is None:
3393 # Often no height is provided but there is a label in
3394 # a format like "1080p", "720p SD", or 1080.
3395 height = int_or_none(self._search_regex(
3396 r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3397 'height', default=None))
3398 a_format = {
3399 'url': source_url,
3400 'width': int_or_none(source.get('width')),
3401 'height': height,
3402 'tbr': int_or_none(source.get('bitrate')),
3403 'ext': ext,
3404 }
3405 if source_url.startswith('rtmp'):
3406 a_format['ext'] = 'flv'
3407 # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3408 # of jwplayer.flash.swf
3409 rtmp_url_parts = re.split(
3410 r'((?:mp4|mp3|flv):)', source_url, 1)
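# e.g. a hypothetical 'rtmp://example-host/app/mp4:video.mp4' splits into
# ['rtmp://example-host/app/', 'mp4:', 'video.mp4'], giving the bare RTMP
# URL plus a 'mp4:video.mp4' play path.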
3411 if len(rtmp_url_parts) == 3:
3412 rtmp_url, prefix, play_path = rtmp_url_parts
3413 a_format.update({
3414 'url': rtmp_url,
3415 'play_path': prefix + play_path,
3416 })
3417 if rtmp_params:
3418 a_format.update(rtmp_params)
3419 formats.append(a_format)
3420 return formats
3421
3422 def _live_title(self, name):
3423 """ Generate the title for a live video """
3424 now = datetime.datetime.now()
3425 now_str = now.strftime('%Y-%m-%d %H:%M')
3426 return name + ' ' + now_str
3427
3428 def _int(self, v, name, fatal=False, **kwargs):
3429 res = int_or_none(v, **kwargs)
3432 if res is None:
3433 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3434 if fatal:
3435 raise ExtractorError(msg)
3436 else:
3437 self.report_warning(msg)
3438 return res
3439
3440 def _float(self, v, name, fatal=False, **kwargs):
3441 res = float_or_none(v, **kwargs)
3442 if res is None:
3443 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3444 if fatal:
3445 raise ExtractorError(msg)
3446 else:
3447 self.report_warning(msg)
3448 return res
3449
3450 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3451 path='/', secure=False, discard=False, rest={}, **kwargs):
3452 cookie = compat_cookiejar_Cookie(
3453 0, name, value, port, port is not None, domain, True,
3454 domain.startswith('.'), path, True, secure, expire_time,
3455 discard, None, None, rest)
3456 self._downloader.cookiejar.set_cookie(cookie)
3457
3458 def _get_cookies(self, url):
3459 """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3460 req = sanitized_Request(url)
3461 self._downloader.cookiejar.add_cookie_header(req)
3462 return compat_cookies_SimpleCookie(req.get_header('Cookie'))
3463
3464 def _apply_first_set_cookie_header(self, url_handle, cookie):
3465 """
3466 Apply first Set-Cookie header instead of the last. Experimental.
3467
3468 Some sites (e.g. [1-3]) may serve two cookies under the same name
3469 in the Set-Cookie header and expect the first (old) one to be set
3470 rather than the second (new) one. However, per RFC 6265 the newer
3471 cookie should be set into the cookie store, which is what actually
3472 happens. We work around this issue by manually resetting the cookie
3473 to the first one.
3474 1. https://new.vk.com/
3475 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3476 3. https://learning.oreilly.com/
3477 """
3478 for header, cookies in url_handle.headers.items():
3479 if header.lower() != 'set-cookie':
3480 continue
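# On Python 3, http.client decodes header values as latin-1
# (ISO-8859-1) while sites commonly send UTF-8; re-encoding as latin-1
# and decoding as UTF-8 recovers the original text (assuming the header
# was in fact UTF-8 encoded).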
3481 if sys.version_info[0] >= 3:
3482 cookies = cookies.encode('iso-8859-1')
3483 cookies = cookies.decode('utf-8')
3484 cookie_value = re.search(
3485 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3486 if cookie_value:
3487 value, domain = cookie_value.groups()
3488 self._set_cookie(domain, cookie, value)
3489 break
3490
3491 def get_testcases(self, include_onlymatching=False):
3492 t = getattr(self, '_TEST', None)
3493 if t:
3494 assert not hasattr(self, '_TESTS'), \
3495 '%s has _TEST and _TESTS' % type(self).__name__
3496 tests = [t]
3497 else:
3498 tests = getattr(self, '_TESTS', [])
3499 for t in tests:
3500 if not include_onlymatching and t.get('only_matching', False):
3501 continue
3502 t['name'] = type(self).__name__[:-len('IE')]
3503 yield t
3504
3505 def is_suitable(self, age_limit):
3506 """ Test whether the extractor is generally suitable for the given
3507 age limit (i.e. pornographic sites are not, all others usually are) """
3508
3509 any_restricted = False
3510 for tc in self.get_testcases(include_onlymatching=False):
3511 if tc.get('playlist', []):
3512 tc = tc['playlist'][0]
3513 is_restricted = age_restricted(
3514 tc.get('info_dict', {}).get('age_limit'), age_limit)
3515 if not is_restricted:
3516 return True
3517 any_restricted = any_restricted or is_restricted
3518 return not any_restricted
3519
3520 def extract_subtitles(self, *args, **kwargs):
3521 if (self.get_param('writesubtitles', False)
3522 or self.get_param('listsubtitles')):
3523 return self._get_subtitles(*args, **kwargs)
3524 return {}
3525
3526 def _get_subtitles(self, *args, **kwargs):
3527 raise NotImplementedError('This method must be implemented by subclasses')
3528
3529 def extract_comments(self, *args, **kwargs):
3530 if not self.get_param('getcomments'):
3531 return None
3532 generator = self._get_comments(*args, **kwargs)
3533
3534 def extractor():
3535 comments = []
3536 try:
3537 while True:
3538 comments.append(next(generator))
3539 except KeyboardInterrupt:
3540 interrupted = True
3541 self.to_screen('Interrupted by user')
3542 except StopIteration:
3543 interrupted = False
3544 comment_count = len(comments)
3545 self.to_screen(f'Extracted {comment_count} comments')
3546 return {
3547 'comments': comments,
3548 'comment_count': None if interrupted else comment_count
3549 }
3550 return extractor
3551
3552 def _get_comments(self, *args, **kwargs):
3553 raise NotImplementedError('This method must be implemented by subclasses')
3554
3555 @staticmethod
3556 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3557 """ Merge subtitle items for one language. Items with duplicated URLs
3558 will be dropped. """
3559 list1_urls = {item['url'] for item in subtitle_list1}
3560 ret = list(subtitle_list1)
3561 ret.extend(item for item in subtitle_list2 if item['url'] not in list1_urls)
3562 return ret
3563
3564 @classmethod
3565 def _merge_subtitles(cls, *dicts, target=None):
3566 """ Merge subtitle dictionaries, language by language. """
3567 if target is None:
3568 target = {}
3569 for d in dicts:
3570 for lang, subs in d.items():
3571 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3572 return target
3573
3574 def extract_automatic_captions(self, *args, **kwargs):
3575 if (self.get_param('writeautomaticsub', False)
3576 or self.get_param('listsubtitles')):
3577 return self._get_automatic_captions(*args, **kwargs)
3578 return {}
3579
3580 def _get_automatic_captions(self, *args, **kwargs):
3581 raise NotImplementedError('This method must be implemented by subclasses')
3582
3583 def mark_watched(self, *args, **kwargs):
3584 if not self.get_param('mark_watched', False):
3585 return
3586 if (self._get_login_info()[0] is not None
3587 or self.get_param('cookiefile')
3588 or self.get_param('cookiesfrombrowser')):
3589 self._mark_watched(*args, **kwargs)
3590
3591 def _mark_watched(self, *args, **kwargs):
3592 raise NotImplementedError('This method must be implemented by subclasses')
3593
3594 def geo_verification_headers(self):
3595 headers = {}
3596 geo_verification_proxy = self.get_param('geo_verification_proxy')
3597 if geo_verification_proxy:
3598 headers['Ytdl-request-proxy'] = geo_verification_proxy
3599 return headers
3600
3601 def _generic_id(self, url):
3602 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3603
3604 def _generic_title(self, url):
3605 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3606
3607 @staticmethod
3608 def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3609 all_known = all(
3610 x is not None for x in
3611 (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted))
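# e.g. _availability(is_unlisted=True) == 'unlisted' even when the other
# flags are unknown, while 'public' requires every flag to be known (and
# falsy); if nothing is known, None is returned.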
3612 return (
3613 'private' if is_private
3614 else 'premium_only' if needs_premium
3615 else 'subscriber_only' if needs_subscription
3616 else 'needs_auth' if needs_auth
3617 else 'unlisted' if is_unlisted
3618 else 'public' if all_known
3619 else None)
3620
3621 def _configuration_arg(self, key, default=NO_DEFAULT, casesense=False):
3622 '''
3623 @returns A list of values for the extractor argument given by "key"
3624 or "default" if no such key is present
3625 @param default The default value to return when the key is not present (default: [])
3626 @param casesense When false, the values are converted to lower case
3627 '''
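# e.g. with extractor arguments parsed from a hypothetical
# --extractor-args "youtube:player_client=android" invocation,
# self._configuration_arg('player_client') in YoutubeIE returns ['android'].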
3628 val = traverse_obj(
3629 self._downloader.params, ('extractor_args', self.ie_key().lower(), key))
3630 if val is None:
3631 return [] if default is NO_DEFAULT else default
3632 return list(val) if casesense else [x.lower() for x in val]
3633
3634
3635 class SearchInfoExtractor(InfoExtractor):
3636 """
3637 Base class for paged search queries extractors.
3638 They accept URLs in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
3639 Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
3640 """
3641
3642 _MAX_RESULTS = float('inf')
3643
3644 @classmethod
3645 def _make_valid_url(cls):
3646 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3647
3648 @classmethod
3649 def suitable(cls, url):
3650 return re.match(cls._make_valid_url(), url) is not None
3651
3652 def _real_extract(self, query):
3653 mobj = re.match(self._make_valid_url(), query)
3654 if mobj is None:
3655 raise ExtractorError('Invalid search query "%s"' % query)
3656
3657 prefix = mobj.group('prefix')
3658 query = mobj.group('query')
3659 if prefix == '':
3660 return self._get_n_results(query, 1)
3661 elif prefix == 'all':
3662 return self._get_n_results(query, self._MAX_RESULTS)
3663 else:
3664 n = int(prefix)
3665 if n <= 0:
3666 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
3667 elif n > self._MAX_RESULTS:
3668 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3669 n = self._MAX_RESULTS
3670 return self._get_n_results(query, n)
3671
3672 def _get_n_results(self, query, n):
3673 """Get a specified number of results for a query.
3674 Either this function or _search_results must be overridden by subclasses """
3675 return self.playlist_result(
3676 itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
3677 query, query)
3678
3679 def _search_results(self, query):
3680 """Returns an iterator of search results"""
3681 raise NotImplementedError('This method must be implemented by subclasses')
3682
3683 @property
3684 def SEARCH_KEY(self):
3685 return self._SEARCH_KEY