yt_dlp/extractor/common.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import base64
   5 import datetime
   6 import hashlib
   7 import itertools
   8 import json
   9 import netrc
  10 import os
  11 import random
  12 import re
  13 import sys
  14 import time
  15 import math
  16
  17 from ..compat import (
  18     compat_cookiejar_Cookie,
  19     compat_cookies_SimpleCookie,
  20     compat_etree_Element,
  21     compat_etree_fromstring,
  22     compat_expanduser,
  23     compat_getpass,
  24     compat_http_client,
  25     compat_os_name,
  26     compat_str,
  27     compat_urllib_error,
  28     compat_urllib_parse_unquote,
  29     compat_urllib_parse_urlencode,
  30     compat_urllib_request,
  31     compat_urlparse,
  32     compat_xml_parse_error,
  33 )
  34 from ..downloader import FileDownloader
  35 from ..downloader.f4m import (
  36     get_base_url,
  37     remove_encrypted_media,
  38 )
  39 from ..utils import (
  40     age_restricted,
  41     base_url,
  42     bug_reports_message,
  43     clean_html,
  44     compiled_regex_type,
  45     determine_ext,
  46     determine_protocol,
  47     dict_get,
  48     error_to_compat_str,
  49     extract_attributes,
  50     ExtractorError,
  51     fix_xml_ampersands,
  52     float_or_none,
  53     format_field,
  54     GeoRestrictedError,
  55     GeoUtils,
  56     int_or_none,
  57     js_to_json,
  58     JSON_LD_RE,
  59     mimetype2ext,
  60     network_exceptions,
  61     NO_DEFAULT,
  62     orderedSet,
  63     parse_bitrate,
  64     parse_codecs,
  65     parse_duration,
  66     parse_iso8601,
  67     parse_m3u8_attributes,
  68     parse_resolution,
  69     RegexNotFoundError,
  70     sanitize_filename,
  71     sanitized_Request,
  72     str_or_none,
  73     str_to_int,
  74     strip_or_none,
  75     traverse_obj,
  76     unescapeHTML,
  77     unified_strdate,
  78     unified_timestamp,
  79     update_Request,
  80     update_url_query,
  81     url_basename,
  82     url_or_none,
  83     urljoin,
  84     variadic,
  85     xpath_element,
  86     xpath_text,
  87     xpath_with_ns,
  88 )
  89
  90
  91 class InfoExtractor(object):
  92     """Information Extractor class.
  93
  94     Information extractors are the classes that, given a URL, extract
  95     information about the video (or videos) the URL refers to. This
  96     information includes the real video URL, the video title, author and
  97     others. The information is stored in a dictionary which is then
  98     passed to the YoutubeDL. The YoutubeDL processes this
  99     information possibly downloading the video to the file system, among
 100     other possible outcomes.
 101
 102     The type field determines the type of the result.
 103     By far the most common value (and the default if _type is missing) is
 104     "video", which indicates a single video.
 105
 106     For a video, the dictionaries must include the following fields:
 107
 108     id:             Video identifier.
 109     title:          Video title, unescaped.
 110
 111     Additionally, it must contain either a formats entry or a url one:
 112
 113     formats:        A list of dictionaries for each format available, ordered
 114                     from worst to best quality.
 115
 116                     Potential fields:
 117                     * url        The mandatory URL representing the media:
 118                                    for plain file media - HTTP URL of this file,
 119                                    for RTMP - RTMP URL,
 120                                    for HLS - URL of the M3U8 media playlist,
 121                                    for HDS - URL of the F4M manifest,
 122                                    for DASH
 123                                      - HTTP URL to plain file media (in case of
 124                                        unfragmented media)
 125                                      - URL of the MPD manifest or base URL
 126                                        representing the media if MPD manifest
 127                                        is parsed from a string (in case of
 128                                        fragmented media)
 129                                    for MSS - URL of the ISM manifest.
 130                     * manifest_url
 131                                  The URL of the manifest file in case of
 132                                  fragmented media:
 133                                    for HLS - URL of the M3U8 master playlist,
 134                                    for HDS - URL of the F4M manifest,
 135                                    for DASH - URL of the MPD manifest,
 136                                    for MSS - URL of the ISM manifest.
 137                     * ext        Will be calculated from URL if missing
 138                     * format     A human-readable description of the format
 139                                  ("mp4 container with h264/opus").
 140                                  Calculated from the format_id, width, height.
 141                                  and format_note fields if missing.
 142                     * format_id  A short description of the format
 143                                  ("mp4_h264_opus" or "19").
 144                                 Technically optional, but strongly recommended.
 145                     * format_note Additional info about the format
 146                                  ("3D" or "DASH video")
 147                     * width      Width of the video, if known
 148                     * height     Height of the video, if known
 149                     * resolution Textual description of width and height
 150                     * dynamic_range The dynamic range of the video. One of:
                                 "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
 152                     * tbr        Average bitrate of audio and video in KBit/s
 153                     * abr        Average audio bitrate in KBit/s
 154                     * acodec     Name of the audio codec in use
 155                     * asr        Audio sampling rate in Hertz
 156                     * vbr        Average video bitrate in KBit/s
 157                     * fps        Frame rate
 158                     * vcodec     Name of the video codec in use
 159                     * container  Name of the container format
 160                     * filesize   The number of bytes, if known in advance
 161                     * filesize_approx  An estimate for the number of bytes
 162                     * player_url SWF Player URL (used for rtmpdump).
 163                     * protocol   The protocol that will be used for the actual
 164                                  download, lower-case.
 165                                  "http", "https", "rtsp", "rtmp", "rtmp_ffmpeg", "rtmpe",
 166                                  "m3u8", "m3u8_native" or "http_dash_segments".
 167                     * fragment_base_url
 168                                  Base URL for fragments. Each fragment's path
 169                                  value (if present) will be relative to
 170                                  this URL.
 171                     * fragments  A list of fragments of a fragmented media.
 172                                  Each fragment entry must contain either an url
 173                                  or a path. If an url is present it should be
 174                                  considered by a client. Otherwise both path and
 175                                  fragment_base_url must be present. Here is
 176                                  the list of all potential fields:
 177                                  * "url" - fragment's URL
 178                                  * "path" - fragment's path relative to
 179                                             fragment_base_url
 180                                  * "duration" (optional, int or float)
 181                                  * "filesize" (optional, int)
 182                     * preference Order number of this format. If this field is
 183                                  present and not None, the formats get sorted
 184                                  by this field, regardless of all other values.
 185                                  -1 for default (order by other properties),
 186                                  -2 or smaller for less than default.
 187                                  < -1000 to hide the format (if there is
 188                                     another one which is strictly better)
 189                     * language   Language code, e.g. "de" or "en-US".
 190                     * language_preference  Is this in the language mentioned in
 191                                  the URL?
 192                                  10 if it's what the URL is about,
 193                                  -1 for default (don't know),
 194                                  -10 otherwise, other values reserved for now.
 195                     * quality    Order number of the video quality of this
 196                                  format, irrespective of the file format.
 197                                  -1 for default (order by other properties),
 198                                  -2 or smaller for less than default.
 199                     * source_preference  Order number for this video source
 200                                   (quality takes higher priority)
 201                                  -1 for default (order by other properties),
 202                                  -2 or smaller for less than default.
 203                     * http_headers  A dictionary of additional HTTP headers
 204                                  to add to the request.
 205                     * stretched_ratio  If given and not 1, indicates that the
 206                                  video's pixels are not square.
 207                                  width : height ratio as float.
 208                     * no_resume  The server does not support resuming the
 209                                  (HTTP or RTMP) download. Boolean.
 210                     * has_drm    The format has DRM and cannot be downloaded. Boolean
 211                     * downloader_options  A dictionary of downloader options as
 212                                  described in FileDownloader
 213                     RTMP formats can also have the additional fields: page_url,
 214                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 215                     rtmp_protocol, rtmp_real_time
 216
 217     url:            Final video URL.
 218     ext:            Video filename extension.
 219     format:         The video format, defaults to ext (used for --get-format)
 220     player_url:     SWF Player URL (used for rtmpdump).
 221
 222     The following fields are optional:
 223
 224     alt_title:      A secondary title of the video.
 225     display_id      An alternative identifier for the video, not necessarily
 226                     unique, but available before title. Typically, id is
 227                     something like "4234987", title "Dancing naked mole rats",
 228                     and display_id "dancing-naked-mole-rats"
 229     thumbnails:     A list of dictionaries, with the following entries:
 230                         * "id" (optional, string) - Thumbnail format ID
 231                         * "url"
 232                         * "preference" (optional, int) - quality of the image
 233                         * "width" (optional, int)
 234                         * "height" (optional, int)
 235                         * "resolution" (optional, string "{width}x{height}",
 236                                         deprecated)
 237                         * "filesize" (optional, int)
 238     thumbnail:      Full URL to a video thumbnail image.
 239     description:    Full video description.
 240     uploader:       Full name of the video uploader.
 241     license:        License name the video is licensed under.
 242     creator:        The creator of the video.
 243     release_timestamp: UNIX timestamp of the moment the video was released.
 244     release_date:   The date (YYYYMMDD) when the video was released.
 245     timestamp:      UNIX timestamp of the moment the video was uploaded
 246     upload_date:    Video upload date (YYYYMMDD).
 247                     If not explicitly set, calculated from timestamp.
 248     uploader_id:    Nickname or id of the video uploader.
 249     uploader_url:   Full URL to a personal webpage of the video uploader.
 250     channel:        Full name of the channel the video is uploaded on.
 251                     Note that channel fields may or may not repeat uploader
 252                     fields. This depends on a particular extractor.
 253     channel_id:     Id of the channel.
 254     channel_url:    Full URL to a channel webpage.
 255     location:       Physical location where the video was filmed.
 256     subtitles:      The available subtitles as a dictionary in the format
 257                     {tag: subformats}. "tag" is usually a language code, and
 258                     "subformats" is a list sorted from lower to higher
 259                     preference, each element is a dictionary with the "ext"
 260                     entry and one of:
 261                         * "data": The subtitles file contents
 262                         * "url": A URL pointing to the subtitles file
 263                     It can optionally also have:
 264                         * "name": Name or description of the subtitles
 265                     "ext" will be calculated from URL if missing
 266     automatic_captions: Like 'subtitles'; contains automatically generated
 267                     captions instead of normal subtitles
 268     duration:       Length of the video in seconds, as an integer or float.
 269     view_count:     How many users have watched the video on the platform.
 270     like_count:     Number of positive ratings of the video
 271     dislike_count:  Number of negative ratings of the video
 272     repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
 274     comment_count:  Number of comments on the video
 275     comments:       A list of comments, each with one or more of the following
 276                     properties (all but one of text or html optional):
 277                         * "author" - human-readable name of the comment author
 278                         * "author_id" - user ID of the comment author
 279                         * "author_thumbnail" - The thumbnail of the comment author
 280                         * "id" - Comment ID
 281                         * "html" - Comment as HTML
 282                         * "text" - Plain text of the comment
 283                         * "timestamp" - UNIX timestamp of comment
 284                         * "parent" - ID of the comment this one is replying to.
 285                                      Set to "root" to indicate that this is a
 286                                      comment to the original video.
 287                         * "like_count" - Number of positive ratings of the comment
 288                         * "dislike_count" - Number of negative ratings of the comment
 289                         * "is_favorited" - Whether the comment is marked as
 290                                            favorite by the video uploader
 291                         * "author_is_uploader" - Whether the comment is made by
 292                                                  the video uploader
 293     age_limit:      Age restriction for the video, as an integer (years)
 294     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 295                     should allow to get the same result again. (It will be set
 296                     by YoutubeDL if it's missing)
 297     categories:     A list of categories that the video falls in, for example
 298                     ["Sports", "Berlin"]
 299     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 300     cast:           A list of the video cast
 301     is_live:        True, False, or None (=unknown). Whether this video is a
 302                     live stream that goes on instead of a fixed-length video.
 303     was_live:       True, False, or None (=unknown). Whether this video was
 304                     originally a live stream.
 305     live_status:    'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
 306                     If absent, automatically set from is_live, was_live
 307     start_time:     Time in seconds where the reproduction should start, as
 308                     specified in the URL.
 309     end_time:       Time in seconds where the reproduction should end, as
 310                     specified in the URL.
 311     chapters:       A list of dictionaries, with the following entries:
 312                         * "start_time" - The start time of the chapter in seconds
 313                         * "end_time" - The end time of the chapter in seconds
 314                         * "title" (optional, string)
 315     playable_in_embed: Whether this video is allowed to play in embedded
 316                     players on other sites. Can be True (=always allowed),
 317                     False (=never allowed), None (=unknown), or a string
 318                     specifying the criteria for embedability (Eg: 'whitelist')
 319     availability:   Under what condition the video is available. One of
 320                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 321                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 322                     to set it
 323     __post_extractor: A function to be called just before the metadata is
 324                     written to either disk, logger or console. The function
 325                     must return a dict which will be added to the info_dict.
                    This is useful for additional information that is
 327                     time-consuming to extract. Note that the fields thus
 328                     extracted will not be available to output template and
 329                     match_filter. So, only "comments" and "comment_count" are
 330                     currently allowed to be extracted via this method.
 331
 332     The following fields should only be used when the video belongs to some logical
 333     chapter or section:
 334
 335     chapter:        Name or title of the chapter the video belongs to.
 336     chapter_number: Number of the chapter the video belongs to, as an integer.
 337     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 338
 339     The following fields should only be used when the video is an episode of some
 340     series, programme or podcast:
 341
 342     series:         Title of the series or programme the video episode belongs to.
 343     season:         Title of the season the video episode belongs to.
 344     season_number:  Number of the season the video episode belongs to, as an integer.
 345     season_id:      Id of the season the video episode belongs to, as a unicode string.
 346     episode:        Title of the video episode. Unlike mandatory video title field,
 347                     this field should denote the exact title of the video episode
 348                     without any kind of decoration.
 349     episode_number: Number of the video episode within a season, as an integer.
 350     episode_id:     Id of the video episode, as a unicode string.
 351
 352     The following fields should only be used when the media is a track or a part of
 353     a music album:
 354
 355     track:          Title of the track.
 356     track_number:   Number of the track within an album or a disc, as an integer.
 357     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 358                     as a unicode string.
 359     artist:         Artist(s) of the track.
 360     genre:          Genre(s) of the track.
 361     album:          Title of the album the track belongs to.
 362     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 363     album_artist:   List of all artists appeared on the album (e.g.
 364                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 365                     and compilations).
 366     disc_number:    Number of the disc or other physical medium the track belongs to,
 367                     as an integer.
 368     release_year:   Year (YYYY) when the album was released.
 369
 370     Unless mentioned otherwise, the fields should be Unicode strings.
 371
 372     Unless mentioned otherwise, None is equivalent to absence of information.
 373
 374
 375     _type "playlist" indicates multiple videos.
 376     There must be a key "entries", which is a list, an iterable, or a PagedList
 377     object, each element of which is a valid dictionary by this specification.
 378
    Additionally, playlists can have "id", "title", and any other relevant
 380     attributes with the same semantics as videos (see above).
 381
 382
 383     _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
 385     It must have an entries key like a playlist and contain all the keys
 386     required for a video at the same time.
 387
 388
 389     _type "url" indicates that the video must be extracted from another
 390     location, possibly by a different extractor. Its only required key is:
 391     "url" - the next URL to extract.
 392     The key "ie_key" can be set to the class name (minus the trailing "IE",
 393     e.g. "Youtube") if the extractor class is known in advance.
 394     Additionally, the dictionary may have any properties of the resolved entity
 395     known in advance, for example "title" if the title of the referred video is
 396     known ahead of time.
 397
 398
 399     _type "url_transparent" entities have the same specification as "url", but
 400     indicate that the given additional information is more precise than the one
 401     associated with the resolved URL.
 402     This is useful when a site employs a video service that hosts the video and
 403     its technical metadata, but that video service does not embed a useful
 404     title, description etc.
 405
 406
 407     Subclasses of this one should re-define the _real_initialize() and
 408     _real_extract() methods and define a _VALID_URL regexp.
 409     Probably, they should also be added to the list of extractors.
 410
 411     Subclasses may also override suitable() if necessary, but ensure the function
 412     signature is preserved and that this function imports everything it needs
 413     (except other extractors), so that lazy_extractors works correctly
 414
 415     _GEO_BYPASS attribute may be set to False in order to disable
 416     geo restriction bypass mechanisms for a particular extractor.
 417     Though it won't disable explicit geo restriction bypass based on
 418     country code provided with geo_bypass_country.
 419
 420     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 421     countries for this extractor. One of these countries will be used by
 422     geo restriction bypass mechanism right away in order to bypass
 423     geo restriction, of course, if the mechanism is not disabled.
 424
 425     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 426     IP blocks in CIDR notation for this extractor. One of these IP blocks
 427     will be used by geo restriction bypass mechanism similarly
 428     to _GEO_COUNTRIES.
 429
 430     The _WORKING attribute should be set to False for broken IEs
 431     in order to warn the users and skip the tests.
 432     """
 433
    # Set to True by initialize() once _real_initialize() has been run
    _ready = False
    # The downloader this IE is bound to; assigned through set_downloader()
    _downloader = None
    # Fake IP picked by the geo bypass mechanism (None while none is in use);
    # extract() copies it into the result as '__x_forwarded_for_ip'
    _x_forwarded_for_ip = None
    # Geo bypass configuration, see the class docstring for their meaning
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    # Set to False for broken IEs (see the class docstring); read by working()
    _WORKING = True

    # Hint messages telling the user how to provide credentials, keyed by the
    # kind of authentication ('any', 'cookies' or 'password').
    # NOTE(review): the code consuming these hints is not part of this
    # section - confirm the expected keys against it
    _LOGIN_HINTS = {
        'any': 'Use --cookies, --username and --password or --netrc to provide account credentials',
        'cookies': (
            'Use --cookies-from-browser or --cookies for the authentication. '
            'See  https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl  for how to manually pass cookies'),
        'password': 'Use --username and --password or --netrc to provide account credentials',
    }
 449
 450     def __init__(self, downloader=None):
 451         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 452         If a downloader is not passed during initialization,
 453         it must be set using "set_downloader()" before "extract()" is called"""
 454         self._ready = False
 455         self._x_forwarded_for_ip = None
 456         self._printed_messages = set()
 457         self.set_downloader(downloader)
 458
 459     @classmethod
 460     def _match_valid_url(cls, url):
 461         # This does not use has/getattr intentionally - we want to know whether
 462         # we have cached the regexp for *this* class, whereas getattr would also
 463         # match the superclass
 464         if '_VALID_URL_RE' not in cls.__dict__:
 465             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 466         return cls._VALID_URL_RE.match(url)
 467
 468     @classmethod
 469     def suitable(cls, url):
 470         """Receives a URL and returns True if suitable for this IE."""
 471         # This function must import everything it needs (except other extractors),
 472         # so that lazy_extractors works correctly
 473         return cls._match_valid_url(url) is not None
 474
 475     @classmethod
 476     def _match_id(cls, url):
 477         return cls._match_valid_url(url).group('id')
 478
 479     @classmethod
 480     def get_temp_id(cls, url):
 481         try:
 482             return cls._match_id(url)
 483         except (IndexError, AttributeError):
 484             return None
 485
    @classmethod
    def working(cls):
        """Getter method for _WORKING.

        Returns the class' _WORKING attribute, which is set to False for
        broken IEs and is True by default.
        """
        return cls._WORKING
 490
 491     def initialize(self):
 492         """Initializes an instance (authentication, etc)."""
 493         self._printed_messages = set()
 494         self._initialize_geo_bypass({
 495             'countries': self._GEO_COUNTRIES,
 496             'ip_blocks': self._GEO_IP_BLOCKS,
 497         })
 498         if not self._ready:
 499             self._real_initialize()
 500             self._ready = True
 501
 502     def _initialize_geo_bypass(self, geo_bypass_context):
 503         """
 504         Initialize geo restriction bypass mechanism.
 505
 506         This method is used to initialize geo bypass mechanism based on faking
 507         X-Forwarded-For HTTP header. A random country from provided country list
 508         is selected and a random IP belonging to this country is generated. This
 509         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 510         HTTP requests.
 511
 512         This method will be used for initial geo bypass mechanism initialization
 513         during the instance initialization with _GEO_COUNTRIES and
 514         _GEO_IP_BLOCKS.
 515
 516         You may also manually call it from extractor's code if geo bypass
 517         information is not available beforehand (e.g. obtained during
 518         extraction) or due to some other reason. In this case you should pass
 519         this information in geo bypass context passed as first argument. It may
 520         contain following fields:
 521
 522         countries:  List of geo unrestricted countries (similar
 523                     to _GEO_COUNTRIES)
 524         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 525                     (similar to _GEO_IP_BLOCKS)
 526
 527         """
 528         if not self._x_forwarded_for_ip:
 529
 530             # Geo bypass mechanism is explicitly disabled by user
 531             if not self.get_param('geo_bypass', True):
 532                 return
 533
 534             if not geo_bypass_context:
 535                 geo_bypass_context = {}
 536
 537             # Backward compatibility: previously _initialize_geo_bypass
 538             # expected a list of countries, some 3rd party code may still use
 539             # it this way
 540             if isinstance(geo_bypass_context, (list, tuple)):
 541                 geo_bypass_context = {
 542                     'countries': geo_bypass_context,
 543                 }
 544
 545             # The whole point of geo bypass mechanism is to fake IP
 546             # as X-Forwarded-For HTTP header based on some IP block or
 547             # country code.
 548
 549             # Path 1: bypassing based on IP block in CIDR notation
 550
 551             # Explicit IP block specified by user, use it right away
 552             # regardless of whether extractor is geo bypassable or not
 553             ip_block = self.get_param('geo_bypass_ip_block', None)
 554
 555             # Otherwise use random IP block from geo bypass context but only
 556             # if extractor is known as geo bypassable
 557             if not ip_block:
 558                 ip_blocks = geo_bypass_context.get('ip_blocks')
 559                 if self._GEO_BYPASS and ip_blocks:
 560                     ip_block = random.choice(ip_blocks)
 561
 562             if ip_block:
 563                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 564                 self._downloader.write_debug(
 565                     '[debug] Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
 566                 return
 567
 568             # Path 2: bypassing based on country code
 569
 570             # Explicit country code specified by user, use it right away
 571             # regardless of whether extractor is geo bypassable or not
 572             country = self.get_param('geo_bypass_country', None)
 573
 574             # Otherwise use random country code from geo bypass context but
 575             # only if extractor is known as geo bypassable
 576             if not country:
 577                 countries = geo_bypass_context.get('countries')
 578                 if self._GEO_BYPASS and countries:
 579                     country = random.choice(countries)
 580
 581             if country:
 582                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 583                 self._downloader.write_debug(
 584                     'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))
 585
 586     def extract(self, url):
 587         """Extracts URL information and returns it in list of dicts."""
 588         try:
 589             for _ in range(2):
 590                 try:
 591                     self.initialize()
 592                     self.write_debug('Extracting URL: %s' % url)
 593                     ie_result = self._real_extract(url)
 594                     if ie_result is None:
 595                         return None
 596                     if self._x_forwarded_for_ip:
 597                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 598                     subtitles = ie_result.get('subtitles')
 599                     if (subtitles and 'live_chat' in subtitles
 600                             and 'no-live-chat' in self.get_param('compat_opts', [])):
 601                         del subtitles['live_chat']
 602                     return ie_result
 603                 except GeoRestrictedError as e:
 604                     if self.__maybe_fake_ip_and_retry(e.countries):
 605                         continue
 606                     raise
 607         except ExtractorError as e:
 608             video_id = e.video_id or self.get_temp_id(url)
 609             raise ExtractorError(
 610                 e.msg, video_id=video_id, ie=self.IE_NAME, tb=e.traceback, expected=e.expected, cause=e.cause)
 611         except compat_http_client.IncompleteRead as e:
 612             raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
 613         except (KeyError, StopIteration) as e:
 614             raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 615
 616     def __maybe_fake_ip_and_retry(self, countries):
 617         if (not self.get_param('geo_bypass_country', None)
 618                 and self._GEO_BYPASS
 619                 and self.get_param('geo_bypass', True)
 620                 and not self._x_forwarded_for_ip
 621                 and countries):
 622             country_code = random.choice(countries)
 623             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 624             if self._x_forwarded_for_ip:
 625                 self.report_warning(
 626                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 627                     % (self._x_forwarded_for_ip, country_code.upper()))
 628                 return True
 629         return False
 630
 631     def set_downloader(self, downloader):
 632         """Sets the downloader for this IE."""
 633         self._downloader = downloader
 634
 635     def _real_initialize(self):
 636         """Real initialization process. Redefine in subclasses."""
 637         pass
 638
 639     def _real_extract(self, url):
 640         """Real extraction process. Redefine in subclasses."""
 641         pass
 642
 643     @classmethod
 644     def ie_key(cls):
 645         """A string for getting the InfoExtractor with get_info_extractor"""
 646         return cls.__name__[:-2]
 647
 648     @property
 649     def IE_NAME(self):
 650         return compat_str(type(self).__name__[:-2])
 651
 652     @staticmethod
 653     def __can_accept_status_code(err, expected_status):
 654         assert isinstance(err, compat_urllib_error.HTTPError)
 655         if expected_status is None:
 656             return False
 657         elif callable(expected_status):
 658             return expected_status(err.code) is True
 659         else:
 660             return err.code in variadic(expected_status)
 661
 662     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
 663         """
 664         Return the response handle.
 665
 666         See _download_webpage docstring for arguments specification.
 667         """
 668         if not self._downloader._first_webpage_request:
 669             sleep_interval = self.get_param('sleep_interval_requests') or 0
 670             if sleep_interval > 0:
 671                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 672                 time.sleep(sleep_interval)
 673         else:
 674             self._downloader._first_webpage_request = False
 675
 676         if note is None:
 677             self.report_download_webpage(video_id)
 678         elif note is not False:
 679             if video_id is None:
 680                 self.to_screen('%s' % (note,))
 681             else:
 682                 self.to_screen('%s: %s' % (video_id, note))
 683
 684         # Some sites check X-Forwarded-For HTTP header in order to figure out
 685         # the origin of the client behind proxy. This allows bypassing geo
 686         # restriction by faking this header's value to IP that belongs to some
 687         # geo unrestricted country. We will do so once we encounter any
 688         # geo restriction error.
 689         if self._x_forwarded_for_ip:
 690             if 'X-Forwarded-For' not in headers:
 691                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 692
 693         if isinstance(url_or_request, compat_urllib_request.Request):
 694             url_or_request = update_Request(
 695                 url_or_request, data=data, headers=headers, query=query)
 696         else:
 697             if query:
 698                 url_or_request = update_url_query(url_or_request, query)
 699             if data is not None or headers:
 700                 url_or_request = sanitized_Request(url_or_request, data, headers)
 701         try:
 702             return self._downloader.urlopen(url_or_request)
 703         except network_exceptions as err:
 704             if isinstance(err, compat_urllib_error.HTTPError):
 705                 if self.__can_accept_status_code(err, expected_status):
 706                     # Retain reference to error to prevent file object from
 707                     # being closed before it can be read. Works around the
 708                     # effects of <https://bugs.python.org/issue15002>
 709                     # introduced in Python 3.4.1.
 710                     err.fp._error = err
 711                     return err.fp
 712
 713             if errnote is False:
 714                 return False
 715             if errnote is None:
 716                 errnote = 'Unable to download webpage'
 717
 718             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 719             if fatal:
 720                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 721             else:
 722                 self.report_warning(errmsg)
 723                 return False
 724
 725     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
 726         """
 727         Return a tuple (page content as string, URL handle).
 728
 729         See _download_webpage docstring for arguments specification.
 730         """
 731         # Strip hashes from the URL (#1038)
 732         if isinstance(url_or_request, (compat_str, str)):
 733             url_or_request = url_or_request.partition('#')[0]
 734
 735         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 736         if urlh is False:
 737             assert not fatal
 738             return False
 739         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 740         return (content, urlh)
 741
 742     @staticmethod
 743     def _guess_encoding_from_content(content_type, webpage_bytes):
 744         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 745         if m:
 746             encoding = m.group(1)
 747         else:
 748             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 749                           webpage_bytes[:1024])
 750             if m:
 751                 encoding = m.group(1).decode('ascii')
 752             elif webpage_bytes.startswith(b'\xff\xfe'):
 753                 encoding = 'utf-16'
 754             else:
 755                 encoding = 'utf-8'
 756
 757         return encoding
 758
 759     def __check_blocked(self, content):
 760         first_block = content[:512]
 761         if ('<title>Access to this site is blocked</title>' in content
 762                 and 'Websense' in first_block):
 763             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 764             blocked_iframe = self._html_search_regex(
 765                 r'<iframe src="([^"]+)"', content,
 766                 'Websense information URL', default=None)
 767             if blocked_iframe:
 768                 msg += ' Visit %s for more details' % blocked_iframe
 769             raise ExtractorError(msg, expected=True)
 770         if '<title>The URL you requested has been blocked</title>' in first_block:
 771             msg = (
 772                 'Access to this webpage has been blocked by Indian censorship. '
 773                 'Use a VPN or proxy server (with --proxy) to route around it.')
 774             block_msg = self._html_search_regex(
 775                 r'</h1><p>(.*?)</p>',
 776                 content, 'block message', default=None)
 777             if block_msg:
 778                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 779             raise ExtractorError(msg, expected=True)
 780         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 781                 and 'blocklist.rkn.gov.ru' in content):
 782             raise ExtractorError(
 783                 'Access to this webpage has been blocked by decision of the Russian government. '
 784                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 785                 expected=True)
 786
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        """
        Read the body of the response urlh and return it decoded as a string.

        prefix -- bytes prepended to the data read from urlh, if not None
        encoding -- encoding used for decoding; guessed from the Content-Type
            header and the content itself when not specified

        url_or_request, note, errnote and fatal are accepted but not used
        by this method.

        Depending on the 'dump_intermediate_pages' and 'write_pages' params
        the raw bytes are additionally printed base64-encoded and/or saved
        to a '.dump' file. The decoded content is checked for known
        "access blocked" pages before being returned.
        """
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self.get_param('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self.get_param('write_pages', False):
            # The dump file is named after the video id and the final URL
            basen = '%s_%s' % (video_id, urlh.geturl())
            trim_length = self.get_param('trim_file_name') or 240
            if len(basen) > trim_length:
                # Too long: truncate and append an MD5 digest of the full
                # name so that the result is exactly trim_length characters
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:trim_length - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # Unknown encoding name: fall back to UTF-8
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content
 824
 825     def _download_webpage(
 826             self, url_or_request, video_id, note=None, errnote=None,
 827             fatal=True, tries=1, timeout=5, encoding=None, data=None,
 828             headers={}, query={}, expected_status=None):
 829         """
 830         Return the data of the page as a string.
 831
 832         Arguments:
 833         url_or_request -- plain text URL as a string or
 834             a compat_urllib_request.Requestobject
 835         video_id -- Video/playlist/item identifier (string)
 836
 837         Keyword arguments:
 838         note -- note printed before downloading (string)
 839         errnote -- note printed in case of an error (string)
 840         fatal -- flag denoting whether error should be considered fatal,
 841             i.e. whether it should cause ExtractionError to be raised,
 842             otherwise a warning will be reported and extraction continued
 843         tries -- number of tries
 844         timeout -- sleep interval between tries
 845         encoding -- encoding for a page content decoding, guessed automatically
 846             when not explicitly specified
 847         data -- POST data (bytes)
 848         headers -- HTTP headers (dict)
 849         query -- URL query (dict)
 850         expected_status -- allows to accept failed HTTP requests (non 2xx
 851             status code) by explicitly specifying a set of accepted status
 852             codes. Can be any of the following entities:
 853                 - an integer type specifying an exact failed status code to
 854                   accept
 855                 - a list or a tuple of integer types specifying a list of
 856                   failed status codes to accept
 857                 - a callable accepting an actual failed status code and
 858                   returning True if it should be accepted
 859             Note that this argument does not affect success status codes (2xx)
 860             which are always accepted.
 861         """
 862
 863         success = False
 864         try_count = 0
 865         while success is False:
 866             try:
 867                 res = self._download_webpage_handle(
 868                     url_or_request, video_id, note, errnote, fatal,
 869                     encoding=encoding, data=data, headers=headers, query=query,
 870                     expected_status=expected_status)
 871                 success = True
 872             except compat_http_client.IncompleteRead as e:
 873                 try_count += 1
 874                 if try_count >= tries:
 875                     raise e
 876                 self._sleep(timeout, video_id)
 877         if res is False:
 878             return res
 879         else:
 880             content, _ = res
 881             return content
 882
 883     def _download_xml_handle(
 884             self, url_or_request, video_id, note='Downloading XML',
 885             errnote='Unable to download XML', transform_source=None,
 886             fatal=True, encoding=None, data=None, headers={}, query={},
 887             expected_status=None):
 888         """
 889         Return a tuple (xml as an compat_etree_Element, URL handle).
 890
 891         See _download_webpage docstring for arguments specification.
 892         """
 893         res = self._download_webpage_handle(
 894             url_or_request, video_id, note, errnote, fatal=fatal,
 895             encoding=encoding, data=data, headers=headers, query=query,
 896             expected_status=expected_status)
 897         if res is False:
 898             return res
 899         xml_string, urlh = res
 900         return self._parse_xml(
 901             xml_string, video_id, transform_source=transform_source,
 902             fatal=fatal), urlh
 903
 904     def _download_xml(
 905             self, url_or_request, video_id,
 906             note='Downloading XML', errnote='Unable to download XML',
 907             transform_source=None, fatal=True, encoding=None,
 908             data=None, headers={}, query={}, expected_status=None):
 909         """
 910         Return the xml as an compat_etree_Element.
 911
 912         See _download_webpage docstring for arguments specification.
 913         """
 914         res = self._download_xml_handle(
 915             url_or_request, video_id, note=note, errnote=errnote,
 916             transform_source=transform_source, fatal=fatal, encoding=encoding,
 917             data=data, headers=headers, query=query,
 918             expected_status=expected_status)
 919         return res if res is False else res[0]
 920
 921     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 922         if transform_source:
 923             xml_string = transform_source(xml_string)
 924         try:
 925             return compat_etree_fromstring(xml_string.encode('utf-8'))
 926         except compat_xml_parse_error as ve:
 927             errmsg = '%s: Failed to parse XML ' % video_id
 928             if fatal:
 929                 raise ExtractorError(errmsg, cause=ve)
 930             else:
 931                 self.report_warning(errmsg + str(ve))
 932
 933     def _download_json_handle(
 934             self, url_or_request, video_id, note='Downloading JSON metadata',
 935             errnote='Unable to download JSON metadata', transform_source=None,
 936             fatal=True, encoding=None, data=None, headers={}, query={},
 937             expected_status=None):
 938         """
 939         Return a tuple (JSON object, URL handle).
 940
 941         See _download_webpage docstring for arguments specification.
 942         """
 943         res = self._download_webpage_handle(
 944             url_or_request, video_id, note, errnote, fatal=fatal,
 945             encoding=encoding, data=data, headers=headers, query=query,
 946             expected_status=expected_status)
 947         if res is False:
 948             return res
 949         json_string, urlh = res
 950         return self._parse_json(
 951             json_string, video_id, transform_source=transform_source,
 952             fatal=fatal), urlh
 953
 954     def _download_json(
 955             self, url_or_request, video_id, note='Downloading JSON metadata',
 956             errnote='Unable to download JSON metadata', transform_source=None,
 957             fatal=True, encoding=None, data=None, headers={}, query={},
 958             expected_status=None):
 959         """
 960         Return the JSON object as a dict.
 961
 962         See _download_webpage docstring for arguments specification.
 963         """
 964         res = self._download_json_handle(
 965             url_or_request, video_id, note=note, errnote=errnote,
 966             transform_source=transform_source, fatal=fatal, encoding=encoding,
 967             data=data, headers=headers, query=query,
 968             expected_status=expected_status)
 969         return res if res is False else res[0]
 970
 971     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 972         if transform_source:
 973             json_string = transform_source(json_string)
 974         try:
 975             return json.loads(json_string)
 976         except ValueError as ve:
 977             errmsg = '%s: Failed to parse JSON ' % video_id
 978             if fatal:
 979                 raise ExtractorError(errmsg, cause=ve)
 980             else:
 981                 self.report_warning(errmsg + str(ve))
 982
 983     def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
 984         return self._parse_json(
 985             data[data.find('{'):data.rfind('}') + 1],
 986             video_id, transform_source, fatal)
 987
 988     def _download_socket_json_handle(
 989             self, url_or_request, video_id, note='Polling socket',
 990             errnote='Unable to poll socket', transform_source=None,
 991             fatal=True, encoding=None, data=None, headers={}, query={},
 992             expected_status=None):
 993         """
 994         Return a tuple (JSON object, URL handle).
 995
 996         See _download_webpage docstring for arguments specification.
 997         """
 998         res = self._download_webpage_handle(
 999             url_or_request, video_id, note, errnote, fatal=fatal,
1000             encoding=encoding, data=data, headers=headers, query=query,
1001             expected_status=expected_status)
1002         if res is False:
1003             return res
1004         webpage, urlh = res
1005         return self._parse_socket_response_as_json(
1006             webpage, video_id, transform_source=transform_source,
1007             fatal=fatal), urlh
1008
1009     def _download_socket_json(
1010             self, url_or_request, video_id, note='Polling socket',
1011             errnote='Unable to poll socket', transform_source=None,
1012             fatal=True, encoding=None, data=None, headers={}, query={},
1013             expected_status=None):
1014         """
1015         Return the JSON object as a dict.
1016
1017         See _download_webpage docstring for arguments specification.
1018         """
1019         res = self._download_socket_json_handle(
1020             url_or_request, video_id, note=note, errnote=errnote,
1021             transform_source=transform_source, fatal=fatal, encoding=encoding,
1022             data=data, headers=headers, query=query,
1023             expected_status=expected_status)
1024         return res if res is False else res[0]
1025
1026     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1027         idstr = format_field(video_id, template='%s: ')
1028         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1029         if only_once:
1030             if f'WARNING: {msg}' in self._printed_messages:
1031                 return
1032             self._printed_messages.add(f'WARNING: {msg}')
1033         self._downloader.report_warning(msg, *args, **kwargs)
1034
1035     def to_screen(self, msg, *args, **kwargs):
1036         """Print msg to screen, prefixing it with '[ie_name]'"""
1037         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1038
1039     def write_debug(self, msg, *args, **kwargs):
1040         self._downloader.write_debug('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1041
1042     def get_param(self, name, default=None, *args, **kwargs):
1043         if self._downloader:
1044             return self._downloader.params.get(name, default, *args, **kwargs)
1045         return default
1046
1047     def report_drm(self, video_id, partial=False):
1048         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1049
1050     def report_extraction(self, id_or_name):
1051         """Report information extraction."""
1052         self.to_screen('%s: Extracting information' % id_or_name)
1053
1054     def report_download_webpage(self, video_id):
1055         """Report webpage download."""
1056         self.to_screen('%s: Downloading webpage' % video_id)
1057
1058     def report_age_confirmation(self):
1059         """Report attempt to confirm age."""
1060         self.to_screen('Confirming age')
1061
1062     def report_login(self):
1063         """Report attempt to log in."""
1064         self.to_screen('Logging in')
1065
1066     def raise_login_required(
1067             self, msg='This video is only available for registered users',
1068             metadata_available=False, method='any'):
1069         if metadata_available and self.get_param('ignore_no_formats_error'):
1070             self.report_warning(msg)
1071         if method is not None:
1072             msg = '%s. %s' % (msg, self._LOGIN_HINTS[method])
1073         raise ExtractorError(msg, expected=True)
1074
1075     def raise_geo_restricted(
1076             self, msg='This video is not available from your location due to geo restriction',
1077             countries=None, metadata_available=False):
1078         if metadata_available and self.get_param('ignore_no_formats_error'):
1079             self.report_warning(msg)
1080         else:
1081             raise GeoRestrictedError(msg, countries=countries)
1082
1083     def raise_no_formats(self, msg, expected=False, video_id=None):
1084         if expected and self.get_param('ignore_no_formats_error'):
1085             self.report_warning(msg, video_id)
1086         elif isinstance(msg, ExtractorError):
1087             raise msg
1088         else:
1089             raise ExtractorError(msg, expected=expected, video_id=video_id)
1090
1091     # Methods for following #608
1092     @staticmethod
1093     def url_result(url, ie=None, video_id=None, video_title=None, **kwargs):
1094         """Returns a URL that points to a page that should be processed"""
1095         # TODO: ie should be the class used for getting the info
1096         video_info = {'_type': 'url',
1097                       'url': url,
1098                       'ie_key': ie}
1099         video_info.update(kwargs)
1100         if video_id is not None:
1101             video_info['id'] = video_id
1102         if video_title is not None:
1103             video_info['title'] = video_title
1104         return video_info
1105
1106     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
1107         urls = orderedSet(
1108             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
1109             for m in matches)
1110         return self.playlist_result(
1111             urls, playlist_id=playlist_id, playlist_title=playlist_title)
1112
1113     @staticmethod
1114     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
1115         """Returns a playlist"""
1116         video_info = {'_type': 'playlist',
1117                       'entries': entries}
1118         video_info.update(kwargs)
1119         if playlist_id:
1120             video_info['id'] = playlist_id
1121         if playlist_title:
1122             video_info['title'] = playlist_title
1123         if playlist_description is not None:
1124             video_info['description'] = playlist_description
1125         return video_info
1126
1127     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1128         """
1129         Perform a regex search on the given string, using a single or a list of
1130         patterns returning the first matching group.
1131         In case of failure return a default value or raise a WARNING or a
1132         RegexNotFoundError, depending on fatal, specifying the field name.
1133         """
1134         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
1135             mobj = re.search(pattern, string, flags)
1136         else:
1137             for p in pattern:
1138                 mobj = re.search(p, string, flags)
1139                 if mobj:
1140                     break
1141
1142         _name = self._downloader._color_text(name, 'blue')
1143
1144         if mobj:
1145             if group is None:
1146                 # return the first matching group
1147                 return next(g for g in mobj.groups() if g is not None)
1148             elif isinstance(group, (list, tuple)):
1149                 return tuple(mobj.group(g) for g in group)
1150             else:
1151                 return mobj.group(group)
1152         elif default is not NO_DEFAULT:
1153             return default
1154         elif fatal:
1155             raise RegexNotFoundError('Unable to extract %s' % _name)
1156         else:
1157             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1158             return None
1159
1160     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1161         """
1162         Like _search_regex, but strips HTML tags and unescapes entities.
1163         """
1164         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1165         if res:
1166             return clean_html(res).strip()
1167         else:
1168             return res
1169
1170     def _get_netrc_login_info(self, netrc_machine=None):
1171         username = None
1172         password = None
1173         netrc_machine = netrc_machine or self._NETRC_MACHINE
1174
1175         if self.get_param('usenetrc', False):
1176             try:
1177                 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1178                 if os.path.isdir(netrc_file):
1179                     netrc_file = os.path.join(netrc_file, '.netrc')
1180                 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1181                 if info is not None:
1182                     username = info[0]
1183                     password = info[2]
1184                 else:
1185                     raise netrc.NetrcParseError(
1186                         'No authenticators for %s' % netrc_machine)
1187             except (IOError, netrc.NetrcParseError) as err:
1188                 self.report_warning(
1189                     'parsing .netrc: %s' % error_to_compat_str(err))
1190
1191         return username, password
1192
1193     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1194         """
1195         Get the login info as (username, password)
1196         First look for the manually specified credentials using username_option
1197         and password_option as keys in params dictionary. If no such credentials
1198         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1199         value.
1200         If there's no info available, return (None, None)
1201         """
1202
1203         # Attempt to use provided username and password or .netrc data
1204         username = self.get_param(username_option)
1205         if username is not None:
1206             password = self.get_param(password_option)
1207         else:
1208             username, password = self._get_netrc_login_info(netrc_machine)
1209
1210         return username, password
1211
1212     def _get_tfa_info(self, note='two-factor verification code'):
1213         """
1214         Get the two-factor authentication info
1215         TODO - asking the user will be required for sms/phone verify
1216         currently just uses the command line option
1217         If there's no info available, return None
1218         """
1219
1220         tfa = self.get_param('twofactor')
1221         if tfa is not None:
1222             return tfa
1223
1224         return compat_getpass('Type %s and press [Return]: ' % note)
1225
1226     # Helper functions for extracting OpenGraph info
1227     @staticmethod
1228     def _og_regexes(prop):
1229         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1230         property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
1231                        % {'prop': re.escape(prop)})
1232         template = r'<meta[^>]+?%s[^>]+?%s'
1233         return [
1234             template % (property_re, content_re),
1235             template % (content_re, property_re),
1236         ]
1237
1238     @staticmethod
1239     def _meta_regex(prop):
1240         return r'''(?isx)<meta
1241                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1242                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1243
1244     def _og_search_property(self, prop, html, name=None, **kargs):
1245         prop = variadic(prop)
1246         if name is None:
1247             name = 'OpenGraph %s' % prop[0]
1248         og_regexes = []
1249         for p in prop:
1250             og_regexes.extend(self._og_regexes(p))
1251         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1252         if escaped is None:
1253             return None
1254         return unescapeHTML(escaped)
1255
1256     def _og_search_thumbnail(self, html, **kargs):
1257         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1258
1259     def _og_search_description(self, html, **kargs):
1260         return self._og_search_property('description', html, fatal=False, **kargs)
1261
1262     def _og_search_title(self, html, **kargs):
1263         return self._og_search_property('title', html, **kargs)
1264
1265     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1266         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1267         if secure:
1268             regexes = self._og_regexes('video:secure_url') + regexes
1269         return self._html_search_regex(regexes, html, name, **kargs)
1270
1271     def _og_search_url(self, html, **kargs):
1272         return self._og_search_property('url', html, **kargs)
1273
1274     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1275         name = variadic(name)
1276         if display_name is None:
1277             display_name = name[0]
1278         return self._html_search_regex(
1279             [self._meta_regex(n) for n in name],
1280             html, display_name, fatal=fatal, group='content', **kwargs)
1281
1282     def _dc_search_uploader(self, html):
1283         return self._html_search_meta('dc.creator', html, 'uploader')
1284
1285     def _rta_search(self, html):
1286         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1287         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1288                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1289                      html):
1290             return 18
1291         return 0
1292
1293     def _media_rating_search(self, html):
1294         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1295         rating = self._html_search_meta('rating', html)
1296
1297         if not rating:
1298             return None
1299
1300         RATING_TABLE = {
1301             'safe for kids': 0,
1302             'general': 8,
1303             '14 years': 14,
1304             'mature': 17,
1305             'restricted': 19,
1306         }
1307         return RATING_TABLE.get(rating.lower())
1308
1309     def _family_friendly_search(self, html):
1310         # See http://schema.org/VideoObject
1311         family_friendly = self._html_search_meta(
1312             'isFamilyFriendly', html, default=None)
1313
1314         if not family_friendly:
1315             return None
1316
1317         RATING_TABLE = {
1318             '1': 0,
1319             'true': 0,
1320             '0': 18,
1321             'false': 18,
1322         }
1323         return RATING_TABLE.get(family_friendly.lower())
1324
1325     def _twitter_search_player(self, html):
1326         return self._html_search_meta('twitter:player', html,
1327                                       'twitter card player')
1328
1329     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1330         json_ld_list = list(re.finditer(JSON_LD_RE, html))
1331         default = kwargs.get('default', NO_DEFAULT)
1332         # JSON-LD may be malformed and thus `fatal` should be respected.
1333         # At the same time `default` may be passed that assumes `fatal=False`
1334         # for _search_regex. Let's simulate the same behavior here as well.
1335         fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
1336         json_ld = []
1337         for mobj in json_ld_list:
1338             json_ld_item = self._parse_json(
1339                 mobj.group('json_ld'), video_id, fatal=fatal)
1340             if not json_ld_item:
1341                 continue
1342             if isinstance(json_ld_item, dict):
1343                 json_ld.append(json_ld_item)
1344             elif isinstance(json_ld_item, (list, tuple)):
1345                 json_ld.extend(json_ld_item)
1346         if json_ld:
1347             json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1348         if json_ld:
1349             return json_ld
1350         if default is not NO_DEFAULT:
1351             return default
1352         elif fatal:
1353             raise RegexNotFoundError('Unable to extract JSON-LD')
1354         else:
1355             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1356             return {}
1357
1358     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1359         if isinstance(json_ld, compat_str):
1360             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1361         if not json_ld:
1362             return {}
1363         info = {}
1364         if not isinstance(json_ld, (list, tuple, dict)):
1365             return info
1366         if isinstance(json_ld, dict):
1367             json_ld = [json_ld]
1368
1369         INTERACTION_TYPE_MAP = {
1370             'CommentAction': 'comment',
1371             'AgreeAction': 'like',
1372             'DisagreeAction': 'dislike',
1373             'LikeAction': 'like',
1374             'DislikeAction': 'dislike',
1375             'ListenAction': 'view',
1376             'WatchAction': 'view',
1377             'ViewAction': 'view',
1378         }
1379
1380         def extract_interaction_type(e):
1381             interaction_type = e.get('interactionType')
1382             if isinstance(interaction_type, dict):
1383                 interaction_type = interaction_type.get('@type')
1384             return str_or_none(interaction_type)
1385
1386         def extract_interaction_statistic(e):
1387             interaction_statistic = e.get('interactionStatistic')
1388             if isinstance(interaction_statistic, dict):
1389                 interaction_statistic = [interaction_statistic]
1390             if not isinstance(interaction_statistic, list):
1391                 return
1392             for is_e in interaction_statistic:
1393                 if not isinstance(is_e, dict):
1394                     continue
1395                 if is_e.get('@type') != 'InteractionCounter':
1396                     continue
1397                 interaction_type = extract_interaction_type(is_e)
1398                 if not interaction_type:
1399                     continue
1400                 # For interaction count some sites provide string instead of
1401                 # an integer (as per spec) with non digit characters (e.g. ",")
1402                 # so extracting count with more relaxed str_to_int
1403                 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1404                 if interaction_count is None:
1405                     continue
1406                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1407                 if not count_kind:
1408                     continue
1409                 count_key = '%s_count' % count_kind
1410                 if info.get(count_key) is not None:
1411                     continue
1412                 info[count_key] = interaction_count
1413
1414         def extract_video_object(e):
1415             assert e['@type'] == 'VideoObject'
1416             author = e.get('author')
1417             info.update({
1418                 'url': url_or_none(e.get('contentUrl')),
1419                 'title': unescapeHTML(e.get('name')),
1420                 'description': unescapeHTML(e.get('description')),
1421                 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
1422                 'duration': parse_duration(e.get('duration')),
1423                 'timestamp': unified_timestamp(e.get('uploadDate')),
1424                 # author can be an instance of 'Organization' or 'Person' types.
1425                 # both types can have 'name' property(inherited from 'Thing' type). [1]
1426                 # however some websites are using 'Text' type instead.
1427                 # 1. https://schema.org/VideoObject
1428                 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
1429                 'filesize': float_or_none(e.get('contentSize')),
1430                 'tbr': int_or_none(e.get('bitrate')),
1431                 'width': int_or_none(e.get('width')),
1432                 'height': int_or_none(e.get('height')),
1433                 'view_count': int_or_none(e.get('interactionCount')),
1434             })
1435             extract_interaction_statistic(e)
1436
1437         for e in json_ld:
1438             if '@context' in e:
1439                 item_type = e.get('@type')
1440                 if expected_type is not None and expected_type != item_type:
1441                     continue
1442                 if item_type in ('TVEpisode', 'Episode'):
1443                     episode_name = unescapeHTML(e.get('name'))
1444                     info.update({
1445                         'episode': episode_name,
1446                         'episode_number': int_or_none(e.get('episodeNumber')),
1447                         'description': unescapeHTML(e.get('description')),
1448                     })
1449                     if not info.get('title') and episode_name:
1450                         info['title'] = episode_name
1451                     part_of_season = e.get('partOfSeason')
1452                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1453                         info.update({
1454                             'season': unescapeHTML(part_of_season.get('name')),
1455                             'season_number': int_or_none(part_of_season.get('seasonNumber')),
1456                         })
1457                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1458                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1459                         info['series'] = unescapeHTML(part_of_series.get('name'))
1460                 elif item_type == 'Movie':
1461                     info.update({
1462                         'title': unescapeHTML(e.get('name')),
1463                         'description': unescapeHTML(e.get('description')),
1464                         'duration': parse_duration(e.get('duration')),
1465                         'timestamp': unified_timestamp(e.get('dateCreated')),
1466                     })
1467                 elif item_type in ('Article', 'NewsArticle'):
1468                     info.update({
1469                         'timestamp': parse_iso8601(e.get('datePublished')),
1470                         'title': unescapeHTML(e.get('headline')),
1471                         'description': unescapeHTML(e.get('articleBody')),
1472                     })
1473                 elif item_type == 'VideoObject':
1474                     extract_video_object(e)
1475                     if expected_type is None:
1476                         continue
1477                     else:
1478                         break
1479                 video = e.get('video')
1480                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1481                     extract_video_object(video)
1482                 if expected_type is None:
1483                     continue
1484                 else:
1485                     break
1486         return dict((k, v) for k, v in info.items() if v is not None)
1487
1488     @staticmethod
1489     def _hidden_inputs(html):
1490         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1491         hidden_inputs = {}
1492         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1493             attrs = extract_attributes(input)
1494             if not input:
1495                 continue
1496             if attrs.get('type') not in ('hidden', 'submit'):
1497                 continue
1498             name = attrs.get('name') or attrs.get('id')
1499             value = attrs.get('value')
1500             if name and value is not None:
1501                 hidden_inputs[name] = value
1502         return hidden_inputs
1503
1504     def _form_hidden_inputs(self, form_id, html):
1505         form = self._search_regex(
1506             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1507             html, '%s form' % form_id, group='form')
1508         return self._hidden_inputs(form)
1509
1510     class FormatSort:
1511         regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
1512
1513         default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
1514                    'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
1515                    'proto', 'ext', 'hasaud', 'source', 'format_id')  # These must not be aliases
1516         ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
1517                         'height', 'width', 'proto', 'vext', 'abr', 'aext',
1518                         'fps', 'fs_approx', 'source', 'format_id')
1519
1520         settings = {
1521             'vcodec': {'type': 'ordered', 'regex': True,
1522                        'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1523             'acodec': {'type': 'ordered', 'regex': True,
1524                        'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']},
1525             'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
1526                     'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
1527             'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1528                       'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', '.*dash', 'ws|websocket', '', 'mms|rtsp', 'none', 'f4']},
1529             'vext': {'type': 'ordered', 'field': 'video_ext',
1530                      'order': ('mp4', 'webm', 'flv', '', 'none'),
1531                      'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1532             'aext': {'type': 'ordered', 'field': 'audio_ext',
1533                      'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1534                      'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1535             'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1536             'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
1537                            'field': ('vcodec', 'acodec'),
1538                            'function': lambda it: int(any(v != 'none' for v in it))},
1539             'ie_pref': {'priority': True, 'type': 'extractor'},
1540             'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1541             'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1542             'lang': {'convert': 'ignore', 'field': 'language_preference'},
1543             'quality': {'convert': 'float_none', 'default': -1},
1544             'filesize': {'convert': 'bytes'},
1545             'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1546             'id': {'convert': 'string', 'field': 'format_id'},
1547             'height': {'convert': 'float_none'},
1548             'width': {'convert': 'float_none'},
1549             'fps': {'convert': 'float_none'},
1550             'tbr': {'convert': 'float_none'},
1551             'vbr': {'convert': 'float_none'},
1552             'abr': {'convert': 'float_none'},
1553             'asr': {'convert': 'float_none'},
1554             'source': {'convert': 'ignore', 'field': 'source_preference'},
1555
1556             'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1557             'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1558             'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1559             'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1560             'res': {'type': 'multiple', 'field': ('height', 'width'),
1561                     'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
1562
1563             # Most of these exist only for compatibility reasons
1564             'dimension': {'type': 'alias', 'field': 'res'},
1565             'resolution': {'type': 'alias', 'field': 'res'},
1566             'extension': {'type': 'alias', 'field': 'ext'},
1567             'bitrate': {'type': 'alias', 'field': 'br'},
1568             'total_bitrate': {'type': 'alias', 'field': 'tbr'},
1569             'video_bitrate': {'type': 'alias', 'field': 'vbr'},
1570             'audio_bitrate': {'type': 'alias', 'field': 'abr'},
1571             'framerate': {'type': 'alias', 'field': 'fps'},
1572             'language_preference': {'type': 'alias', 'field': 'lang'},  # not named as 'language' because such a field exists
1573             'protocol': {'type': 'alias', 'field': 'proto'},
1574             'source_preference': {'type': 'alias', 'field': 'source'},
1575             'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1576             'filesize_estimate': {'type': 'alias', 'field': 'size'},
1577             'samplerate': {'type': 'alias', 'field': 'asr'},
1578             'video_ext': {'type': 'alias', 'field': 'vext'},
1579             'audio_ext': {'type': 'alias', 'field': 'aext'},
1580             'video_codec': {'type': 'alias', 'field': 'vcodec'},
1581             'audio_codec': {'type': 'alias', 'field': 'acodec'},
1582             'video': {'type': 'alias', 'field': 'hasvid'},
1583             'has_video': {'type': 'alias', 'field': 'hasvid'},
1584             'audio': {'type': 'alias', 'field': 'hasaud'},
1585             'has_audio': {'type': 'alias', 'field': 'hasaud'},
1586             'extractor': {'type': 'alias', 'field': 'ie_pref'},
1587             'preference': {'type': 'alias', 'field': 'ie_pref'},
1588             'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
1589             'format_id': {'type': 'alias', 'field': 'id'},
1590         }
1591
1592         _order = []
1593
1594         def _get_field_setting(self, field, key):
1595             if field not in self.settings:
1596                 self.settings[field] = {}
1597             propObj = self.settings[field]
1598             if key not in propObj:
1599                 type = propObj.get('type')
1600                 if key == 'field':
1601                     default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1602                 elif key == 'convert':
1603                     default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1604                 else:
1605                     default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1606                 propObj[key] = default
1607             return propObj[key]
1608
1609         def _resolve_field_value(self, field, value, convertNone=False):
1610             if value is None:
1611                 if not convertNone:
1612                     return None
1613             else:
1614                 value = value.lower()
1615             conversion = self._get_field_setting(field, 'convert')
1616             if conversion == 'ignore':
1617                 return None
1618             if conversion == 'string':
1619                 return value
1620             elif conversion == 'float_none':
1621                 return float_or_none(value)
1622             elif conversion == 'bytes':
1623                 return FileDownloader.parse_bytes(value)
1624             elif conversion == 'order':
1625                 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1626                 use_regex = self._get_field_setting(field, 'regex')
1627                 list_length = len(order_list)
1628                 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1629                 if use_regex and value is not None:
1630                     for i, regex in enumerate(order_list):
1631                         if regex and re.match(regex, value):
1632                             return list_length - i
1633                     return list_length - empty_pos  # not in list
1634                 else:  # not regex or  value = None
1635                     return list_length - (order_list.index(value) if value in order_list else empty_pos)
1636             else:
1637                 if value.isnumeric():
1638                     return float(value)
1639                 else:
1640                     self.settings[field]['convert'] = 'string'
1641                     return value
1642
1643         def evaluate_params(self, params, sort_extractor):
1644             self._use_free_order = params.get('prefer_free_formats', False)
1645             self._sort_user = params.get('format_sort', [])
1646             self._sort_extractor = sort_extractor
1647
1648             def add_item(field, reverse, closest, limit_text):
1649                 field = field.lower()
1650                 if field in self._order:
1651                     return
1652                 self._order.append(field)
1653                 limit = self._resolve_field_value(field, limit_text)
1654                 data = {
1655                     'reverse': reverse,
1656                     'closest': False if limit is None else closest,
1657                     'limit_text': limit_text,
1658                     'limit': limit}
1659                 if field in self.settings:
1660                     self.settings[field].update(data)
1661                 else:
1662                     self.settings[field] = data
1663
1664             sort_list = (
1665                 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1666                 + (tuple() if params.get('format_sort_force', False)
1667                    else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1668                 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1669
1670             for item in sort_list:
1671                 match = re.match(self.regex, item)
1672                 if match is None:
1673                     raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1674                 field = match.group('field')
1675                 if field is None:
1676                     continue
1677                 if self._get_field_setting(field, 'type') == 'alias':
1678                     field = self._get_field_setting(field, 'field')
1679                 reverse = match.group('reverse') is not None
1680                 closest = match.group('separator') == '~'
1681                 limit_text = match.group('limit')
1682
1683                 has_limit = limit_text is not None
1684                 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1685                 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1686
1687                 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1688                 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
1689                 limit_count = len(limits)
1690                 for (i, f) in enumerate(fields):
1691                     add_item(f, reverse, closest,
1692                              limits[i] if i < limit_count
1693                              else limits[0] if has_limit and not has_multiple_limits
1694                              else None)
1695
1696         def print_verbose_info(self, write_debug):
1697             if self._sort_user:
1698                 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1699             if self._sort_extractor:
1700                 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1701             write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1702                 '+' if self._get_field_setting(field, 'reverse') else '', field,
1703                 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1704                               self._get_field_setting(field, 'limit_text'),
1705                               self._get_field_setting(field, 'limit'))
1706                 if self._get_field_setting(field, 'limit_text') is not None else '')
1707                 for field in self._order if self._get_field_setting(field, 'visible')]))
1708
1709         def _calculate_field_preference_from_value(self, format, field, type, value):
1710             reverse = self._get_field_setting(field, 'reverse')
1711             closest = self._get_field_setting(field, 'closest')
1712             limit = self._get_field_setting(field, 'limit')
1713
1714             if type == 'extractor':
1715                 maximum = self._get_field_setting(field, 'max')
1716                 if value is None or (maximum is not None and value >= maximum):
1717                     value = -1
1718             elif type == 'boolean':
1719                 in_list = self._get_field_setting(field, 'in_list')
1720                 not_in_list = self._get_field_setting(field, 'not_in_list')
1721                 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1722             elif type == 'ordered':
1723                 value = self._resolve_field_value(field, value, True)
1724
1725             # try to convert to number
1726             val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1727             is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1728             if is_num:
1729                 value = val_num
1730
1731             return ((-10, 0) if value is None
1732                     else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
1733                     else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1734                     else (0, value, 0) if not reverse and (limit is None or value <= limit)
1735                     else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1736                     else (-1, value, 0))
1737
1738         def _calculate_field_preference(self, format, field):
1739             type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
1740             get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1741             if type == 'multiple':
1742                 type = 'field'  # Only 'field' is allowed in multiple for now
1743                 actual_fields = self._get_field_setting(field, 'field')
1744
1745                 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1746             else:
1747                 value = get_value(field)
1748             return self._calculate_field_preference_from_value(format, field, type, value)
1749
1750         def calculate_preference(self, format):
1751             # Determine missing protocol
1752             if not format.get('protocol'):
1753                 format['protocol'] = determine_protocol(format)
1754
1755             # Determine missing ext
1756             if not format.get('ext') and 'url' in format:
1757                 format['ext'] = determine_ext(format['url'])
1758             if format.get('vcodec') == 'none':
1759                 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1760                 format['video_ext'] = 'none'
1761             else:
1762                 format['video_ext'] = format['ext']
1763                 format['audio_ext'] = 'none'
1764             # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
1765             #    format['preference'] = -1000
1766
1767             # Determine missing bitrates
1768             if format.get('tbr') is None:
1769                 if format.get('vbr') is not None and format.get('abr') is not None:
1770                     format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1771             else:
1772                 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1773                     format['vbr'] = format.get('tbr') - format.get('abr', 0)
1774                 if format.get('acodec') != 'none' and format.get('abr') is None:
1775                     format['abr'] = format.get('tbr') - format.get('vbr', 0)
1776
1777             return tuple(self._calculate_field_preference(format, field) for field in self._order)
1778
1779     def _sort_formats(self, formats, field_preference=[]):
1780         if not formats:
1781             return
1782         format_sort = self.FormatSort()  # params and to_screen are taken from the downloader
1783         format_sort.evaluate_params(self._downloader.params, field_preference)
1784         if self.get_param('verbose', False):
1785             format_sort.print_verbose_info(self._downloader.write_debug)
1786         formats.sort(key=lambda f: format_sort.calculate_preference(f))
1787
1788     def _check_formats(self, formats, video_id):
1789         if formats:
1790             formats[:] = filter(
1791                 lambda f: self._is_valid_url(
1792                     f['url'], video_id,
1793                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1794                 formats)
1795
1796     @staticmethod
1797     def _remove_duplicate_formats(formats):
1798         format_urls = set()
1799         unique_formats = []
1800         for f in formats:
1801             if f['url'] not in format_urls:
1802                 format_urls.add(f['url'])
1803                 unique_formats.append(f)
1804         formats[:] = unique_formats
1805
1806     def _is_valid_url(self, url, video_id, item='video', headers={}):
1807         url = self._proto_relative_url(url, scheme='http:')
1808         # For now assume non HTTP(S) URLs always valid
1809         if not (url.startswith('http://') or url.startswith('https://')):
1810             return True
1811         try:
1812             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1813             return True
1814         except ExtractorError as e:
1815             self.to_screen(
1816                 '%s: %s URL is invalid, skipping: %s'
1817                 % (video_id, item, error_to_compat_str(e.cause)))
1818             return False
1819
1820     def http_scheme(self):
1821         """ Either "http:" or "https:", depending on the user's preferences """
1822         return (
1823             'http:'
1824             if self.get_param('prefer_insecure', False)
1825             else 'https:')
1826
1827     def _proto_relative_url(self, url, scheme=None):
1828         if url is None:
1829             return url
1830         if url.startswith('//'):
1831             if scheme is None:
1832                 scheme = self.http_scheme()
1833             return scheme + url
1834         else:
1835             return url
1836
1837     def _sleep(self, timeout, video_id, msg_template=None):
1838         if msg_template is None:
1839             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1840         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1841         self.to_screen(msg)
1842         time.sleep(timeout)
1843
1844     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1845                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1846                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1847         manifest = self._download_xml(
1848             manifest_url, video_id, 'Downloading f4m manifest',
1849             'Unable to download f4m manifest',
1850             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1851             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1852             transform_source=transform_source,
1853             fatal=fatal, data=data, headers=headers, query=query)
1854
1855         if manifest is False:
1856             return []
1857
1858         return self._parse_f4m_formats(
1859             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1860             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1861
1862     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1863                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1864                            fatal=True, m3u8_id=None):
1865         if not isinstance(manifest, compat_etree_Element) and not fatal:
1866             return []
1867
1868         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1869         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1870         if akamai_pv is not None and ';' in akamai_pv.text:
1871             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1872             if playerVerificationChallenge.strip() != '':
1873                 return []
1874
1875         formats = []
1876         manifest_version = '1.0'
1877         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1878         if not media_nodes:
1879             manifest_version = '2.0'
1880             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1881         # Remove unsupported DRM protected media from final formats
1882         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1883         media_nodes = remove_encrypted_media(media_nodes)
1884         if not media_nodes:
1885             return formats
1886
1887         manifest_base_url = get_base_url(manifest)
1888
1889         bootstrap_info = xpath_element(
1890             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1891             'bootstrap info', default=None)
1892
1893         vcodec = None
1894         mime_type = xpath_text(
1895             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1896             'base URL', default=None)
1897         if mime_type and mime_type.startswith('audio/'):
1898             vcodec = 'none'
1899
1900         for i, media_el in enumerate(media_nodes):
1901             tbr = int_or_none(media_el.attrib.get('bitrate'))
1902             width = int_or_none(media_el.attrib.get('width'))
1903             height = int_or_none(media_el.attrib.get('height'))
1904             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1905             # If <bootstrapInfo> is present, the specified f4m is a
1906             # stream-level manifest, and only set-level manifests may refer to
1907             # external resources.  See section 11.4 and section 4 of F4M spec
1908             if bootstrap_info is None:
1909                 media_url = None
1910                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1911                 if manifest_version == '2.0':
1912                     media_url = media_el.attrib.get('href')
1913                 if media_url is None:
1914                     media_url = media_el.attrib.get('url')
1915                 if not media_url:
1916                     continue
1917                 manifest_url = (
1918                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1919                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1920                 # If media_url is itself a f4m manifest do the recursive extraction
1921                 # since bitrates in parent manifest (this one) and media_url manifest
1922                 # may differ leading to inability to resolve the format by requested
1923                 # bitrate in f4m downloader
1924                 ext = determine_ext(manifest_url)
1925                 if ext == 'f4m':
1926                     f4m_formats = self._extract_f4m_formats(
1927                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1928                         transform_source=transform_source, fatal=fatal)
1929                     # Sometimes stream-level manifest contains single media entry that
1930                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1931                     # At the same time parent's media entry in set-level manifest may
1932                     # contain it. We will copy it from parent in such cases.
1933                     if len(f4m_formats) == 1:
1934                         f = f4m_formats[0]
1935                         f.update({
1936                             'tbr': f.get('tbr') or tbr,
1937                             'width': f.get('width') or width,
1938                             'height': f.get('height') or height,
1939                             'format_id': f.get('format_id') if not tbr else format_id,
1940                             'vcodec': vcodec,
1941                         })
1942                     formats.extend(f4m_formats)
1943                     continue
1944                 elif ext == 'm3u8':
1945                     formats.extend(self._extract_m3u8_formats(
1946                         manifest_url, video_id, 'mp4', preference=preference,
1947                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1948                     continue
1949             formats.append({
1950                 'format_id': format_id,
1951                 'url': manifest_url,
1952                 'manifest_url': manifest_url,
1953                 'ext': 'flv' if bootstrap_info is not None else None,
1954                 'protocol': 'f4m',
1955                 'tbr': tbr,
1956                 'width': width,
1957                 'height': height,
1958                 'vcodec': vcodec,
1959                 'preference': preference,
1960                 'quality': quality,
1961             })
1962         return formats
1963
1964     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1965         return {
1966             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1967             'url': m3u8_url,
1968             'ext': ext,
1969             'protocol': 'm3u8',
1970             'preference': preference - 100 if preference else -100,
1971             'quality': quality,
1972             'resolution': 'multiple',
1973             'format_note': 'Quality selection URL',
1974         }
1975
1976     def _report_ignoring_subs(self, name):
1977         self.report_warning(bug_reports_message(
1978             f'Ignoring subtitle tracks found in the {name} manifest; '
1979             'if any subtitle tracks are missing,'
1980         ), only_once=True)
1981
1982     def _extract_m3u8_formats(self, *args, **kwargs):
1983         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
1984         if subs:
1985             self._report_ignoring_subs('HLS')
1986         return fmts
1987
1988     def _extract_m3u8_formats_and_subtitles(
1989             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
1990             preference=None, quality=None, m3u8_id=None, note=None,
1991             errnote=None, fatal=True, live=False, data=None, headers={},
1992             query={}):
1993
1994         res = self._download_webpage_handle(
1995             m3u8_url, video_id,
1996             note='Downloading m3u8 information' if note is None else note,
1997             errnote='Failed to download m3u8 information' if errnote is None else errnote,
1998             fatal=fatal, data=data, headers=headers, query=query)
1999
2000         if res is False:
2001             return [], {}
2002
2003         m3u8_doc, urlh = res
2004         m3u8_url = urlh.geturl()
2005
2006         return self._parse_m3u8_formats_and_subtitles(
2007             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2008             preference=preference, quality=quality, m3u8_id=m3u8_id,
2009             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2010             headers=headers, query=query, video_id=video_id)
2011
2012     def _parse_m3u8_formats_and_subtitles(
2013             self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8_native',
2014             preference=None, quality=None, m3u8_id=None, live=False, note=None,
2015             errnote=None, fatal=True, data=None, headers={}, query={},
2016             video_id=None):
2017         formats, subtitles = [], {}
2018
2019         if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
2020             return formats, subtitles
2021
2022         has_drm = re.search(r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', m3u8_doc)
2023
2024         def format_url(url):
2025             return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
2026
2027         if self.get_param('hls_split_discontinuity', False):
2028             def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2029                 if not m3u8_doc:
2030                     if not manifest_url:
2031                         return []
2032                     m3u8_doc = self._download_webpage(
2033                         manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2034                         note=False, errnote='Failed to download m3u8 playlist information')
2035                     if m3u8_doc is False:
2036                         return []
2037                 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2038
2039         else:
2040             def _extract_m3u8_playlist_indices(*args, **kwargs):
2041                 return [None]
2042
2043         # References:
2044         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2045         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2046         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2047
2048         # We should try extracting formats only from master playlists [1, 4.3.4],
2049         # i.e. playlists that describe available qualities. On the other hand
2050         # media playlists [1, 4.3.3] should be returned as is since they contain
2051         # just the media without qualities renditions.
2052         # Fortunately, master playlist can be easily distinguished from media
2053         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
2054         # master playlist tags MUST NOT appear in a media playlist and vice versa.
2055         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2056         # media playlist and MUST NOT appear in master playlist thus we can
2057         # clearly detect media playlist with this criterion.
2058
2059         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
2060             formats = [{
2061                 'format_id': '-'.join(map(str, filter(None, [m3u8_id, idx]))),
2062                 'format_index': idx,
2063                 'url': m3u8_url,
2064                 'ext': ext,
2065                 'protocol': entry_protocol,
2066                 'preference': preference,
2067                 'quality': quality,
2068                 'has_drm': has_drm,
2069             } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2070
2071             return formats, subtitles
2072
2073         groups = {}
2074         last_stream_inf = {}
2075
2076         def extract_media(x_media_line):
2077             media = parse_m3u8_attributes(x_media_line)
2078             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2079             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2080             if not (media_type and group_id and name):
2081                 return
2082             groups.setdefault(group_id, []).append(media)
2083             # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2084             if media_type == 'SUBTITLES':
2085                 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2086                 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2087                 # However, lack of URI has been spotted in the wild.
2088                 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2089                 if not media.get('URI'):
2090                     return
2091                 url = format_url(media['URI'])
2092                 sub_info = {
2093                     'url': url,
2094                     'ext': determine_ext(url),
2095                 }
2096                 if sub_info['ext'] == 'm3u8':
2097                     # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2098                     # files may contain is WebVTT:
2099                     # <https://tools.ietf.org/html/rfc8216#section-3.1>
2100                     sub_info['ext'] = 'vtt'
2101                     sub_info['protocol'] = 'm3u8_native'
2102                 lang = media.get('LANGUAGE') or 'und'
2103                 subtitles.setdefault(lang, []).append(sub_info)
2104             if media_type not in ('VIDEO', 'AUDIO'):
2105                 return
2106             media_url = media.get('URI')
2107             if media_url:
2108                 manifest_url = format_url(media_url)
2109                 formats.extend({
2110                     'format_id': '-'.join(map(str, filter(None, (m3u8_id, group_id, name, idx)))),
2111                     'format_note': name,
2112                     'format_index': idx,
2113                     'url': manifest_url,
2114                     'manifest_url': m3u8_url,
2115                     'language': media.get('LANGUAGE'),
2116                     'ext': ext,
2117                     'protocol': entry_protocol,
2118                     'preference': preference,
2119                     'quality': quality,
2120                     'vcodec': 'none' if media_type == 'AUDIO' else None,
2121                 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2122
2123         def build_stream_name():
2124             # Despite specification does not mention NAME attribute for
2125             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2126             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2127             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2128             stream_name = last_stream_inf.get('NAME')
2129             if stream_name:
2130                 return stream_name
2131             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2132             # from corresponding rendition group
2133             stream_group_id = last_stream_inf.get('VIDEO')
2134             if not stream_group_id:
2135                 return
2136             stream_group = groups.get(stream_group_id)
2137             if not stream_group:
2138                 return stream_group_id
2139             rendition = stream_group[0]
2140             return rendition.get('NAME') or stream_group_id
2141
2142         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2143         # chance to detect video only formats when EXT-X-STREAM-INF tags
2144         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2145         for line in m3u8_doc.splitlines():
2146             if line.startswith('#EXT-X-MEDIA:'):
2147                 extract_media(line)
2148
2149         for line in m3u8_doc.splitlines():
2150             if line.startswith('#EXT-X-STREAM-INF:'):
2151                 last_stream_inf = parse_m3u8_attributes(line)
2152             elif line.startswith('#') or not line.strip():
2153                 continue
2154             else:
2155                 tbr = float_or_none(
2156                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2157                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2158                 manifest_url = format_url(line.strip())
2159
2160                 for idx in _extract_m3u8_playlist_indices(manifest_url):
2161                     format_id = [m3u8_id, None, idx]
2162                     # Bandwidth of live streams may differ over time thus making
2163                     # format_id unpredictable. So it's better to keep provided
2164                     # format_id intact.
2165                     if not live:
2166                         stream_name = build_stream_name()
2167                         format_id[1] = stream_name if stream_name else '%d' % (tbr if tbr else len(formats))
2168                     f = {
2169                         'format_id': '-'.join(map(str, filter(None, format_id))),
2170                         'format_index': idx,
2171                         'url': manifest_url,
2172                         'manifest_url': m3u8_url,
2173                         'tbr': tbr,
2174                         'ext': ext,
2175                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2176                         'protocol': entry_protocol,
2177                         'preference': preference,
2178                         'quality': quality,
2179                     }
2180                     resolution = last_stream_inf.get('RESOLUTION')
2181                     if resolution:
2182                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2183                         if mobj:
2184                             f['width'] = int(mobj.group('width'))
2185                             f['height'] = int(mobj.group('height'))
2186                     # Unified Streaming Platform
2187                     mobj = re.search(
2188                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2189                     if mobj:
2190                         abr, vbr = mobj.groups()
2191                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2192                         f.update({
2193                             'vbr': vbr,
2194                             'abr': abr,
2195                         })
2196                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2197                     f.update(codecs)
2198                     audio_group_id = last_stream_inf.get('AUDIO')
2199                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2200                     # references a rendition group MUST have a CODECS attribute.
2201                     # However, this is not always respected, for example, [2]
2202                     # contains EXT-X-STREAM-INF tag which references AUDIO
2203                     # rendition group but does not have CODECS and despite
2204                     # referencing an audio group it represents a complete
2205                     # (with audio and video) format. So, for such cases we will
2206                     # ignore references to rendition groups and treat them
2207                     # as complete formats.
2208                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2209                         audio_group = groups.get(audio_group_id)
2210                         if audio_group and audio_group[0].get('URI'):
2211                             # TODO: update acodec for audio only formats with
2212                             # the same GROUP-ID
2213                             f['acodec'] = 'none'
2214                     if not f.get('ext'):
2215                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2216                     formats.append(f)
2217
2218                     # for DailyMotion
2219                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2220                     if progressive_uri:
2221                         http_f = f.copy()
2222                         del http_f['manifest_url']
2223                         http_f.update({
2224                             'format_id': f['format_id'].replace('hls-', 'http-'),
2225                             'protocol': 'http',
2226                             'url': progressive_uri,
2227                         })
2228                         formats.append(http_f)
2229
2230                 last_stream_inf = {}
2231         return formats, subtitles
2232
2233     def _extract_m3u8_vod_duration(
2234             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2235
2236         m3u8_vod = self._download_webpage(
2237             m3u8_vod_url, video_id,
2238             note='Downloading m3u8 VOD manifest' if note is None else note,
2239             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2240             fatal=False, data=data, headers=headers, query=query)
2241
2242         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2243
2244     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2245         if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2246             return None
2247
2248         return int(sum(
2249             float(line[len('#EXTINF:'):].split(',')[0])
2250             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2251
2252     @staticmethod
2253     def _xpath_ns(path, namespace=None):
2254         if not namespace:
2255             return path
2256         out = []
2257         for c in path.split('/'):
2258             if not c or c == '.':
2259                 out.append(c)
2260             else:
2261                 out.append('{%s}%s' % (namespace, c))
2262         return '/'.join(out)
2263
2264     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2265         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2266
2267         if smil is False:
2268             assert not fatal
2269             return []
2270
2271         namespace = self._parse_smil_namespace(smil)
2272
2273         fmts = self._parse_smil_formats(
2274             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2275         subs = self._parse_smil_subtitles(
2276             smil, namespace=namespace)
2277
2278         return fmts, subs
2279
2280     def _extract_smil_formats(self, *args, **kwargs):
2281         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2282         if subs:
2283             self._report_ignoring_subs('SMIL')
2284         return fmts
2285
2286     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2287         smil = self._download_smil(smil_url, video_id, fatal=fatal)
2288         if smil is False:
2289             return {}
2290         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2291
2292     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2293         return self._download_xml(
2294             smil_url, video_id, 'Downloading SMIL file',
2295             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2296
2297     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2298         namespace = self._parse_smil_namespace(smil)
2299
2300         formats = self._parse_smil_formats(
2301             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2302         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2303
2304         video_id = os.path.splitext(url_basename(smil_url))[0]
2305         title = None
2306         description = None
2307         upload_date = None
2308         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2309             name = meta.attrib.get('name')
2310             content = meta.attrib.get('content')
2311             if not name or not content:
2312                 continue
2313             if not title and name == 'title':
2314                 title = content
2315             elif not description and name in ('description', 'abstract'):
2316                 description = content
2317             elif not upload_date and name == 'date':
2318                 upload_date = unified_strdate(content)
2319
2320         thumbnails = [{
2321             'id': image.get('type'),
2322             'url': image.get('src'),
2323             'width': int_or_none(image.get('width')),
2324             'height': int_or_none(image.get('height')),
2325         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2326
2327         return {
2328             'id': video_id,
2329             'title': title or video_id,
2330             'description': description,
2331             'upload_date': upload_date,
2332             'thumbnails': thumbnails,
2333             'formats': formats,
2334             'subtitles': subtitles,
2335         }
2336
2337     def _parse_smil_namespace(self, smil):
2338         return self._search_regex(
2339             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2340
    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        """Build format dicts from the media elements of a SMIL document.

        Every <video> and <audio> element with a not yet seen src is turned
        into one or more formats depending on its protocol/extension (RTMP,
        HLS, HDS, DASH, MSS or plain HTTP). <imagestream> elements are added
        as storyboard formats without audio or video.

        smil_url is the base for relative src values unless a ./head/meta
        element provides a 'base' or 'httpBase' attribute. f4m_params are the
        query parameters appended to f4m manifest URLs (defaults are used when
        empty). transform_rtmp_url, when given, is called with
        (streamer, src) and must return the pair to use for RTMP formats.

        Returns the list of format dicts.
        """
        base = smil_url
        # The first meta element carrying a base attribute overrides smil_url
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
            if b:
                base = b
                break

        formats = []
        # Per-protocol counters, used for format ids when no bitrate is known
        rtmp_count = 0
        http_count = 0
        m3u8_count = 0
        imgs_count = 0

        # src values already handled; duplicates are skipped
        srcs = set()
        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
        for medium in media:
            src = medium.get('src')
            if not src or src in srcs:
                continue
            srcs.add(src)

            # system-bitrate/systemBitrate is divided by 1000 here
            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
            width = int_or_none(medium.get('width'))
            height = int_or_none(medium.get('height'))
            proto = medium.get('proto')
            ext = medium.get('ext')
            src_ext = determine_ext(src)
            streamer = medium.get('streamer') or base

            if proto == 'rtmp' or streamer.startswith('rtmp'):
                rtmp_count += 1
                formats.append({
                    'url': streamer,
                    'play_path': src,
                    'ext': 'flv',
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                if transform_rtmp_url:
                    # Let the caller rewrite the URL/play path of the format just added
                    streamer, src = transform_rtmp_url(streamer, src)
                    formats[-1].update({
                        'url': streamer,
                        'play_path': src,
                    })
                continue

            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
            src_url = src_url.strip()

            if proto == 'm3u8' or src_ext == 'm3u8':
                m3u8_formats = self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
                # A single resulting format takes its metadata from the SMIL element
                if len(m3u8_formats) == 1:
                    m3u8_count += 1
                    m3u8_formats[0].update({
                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                        'tbr': bitrate,
                        'width': width,
                        'height': height,
                    })
                formats.extend(m3u8_formats)
            elif src_ext == 'f4m':
                f4m_url = src_url
                # NOTE: the defaults assigned here stay in effect for the
                # remaining media elements of this call
                if not f4m_params:
                    f4m_params = {
                        'hdcore': '3.2.0',
                        'plugin': 'flowplayer-3.2.0.1',
                    }
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse_urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
            elif src_ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    src_url, video_id, mpd_id='dash', fatal=False))
            elif re.search(r'\.ism/[Mm]anifest', src_url):
                formats.extend(self._extract_ism_formats(
                    src_url, video_id, ism_id='mss', fatal=False))
            # NOTE(review): the validity check receives the raw src, while the
            # format uses the resolved src_url — confirm this is intended
            elif src_url.startswith('http') and self._is_valid_url(src, video_id):
                http_count += 1
                formats.append({
                    'url': src_url,
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })

        # Image streams share the duplicate filter with the media above;
        # their src is used as is (it is not joined with base)
        for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
            src = medium.get('src')
            if not src or src in srcs:
                continue
            srcs.add(src)

            imgs_count += 1
            formats.append({
                'format_id': 'imagestream-%d' % (imgs_count),
                'url': src,
                'ext': mimetype2ext(medium.get('type')),
                'acodec': 'none',
                'vcodec': 'none',
                'width': int_or_none(medium.get('width')),
                'height': int_or_none(medium.get('height')),
                'format_note': 'SMIL storyboards',
            })

        return formats
2454
2455     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2456         urls = []
2457         subtitles = {}
2458         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2459             src = textstream.get('src')
2460             if not src or src in urls:
2461                 continue
2462             urls.append(src)
2463             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2464             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2465             subtitles.setdefault(lang, []).append({
2466                 'url': src,
2467                 'ext': ext,
2468             })
2469         return subtitles
2470
2471     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2472         xspf = self._download_xml(
2473             xspf_url, playlist_id, 'Downloading xpsf playlist',
2474             'Unable to download xspf manifest', fatal=fatal)
2475         if xspf is False:
2476             return []
2477         return self._parse_xspf(
2478             xspf, playlist_id, xspf_url=xspf_url,
2479             xspf_base_url=base_url(xspf_url))
2480
2481     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2482         NS_MAP = {
2483             'xspf': 'http://xspf.org/ns/0/',
2484             's1': 'http://static.streamone.nl/player/ns/0',
2485         }
2486
2487         entries = []
2488         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2489             title = xpath_text(
2490                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2491             description = xpath_text(
2492                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2493             thumbnail = xpath_text(
2494                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2495             duration = float_or_none(
2496                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2497
2498             formats = []
2499             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2500                 format_url = urljoin(xspf_base_url, location.text)
2501                 if not format_url:
2502                     continue
2503                 formats.append({
2504                     'url': format_url,
2505                     'manifest_url': xspf_url,
2506                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2507                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2508                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2509                 })
2510             self._sort_formats(formats)
2511
2512             entries.append({
2513                 'id': playlist_id,
2514                 'title': title,
2515                 'description': description,
2516                 'thumbnail': thumbnail,
2517                 'duration': duration,
2518                 'formats': formats,
2519             })
2520         return entries
2521
2522     def _extract_mpd_formats(self, *args, **kwargs):
2523         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2524         if subs:
2525             self._report_ignoring_subs('DASH')
2526         return fmts
2527
2528     def _extract_mpd_formats_and_subtitles(
2529             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2530             fatal=True, data=None, headers={}, query={}):
2531         res = self._download_xml_handle(
2532             mpd_url, video_id,
2533             note='Downloading MPD manifest' if note is None else note,
2534             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2535             fatal=fatal, data=data, headers=headers, query=query)
2536         if res is False:
2537             return [], {}
2538         mpd_doc, urlh = res
2539         if mpd_doc is None:
2540             return [], {}
2541         mpd_base_url = base_url(urlh.geturl())
2542
2543         return self._parse_mpd_formats_and_subtitles(
2544             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2545
2546     def _parse_mpd_formats(self, *args, **kwargs):
2547         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2548         if subs:
2549             self._report_ignoring_subs('DASH')
2550         return fmts
2551
2552     def _parse_mpd_formats_and_subtitles(
2553             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2554         """
2555         Parse formats from MPD manifest.
2556         References:
2557          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2558             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2559          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2560         """
2561         if not self.get_param('dynamic_mpd', True):
2562             if mpd_doc.get('type') == 'dynamic':
2563                 return [], {}
2564
2565         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2566
2567         def _add_ns(path):
2568             return self._xpath_ns(path, namespace)
2569
2570         def is_drm_protected(element):
2571             return element.find(_add_ns('ContentProtection')) is not None
2572
2573         def extract_multisegment_info(element, ms_parent_info):
2574             ms_info = ms_parent_info.copy()
2575
2576             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2577             # common attributes and elements.  We will only extract relevant
2578             # for us.
2579             def extract_common(source):
2580                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2581                 if segment_timeline is not None:
2582                     s_e = segment_timeline.findall(_add_ns('S'))
2583                     if s_e:
2584                         ms_info['total_number'] = 0
2585                         ms_info['s'] = []
2586                         for s in s_e:
2587                             r = int(s.get('r', 0))
2588                             ms_info['total_number'] += 1 + r
2589                             ms_info['s'].append({
2590                                 't': int(s.get('t', 0)),
2591                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2592                                 'd': int(s.attrib['d']),
2593                                 'r': r,
2594                             })
2595                 start_number = source.get('startNumber')
2596                 if start_number:
2597                     ms_info['start_number'] = int(start_number)
2598                 timescale = source.get('timescale')
2599                 if timescale:
2600                     ms_info['timescale'] = int(timescale)
2601                 segment_duration = source.get('duration')
2602                 if segment_duration:
2603                     ms_info['segment_duration'] = float(segment_duration)
2604
2605             def extract_Initialization(source):
2606                 initialization = source.find(_add_ns('Initialization'))
2607                 if initialization is not None:
2608                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2609
2610             segment_list = element.find(_add_ns('SegmentList'))
2611             if segment_list is not None:
2612                 extract_common(segment_list)
2613                 extract_Initialization(segment_list)
2614                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2615                 if segment_urls_e:
2616                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2617             else:
2618                 segment_template = element.find(_add_ns('SegmentTemplate'))
2619                 if segment_template is not None:
2620                     extract_common(segment_template)
2621                     media = segment_template.get('media')
2622                     if media:
2623                         ms_info['media'] = media
2624                     initialization = segment_template.get('initialization')
2625                     if initialization:
2626                         ms_info['initialization'] = initialization
2627                     else:
2628                         extract_Initialization(segment_template)
2629             return ms_info
2630
2631         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2632         formats, subtitles = [], {}
2633         stream_numbers = {'audio': 0, 'video': 0}
2634         for period in mpd_doc.findall(_add_ns('Period')):
2635             period_duration = parse_duration(period.get('duration')) or mpd_duration
2636             period_ms_info = extract_multisegment_info(period, {
2637                 'start_number': 1,
2638                 'timescale': 1,
2639             })
2640             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2641                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2642                 for representation in adaptation_set.findall(_add_ns('Representation')):
2643                     representation_attrib = adaptation_set.attrib.copy()
2644                     representation_attrib.update(representation.attrib)
2645                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2646                     mime_type = representation_attrib['mimeType']
2647                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2648
2649                     codecs = representation_attrib.get('codecs', '')
2650                     if content_type not in ('video', 'audio', 'text'):
2651                         if mime_type == 'image/jpeg':
2652                             content_type = mime_type
2653                         elif codecs.split('.')[0] == 'stpp':
2654                             content_type = 'text'
2655                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2656                             content_type = 'text'
2657                         else:
2658                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2659                             continue
2660
2661                     base_url = ''
2662                     for element in (representation, adaptation_set, period, mpd_doc):
2663                         base_url_e = element.find(_add_ns('BaseURL'))
2664                         if base_url_e is not None:
2665                             base_url = base_url_e.text + base_url
2666                             if re.match(r'^https?://', base_url):
2667                                 break
2668                     if mpd_base_url and base_url.startswith('/'):
2669                         base_url = compat_urlparse.urljoin(mpd_base_url, base_url)
2670                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2671                         if not mpd_base_url.endswith('/'):
2672                             mpd_base_url += '/'
2673                         base_url = mpd_base_url + base_url
2674                     representation_id = representation_attrib.get('id')
2675                     lang = representation_attrib.get('lang')
2676                     url_el = representation.find(_add_ns('BaseURL'))
2677                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2678                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2679                     if representation_id is not None:
2680                         format_id = representation_id
2681                     else:
2682                         format_id = content_type
2683                     if mpd_id:
2684                         format_id = mpd_id + '-' + format_id
2685                     if content_type in ('video', 'audio'):
2686                         f = {
2687                             'format_id': format_id,
2688                             'manifest_url': mpd_url,
2689                             'ext': mimetype2ext(mime_type),
2690                             'width': int_or_none(representation_attrib.get('width')),
2691                             'height': int_or_none(representation_attrib.get('height')),
2692                             'tbr': float_or_none(bandwidth, 1000),
2693                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2694                             'fps': int_or_none(representation_attrib.get('frameRate')),
2695                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2696                             'format_note': 'DASH %s' % content_type,
2697                             'filesize': filesize,
2698                             'container': mimetype2ext(mime_type) + '_dash',
2699                             'manifest_stream_number': stream_numbers[content_type]
2700                         }
2701                         f.update(parse_codecs(codecs))
2702                         stream_numbers[content_type] += 1
2703                     elif content_type == 'text':
2704                         f = {
2705                             'ext': mimetype2ext(mime_type),
2706                             'manifest_url': mpd_url,
2707                             'filesize': filesize,
2708                         }
2709                     elif content_type == 'image/jpeg':
2710                         # See test case in VikiIE
2711                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2712                         f = {
2713                             'format_id': format_id,
2714                             'ext': 'mhtml',
2715                             'manifest_url': mpd_url,
2716                             'format_note': 'DASH storyboards (jpeg)',
2717                             'acodec': 'none',
2718                             'vcodec': 'none',
2719                         }
2720                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2721                         f['has_drm'] = True
2722                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2723
2724                     def prepare_template(template_name, identifiers):
2725                         tmpl = representation_ms_info[template_name]
2726                         # First of, % characters outside $...$ templates
2727                         # must be escaped by doubling for proper processing
2728                         # by % operator string formatting used further (see
2729                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2730                         t = ''
2731                         in_template = False
2732                         for c in tmpl:
2733                             t += c
2734                             if c == '$':
2735                                 in_template = not in_template
2736                             elif c == '%' and not in_template:
2737                                 t += c
2738                         # Next, $...$ templates are translated to their
2739                         # %(...) counterparts to be used with % operator
2740                         if representation_id is not None:
2741                             t = t.replace('$RepresentationID$', representation_id)
2742                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2743                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2744                         t.replace('$$', '$')
2745                         return t
2746
2747                     # @initialization is a regular template like @media one
2748                     # so it should be handled just the same way (see
2749                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2750                     if 'initialization' in representation_ms_info:
2751                         initialization_template = prepare_template(
2752                             'initialization',
2753                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2754                             # $Time$ shall not be included for @initialization thus
2755                             # only $Bandwidth$ remains
2756                             ('Bandwidth', ))
2757                         representation_ms_info['initialization_url'] = initialization_template % {
2758                             'Bandwidth': bandwidth,
2759                         }
2760
2761                     def location_key(location):
2762                         return 'url' if re.match(r'^https?://', location) else 'path'
2763
2764                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2765
2766                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2767                         media_location_key = location_key(media_template)
2768
2769                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2770                         # can't be used at the same time
2771                         if '%(Number' in media_template and 's' not in representation_ms_info:
2772                             segment_duration = None
2773                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2774                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2775                                 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2776                             representation_ms_info['fragments'] = [{
2777                                 media_location_key: media_template % {
2778                                     'Number': segment_number,
2779                                     'Bandwidth': bandwidth,
2780                                 },
2781                                 'duration': segment_duration,
2782                             } for segment_number in range(
2783                                 representation_ms_info['start_number'],
2784                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2785                         else:
2786                             # $Number*$ or $Time$ in media template with S list available
2787                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2788                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2789                             representation_ms_info['fragments'] = []
2790                             segment_time = 0
2791                             segment_d = None
2792                             segment_number = representation_ms_info['start_number']
2793
2794                             def add_segment_url():
2795                                 segment_url = media_template % {
2796                                     'Time': segment_time,
2797                                     'Bandwidth': bandwidth,
2798                                     'Number': segment_number,
2799                                 }
2800                                 representation_ms_info['fragments'].append({
2801                                     media_location_key: segment_url,
2802                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2803                                 })
2804
2805                             for num, s in enumerate(representation_ms_info['s']):
2806                                 segment_time = s.get('t') or segment_time
2807                                 segment_d = s['d']
2808                                 add_segment_url()
2809                                 segment_number += 1
2810                                 for r in range(s.get('r', 0)):
2811                                     segment_time += segment_d
2812                                     add_segment_url()
2813                                     segment_number += 1
2814                                 segment_time += segment_d
2815                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2816                         # No media template
2817                         # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2818                         # or any YouTube dashsegments video
2819                         fragments = []
2820                         segment_index = 0
2821                         timescale = representation_ms_info['timescale']
2822                         for s in representation_ms_info['s']:
2823                             duration = float_or_none(s['d'], timescale)
2824                             for r in range(s.get('r', 0) + 1):
2825                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
2826                                 fragments.append({
2827                                     location_key(segment_uri): segment_uri,
2828                                     'duration': duration,
2829                                 })
2830                                 segment_index += 1
2831                         representation_ms_info['fragments'] = fragments
2832                     elif 'segment_urls' in representation_ms_info:
2833                         # Segment URLs with no SegmentTimeline
2834                         # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2835                         # https://github.com/ytdl-org/youtube-dl/pull/14844
2836                         fragments = []
2837                         segment_duration = float_or_none(
2838                             representation_ms_info['segment_duration'],
2839                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2840                         for segment_url in representation_ms_info['segment_urls']:
2841                             fragment = {
2842                                 location_key(segment_url): segment_url,
2843                             }
2844                             if segment_duration:
2845                                 fragment['duration'] = segment_duration
2846                             fragments.append(fragment)
2847                         representation_ms_info['fragments'] = fragments
2848                     # If there is a fragments key available then we correctly recognized fragmented media.
2849                     # Otherwise we will assume unfragmented media with direct access. Technically, such
2850                     # assumption is not necessarily correct since we may simply have no support for
2851                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2852                     if 'fragments' in representation_ms_info:
2853                         f.update({
2854                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
2855                             'url': mpd_url or base_url,
2856                             'fragment_base_url': base_url,
2857                             'fragments': [],
2858                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2859                         })
2860                         if 'initialization_url' in representation_ms_info:
2861                             initialization_url = representation_ms_info['initialization_url']
2862                             if not f.get('url'):
2863                                 f['url'] = initialization_url
2864                             f['fragments'].append({location_key(initialization_url): initialization_url})
2865                         f['fragments'].extend(representation_ms_info['fragments'])
2866                     else:
2867                         # Assuming direct URL to unfragmented media.
2868                         f['url'] = base_url
2869                     if content_type in ('video', 'audio') or mime_type == 'image/jpeg':
2870                         formats.append(f)
2871                     elif content_type == 'text':
2872                         subtitles.setdefault(lang or 'und', []).append(f)
2873
2874         return formats, subtitles
2875
2876     def _extract_ism_formats(self, *args, **kwargs):
2877         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2878         if subs:
2879             self._report_ignoring_subs('ISM')
2880         return fmts
2881
2882     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2883         res = self._download_xml_handle(
2884             ism_url, video_id,
2885             note='Downloading ISM manifest' if note is None else note,
2886             errnote='Failed to download ISM manifest' if errnote is None else errnote,
2887             fatal=fatal, data=data, headers=headers, query=query)
2888         if res is False:
2889             return [], {}
2890         ism_doc, urlh = res
2891         if ism_doc is None:
2892             return [], {}
2893
2894         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
2895
2896     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
2897         """
2898         Parse formats from ISM manifest.
2899         References:
2900          1. [MS-SSTR]: Smooth Streaming Protocol,
2901             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2902         """
2903         if ism_doc.get('IsLive') == 'TRUE':
2904             return [], {}
2905
2906         duration = int(ism_doc.attrib['Duration'])
2907         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2908
2909         formats = []
2910         subtitles = {}
2911         for stream in ism_doc.findall('StreamIndex'):
2912             stream_type = stream.get('Type')
2913             if stream_type not in ('video', 'audio', 'text'):
2914                 continue
2915             url_pattern = stream.attrib['Url']
2916             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2917             stream_name = stream.get('Name')
2918             stream_language = stream.get('Language', 'und')
2919             for track in stream.findall('QualityLevel'):
2920                 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
2921                 # TODO: add support for WVC1 and WMAP
2922                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
2923                     self.report_warning('%s is not a supported codec' % fourcc)
2924                     continue
2925                 tbr = int(track.attrib['Bitrate']) // 1000
2926                 # [1] does not mention Width and Height attributes. However,
2927                 # they're often present while MaxWidth and MaxHeight are
2928                 # missing, so should be used as fallbacks
2929                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2930                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2931                 sampling_rate = int_or_none(track.get('SamplingRate'))
2932
2933                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2934                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2935
2936                 fragments = []
2937                 fragment_ctx = {
2938                     'time': 0,
2939                 }
2940                 stream_fragments = stream.findall('c')
2941                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2942                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2943                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2944                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2945                     if not fragment_ctx['duration']:
2946                         try:
2947                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2948                         except IndexError:
2949                             next_fragment_time = duration
2950                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2951                     for _ in range(fragment_repeat):
2952                         fragments.append({
2953                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2954                             'duration': fragment_ctx['duration'] / stream_timescale,
2955                         })
2956                         fragment_ctx['time'] += fragment_ctx['duration']
2957
2958                 format_id = []
2959                 if ism_id:
2960                     format_id.append(ism_id)
2961                 if stream_name:
2962                     format_id.append(stream_name)
2963                 format_id.append(compat_str(tbr))
2964
2965                 if stream_type == 'text':
2966                     subtitles.setdefault(stream_language, []).append({
2967                         'ext': 'ismt',
2968                         'protocol': 'ism',
2969                         'url': ism_url,
2970                         'manifest_url': ism_url,
2971                         'fragments': fragments,
2972                         '_download_params': {
2973                             'stream_type': stream_type,
2974                             'duration': duration,
2975                             'timescale': stream_timescale,
2976                             'fourcc': fourcc,
2977                             'language': stream_language,
2978                             'codec_private_data': track.get('CodecPrivateData'),
2979                         }
2980                     })
2981                 elif stream_type in ('video', 'audio'):
2982                     formats.append({
2983                         'format_id': '-'.join(format_id),
2984                         'url': ism_url,
2985                         'manifest_url': ism_url,
2986                         'ext': 'ismv' if stream_type == 'video' else 'isma',
2987                         'width': width,
2988                         'height': height,
2989                         'tbr': tbr,
2990                         'asr': sampling_rate,
2991                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
2992                         'acodec': 'none' if stream_type == 'video' else fourcc,
2993                         'protocol': 'ism',
2994                         'fragments': fragments,
2995                         'has_drm': ism_doc.find('Protection') is not None,
2996                         '_download_params': {
2997                             'stream_type': stream_type,
2998                             'duration': duration,
2999                             'timescale': stream_timescale,
3000                             'width': width or 0,
3001                             'height': height or 0,
3002                             'fourcc': fourcc,
3003                             'language': stream_language,
3004                             'codec_private_data': track.get('CodecPrivateData'),
3005                             'sampling_rate': sampling_rate,
3006                             'channels': int_or_none(track.get('Channels', 2)),
3007                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3008                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3009                         },
3010                     })
3011         return formats, subtitles
3012
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
        """
        Collect media entries from HTML5-style <video>/<audio> tags in a webpage.

        Besides plain <video>/<audio>, the amp-* and dl8-* prefixed variants
        are matched. For every tag, formats are built from its own "src"
        attribute and from nested <source> tags, and subtitles from nested
        <track> tags whose "kind" is missing, "subtitles" or "captions".

        @param base_url  URL that relative media URLs are joined against; it
                         is also set as the Referer header of every format
        @param webpage   HTML text to scan
        @param video_id  Passed through to the m3u8/mpd manifest extractors
        @param m3u8_id, m3u8_entry_protocol, preference, quality
                         Passed through to _extract_m3u8_formats
        @param mpd_id    Passed through to _extract_mpd_formats
        @returns         A list of dicts with "formats", "subtitles" and
                         "thumbnail" keys; tags yielding neither formats nor
                         subtitles are omitted
        """
        # Resolve a possibly relative URL against the page's base URL
        def absolute_url(item_url):
            return urljoin(base_url, item_url)

        # Turn a <source type="..."> value into a partial format dict:
        # codec fields from the optional codecs=... parameter plus "ext"
        # derived from the mimetype. Returns {} when nothing can be parsed
        def parse_content_type(content_type):
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        # Returns (is_plain_url, formats). Manifest URLs (m3u8/mpd) are expanded
        # into their formats; anything else yields a single direct-URL format.
        # type_info is only read here, so the shared default dict is harmless
        def _media_formats(src, cur_media_type, type_info={}):
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, quality=quality, fatal=False)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                }]
            return is_plain_url, formats

        entries = []
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we will include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
        # Self-closing tags first; they have no inner content, hence the ''
        media_tags = [(media_tag, media_tag_name, media_type, '')
                      for media_tag, media_tag_name, media_type
                      in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
        for media_tag, _, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            src = strip_or_none(media_attributes.get('src'))
            if src:
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            if media_content:
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    s_attr = extract_attributes(source_tag)
                    # data-video-src and data-src are non standard but seen
                    # several times in the wild
                    src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
                    if not src:
                        continue
                    f = parse_content_type(s_attr.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                    if is_plain_url:
                        # width, height, res, label and title attributes are
                        # all not standard but seen several times in the wild
                        labels = [
                            s_attr.get(lbl)
                            for lbl in ('label', 'title')
                            if str_or_none(s_attr.get(lbl))
                        ]
                        width = int_or_none(s_attr.get('width'))
                        height = (int_or_none(s_attr.get('height'))
                                  or int_or_none(s_attr.get('res')))
                        if not width or not height:
                            # Fall back to a resolution parsed from label/title
                            for lbl in labels:
                                resolution = parse_resolution(lbl)
                                if not resolution:
                                    continue
                                width = width or resolution.get('width')
                                height = height or resolution.get('height')
                        # Use the first label that parses as a bitrate, if any
                        for lbl in labels:
                            tbr = parse_bitrate(lbl)
                            if tbr:
                                break
                        else:
                            tbr = None
                        f.update({
                            'width': width,
                            'height': height,
                            'tbr': tbr,
                            'format_id': s_attr.get('label') or s_attr.get('title'),
                        })
                        # The direct-URL format ("url", "vcodec") is applied
                        # last, so it overrides values parsed from "type"
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    if not kind or kind in ('subtitles', 'captions'):
                        src = strip_or_none(track_attributes.get('src'))
                        if not src:
                            continue
                        # NOTE: lang is None when the track has none of these attributes
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            for f in media_info['formats']:
                f.setdefault('http_headers', {})['Referer'] = base_url
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
3135
3136     def _extract_akamai_formats(self, *args, **kwargs):
3137         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3138         if subs:
3139             self._report_ignoring_subs('akamai')
3140         return fmts
3141
3142     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3143         signed = 'hdnea=' in manifest_url
3144         if not signed:
3145             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3146             manifest_url = re.sub(
3147                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3148                 '', manifest_url).strip('?')
3149
3150         formats = []
3151         subtitles = {}
3152
3153         hdcore_sign = 'hdcore=3.7.0'
3154         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3155         hds_host = hosts.get('hds')
3156         if hds_host:
3157             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3158         if 'hdcore=' not in f4m_url:
3159             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3160         f4m_formats = self._extract_f4m_formats(
3161             f4m_url, video_id, f4m_id='hds', fatal=False)
3162         for entry in f4m_formats:
3163             entry.update({'extra_param_to_segment_url': hdcore_sign})
3164         formats.extend(f4m_formats)
3165
3166         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3167         hls_host = hosts.get('hls')
3168         if hls_host:
3169             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3170         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3171             m3u8_url, video_id, 'mp4', 'm3u8_native',
3172             m3u8_id='hls', fatal=False)
3173         formats.extend(m3u8_formats)
3174         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3175
3176         http_host = hosts.get('http')
3177         if http_host and m3u8_formats and not signed:
3178             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3179             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3180             qualities_length = len(qualities)
3181             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3182                 i = 0
3183                 for f in m3u8_formats:
3184                     if f['vcodec'] != 'none':
3185                         for protocol in ('http', 'https'):
3186                             http_f = f.copy()
3187                             del http_f['manifest_url']
3188                             http_url = re.sub(
3189                                 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
3190                             http_f.update({
3191                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3192                                 'url': http_url,
3193                                 'protocol': protocol,
3194                             })
3195                             formats.append(http_f)
3196                         i += 1
3197
3198         return formats, subtitles
3199
3200     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3201         query = compat_urlparse.urlparse(url).query
3202         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3203         mobj = re.search(
3204             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3205         url_base = mobj.group('url')
3206         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3207         formats = []
3208
3209         def manifest_url(manifest):
3210             m_url = '%s/%s' % (http_base_url, manifest)
3211             if query:
3212                 m_url += '?%s' % query
3213             return m_url
3214
3215         if 'm3u8' not in skip_protocols:
3216             formats.extend(self._extract_m3u8_formats(
3217                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3218                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3219         if 'f4m' not in skip_protocols:
3220             formats.extend(self._extract_f4m_formats(
3221                 manifest_url('manifest.f4m'),
3222                 video_id, f4m_id='hds', fatal=False))
3223         if 'dash' not in skip_protocols:
3224             formats.extend(self._extract_mpd_formats(
3225                 manifest_url('manifest.mpd'),
3226                 video_id, mpd_id='dash', fatal=False))
3227         if re.search(r'(?:/smil:|\.smil)', url_base):
3228             if 'smil' not in skip_protocols:
3229                 rtmp_formats = self._extract_smil_formats(
3230                     manifest_url('jwplayer.smil'),
3231                     video_id, fatal=False)
3232                 for rtmp_format in rtmp_formats:
3233                     rtsp_format = rtmp_format.copy()
3234                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3235                     del rtsp_format['play_path']
3236                     del rtsp_format['ext']
3237                     rtsp_format.update({
3238                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3239                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3240                         'protocol': 'rtsp',
3241                     })
3242                     formats.extend([rtmp_format, rtsp_format])
3243         else:
3244             for protocol in ('rtmp', 'rtsp'):
3245                 if protocol not in skip_protocols:
3246                     formats.append({
3247                         'url': '%s:%s' % (protocol, url_base),
3248                         'format_id': protocol,
3249                         'protocol': protocol,
3250                     })
3251         return formats
3252
3253     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3254         mobj = re.search(
3255             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3256             webpage)
3257         if mobj:
3258             try:
3259                 jwplayer_data = self._parse_json(mobj.group('options'),
3260                                                  video_id=video_id,
3261                                                  transform_source=transform_source)
3262             except ExtractorError:
3263                 pass
3264             else:
3265                 if isinstance(jwplayer_data, dict):
3266                     return jwplayer_data
3267
3268     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3269         jwplayer_data = self._find_jwplayer_data(
3270             webpage, video_id, transform_source=js_to_json)
3271         return self._parse_jwplayer_data(
3272             jwplayer_data, video_id, *args, **kwargs)
3273
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """
        Convert JWPlayer setup options into an info dict or a playlist.

        NOTE: jwplayer_data may be modified in place: a non-list "playlist" is
        replaced by a one-element list and playlist items lacking "sources"
        get a "sources" key added.

        @param jwplayer_data  The JWPlayer options dict
        @param video_id       Id used for every entry; when falsy, each item's
                              "mediaid" is used instead
        @param require_title  When true, a playlist item without "title"
                              raises KeyError
        @param m3u8_id, mpd_id, rtmp_params
                              Passed through to _parse_jwplayer_formats
        @param base_url       URL that relative source, track and thumbnail
                              URLs are joined against
        @returns              The entry itself if exactly one was produced,
                              otherwise the result of playlist_result
        """
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}

        entries = []

        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]

        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]

            this_video_id = video_id or video_data['mediaid']

            formats = self._parse_jwplayer_formats(
                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

            subtitles = {}
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    if not isinstance(track, dict):
                        continue
                    track_kind = track.get('kind')
                    if not track_kind or not isinstance(track_kind, compat_str):
                        continue
                    # Only caption/subtitle tracks are kept
                    if track_kind.lower() not in ('captions', 'subtitles'):
                        continue
                    track_url = urljoin(base_url, track.get('file'))
                    if not track_url:
                        continue
                    # The track label is used as the subtitle key; 'en' when unlabeled
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)
                    })

            entry = {
                'id': this_video_id,
                'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
                'description': clean_html(video_data.get('description')),
                'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
            }
            # A lone YouTube source is delegated via url_transparent instead of
            # being treated as a downloadable format
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
            if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
                entry.update({
                    '_type': 'url_transparent',
                    'url': formats[0]['url'],
                })
            else:
                self._sort_formats(formats)
                entry['formats'] = formats
            entries.append(entry)
        if len(entries) == 1:
            return entries[0]
        else:
            return self.playlist_result(entries)
3341
3342     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3343                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3344         urls = []
3345         formats = []
3346         for source in jwplayer_sources_data:
3347             if not isinstance(source, dict):
3348                 continue
3349             source_url = urljoin(
3350                 base_url, self._proto_relative_url(source.get('file')))
3351             if not source_url or source_url in urls:
3352                 continue
3353             urls.append(source_url)
3354             source_type = source.get('type') or ''
3355             ext = mimetype2ext(source_type) or determine_ext(source_url)
3356             if source_type == 'hls' or ext == 'm3u8':
3357                 formats.extend(self._extract_m3u8_formats(
3358                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3359                     m3u8_id=m3u8_id, fatal=False))
3360             elif source_type == 'dash' or ext == 'mpd':
3361                 formats.extend(self._extract_mpd_formats(
3362                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3363             elif ext == 'smil':
3364                 formats.extend(self._extract_smil_formats(
3365                     source_url, video_id, fatal=False))
3366             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3367             elif source_type.startswith('audio') or ext in (
3368                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3369                 formats.append({
3370                     'url': source_url,
3371                     'vcodec': 'none',
3372                     'ext': ext,
3373                 })
3374             else:
3375                 height = int_or_none(source.get('height'))
3376                 if height is None:
3377                     # Often no height is provided but there is a label in
3378                     # format like "1080p", "720p SD", or 1080.
3379                     height = int_or_none(self._search_regex(
3380                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3381                         'height', default=None))
3382                 a_format = {
3383                     'url': source_url,
3384                     'width': int_or_none(source.get('width')),
3385                     'height': height,
3386                     'tbr': int_or_none(source.get('bitrate')),
3387                     'ext': ext,
3388                 }
3389                 if source_url.startswith('rtmp'):
3390                     a_format['ext'] = 'flv'
3391                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3392                     # of jwplayer.flash.swf
3393                     rtmp_url_parts = re.split(
3394                         r'((?:mp4|mp3|flv):)', source_url, 1)
3395                     if len(rtmp_url_parts) == 3:
3396                         rtmp_url, prefix, play_path = rtmp_url_parts
3397                         a_format.update({
3398                             'url': rtmp_url,
3399                             'play_path': prefix + play_path,
3400                         })
3401                     if rtmp_params:
3402                         a_format.update(rtmp_params)
3403                 formats.append(a_format)
3404         return formats
3405
3406     def _live_title(self, name):
3407         """ Generate the title for a live video """
3408         now = datetime.datetime.now()
3409         now_str = now.strftime('%Y-%m-%d %H:%M')
3410         return name + ' ' + now_str
3411
3412     def _int(self, v, name, fatal=False, **kwargs):
3413         res = int_or_none(v, **kwargs)
3414         if 'get_attr' in kwargs:
3415             print(getattr(v, kwargs['get_attr']))
3416         if res is None:
3417             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3418             if fatal:
3419                 raise ExtractorError(msg)
3420             else:
3421                 self.report_warning(msg)
3422         return res
3423
3424     def _float(self, v, name, fatal=False, **kwargs):
3425         res = float_or_none(v, **kwargs)
3426         if res is None:
3427             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3428             if fatal:
3429                 raise ExtractorError(msg)
3430             else:
3431                 self.report_warning(msg)
3432         return res
3433
3434     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3435                     path='/', secure=False, discard=False, rest={}, **kwargs):
3436         cookie = compat_cookiejar_Cookie(
3437             0, name, value, port, port is not None, domain, True,
3438             domain.startswith('.'), path, True, secure, expire_time,
3439             discard, None, None, rest)
3440         self._downloader.cookiejar.set_cookie(cookie)
3441
3442     def _get_cookies(self, url):
3443         """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3444         req = sanitized_Request(url)
3445         self._downloader.cookiejar.add_cookie_header(req)
3446         return compat_cookies_SimpleCookie(req.get_header('Cookie'))
3447
3448     def _apply_first_set_cookie_header(self, url_handle, cookie):
3449         """
3450         Apply first Set-Cookie header instead of the last. Experimental.
3451
3452         Some sites (e.g. [1-3]) may serve two cookies under the same name
3453         in Set-Cookie header and expect the first (old) one to be set rather
3454         than second (new). However, as of RFC6265 the newer one cookie
3455         should be set into cookie store what actually happens.
3456         We will workaround this issue by resetting the cookie to
3457         the first one manually.
3458         1. https://new.vk.com/
3459         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3460         3. https://learning.oreilly.com/
3461         """
3462         for header, cookies in url_handle.headers.items():
3463             if header.lower() != 'set-cookie':
3464                 continue
3465             if sys.version_info[0] >= 3:
3466                 cookies = cookies.encode('iso-8859-1')
3467             cookies = cookies.decode('utf-8')
3468             cookie_value = re.search(
3469                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3470             if cookie_value:
3471                 value, domain = cookie_value.groups()
3472                 self._set_cookie(domain, cookie, value)
3473                 break
3474
3475     def get_testcases(self, include_onlymatching=False):
3476         t = getattr(self, '_TEST', None)
3477         if t:
3478             assert not hasattr(self, '_TESTS'), \
3479                 '%s has _TEST and _TESTS' % type(self).__name__
3480             tests = [t]
3481         else:
3482             tests = getattr(self, '_TESTS', [])
3483         for t in tests:
3484             if not include_onlymatching and t.get('only_matching', False):
3485                 continue
3486             t['name'] = type(self).__name__[:-len('IE')]
3487             yield t
3488
3489     def is_suitable(self, age_limit):
3490         """ Test whether the extractor is generally suitable for the given
3491         age limit (i.e. pornographic sites are not, all others usually are) """
3492
3493         any_restricted = False
3494         for tc in self.get_testcases(include_onlymatching=False):
3495             if tc.get('playlist', []):
3496                 tc = tc['playlist'][0]
3497             is_restricted = age_restricted(
3498                 tc.get('info_dict', {}).get('age_limit'), age_limit)
3499             if not is_restricted:
3500                 return True
3501             any_restricted = any_restricted or is_restricted
3502         return not any_restricted
3503
3504     def extract_subtitles(self, *args, **kwargs):
3505         if (self.get_param('writesubtitles', False)
3506                 or self.get_param('listsubtitles')):
3507             return self._get_subtitles(*args, **kwargs)
3508         return {}
3509
3510     def _get_subtitles(self, *args, **kwargs):
3511         raise NotImplementedError('This method must be implemented by subclasses')
3512
3513     def extract_comments(self, *args, **kwargs):
3514         if not self.get_param('getcomments'):
3515             return None
3516         generator = self._get_comments(*args, **kwargs)
3517
3518         def extractor():
3519             comments = []
3520             try:
3521                 while True:
3522                     comments.append(next(generator))
3523             except KeyboardInterrupt:
3524                 interrupted = True
3525                 self.to_screen('Interrupted by user')
3526             except StopIteration:
3527                 interrupted = False
3528             comment_count = len(comments)
3529             self.to_screen(f'Extracted {comment_count} comments')
3530             return {
3531                 'comments': comments,
3532                 'comment_count': None if interrupted else comment_count
3533             }
3534         return extractor
3535
3536     def _get_comments(self, *args, **kwargs):
3537         raise NotImplementedError('This method must be implemented by subclasses')
3538
3539     @staticmethod
3540     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3541         """ Merge subtitle items for one language. Items with duplicated URLs
3542         will be dropped. """
3543         list1_urls = set([item['url'] for item in subtitle_list1])
3544         ret = list(subtitle_list1)
3545         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
3546         return ret
3547
3548     @classmethod
3549     def _merge_subtitles(cls, *dicts, target=None):
3550         """ Merge subtitle dictionaries, language by language. """
3551         if target is None:
3552             target = {}
3553         for d in dicts:
3554             for lang, subs in d.items():
3555                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3556         return target
3557
3558     def extract_automatic_captions(self, *args, **kwargs):
3559         if (self.get_param('writeautomaticsub', False)
3560                 or self.get_param('listsubtitles')):
3561             return self._get_automatic_captions(*args, **kwargs)
3562         return {}
3563
3564     def _get_automatic_captions(self, *args, **kwargs):
3565         raise NotImplementedError('This method must be implemented by subclasses')
3566
3567     def mark_watched(self, *args, **kwargs):
3568         if not self.get_param('mark_watched', False):
3569             return
3570         if (self._get_login_info()[0] is not None
3571                 or self.get_param('cookiefile')
3572                 or self.get_param('cookiesfrombrowser')):
3573             self._mark_watched(*args, **kwargs)
3574
3575     def _mark_watched(self, *args, **kwargs):
3576         raise NotImplementedError('This method must be implemented by subclasses')
3577
3578     def geo_verification_headers(self):
3579         headers = {}
3580         geo_verification_proxy = self.get_param('geo_verification_proxy')
3581         if geo_verification_proxy:
3582             headers['Ytdl-request-proxy'] = geo_verification_proxy
3583         return headers
3584
3585     def _generic_id(self, url):
3586         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3587
3588     def _generic_title(self, url):
3589         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3590
3591     @staticmethod
3592     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3593         all_known = all(map(
3594             lambda x: x is not None,
3595             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3596         return (
3597             'private' if is_private
3598             else 'premium_only' if needs_premium
3599             else 'subscriber_only' if needs_subscription
3600             else 'needs_auth' if needs_auth
3601             else 'unlisted' if is_unlisted
3602             else 'public' if all_known
3603             else None)
3604
3605     def _configuration_arg(self, key, default=NO_DEFAULT, casesense=False):
3606         '''
3607         @returns            A list of values for the extractor argument given by "key"
3608                             or "default" if no such key is present
3609         @param default      The default value to return when the key is not present (default: [])
3610         @param casesense    When false, the values are converted to lower case
3611         '''
3612         val = traverse_obj(
3613             self._downloader.params, ('extractor_args', self.ie_key().lower(), key))
3614         if val is None:
3615             return [] if default is NO_DEFAULT else default
3616         return list(val) if casesense else [x.lower() for x in val]
3617
3618
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|N):{query} where N is a
    positive integer: an empty prefix requests one result, "all" requests
    _MAX_RESULTS results and N requests N results (capped at _MAX_RESULTS).
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex matching this extractor's search pseudo-URLs"""
        prefix_and_query = r'(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return '%s%s' % (cls._SEARCH_KEY, prefix_and_query)

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search pseudo-URL for this extractor"""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the search pseudo-URL and fetch the requested number of results"""
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = mobj.group('prefix'), mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        # An infinite n means no upper bound on the slice
        stop = None if n == float('inf') else n
        results = itertools.islice(self._search_results(query), 0, stop)
        return self.playlist_result(results, query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        """Read-only access to _SEARCH_KEY"""
        return self._SEARCH_KEY