yt_dlp/extractor/common.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import base64
   5 import datetime
   6 import hashlib
   7 import itertools
   8 import json
   9 import netrc
  10 import os
  11 import random
  12 import re
  13 import sys
  14 import time
  15 import math
  16
  17 from ..compat import (
  18     compat_cookiejar_Cookie,
  19     compat_cookies_SimpleCookie,
  20     compat_etree_Element,
  21     compat_etree_fromstring,
  22     compat_expanduser,
  23     compat_getpass,
  24     compat_http_client,
  25     compat_os_name,
  26     compat_str,
  27     compat_urllib_error,
  28     compat_urllib_parse_unquote,
  29     compat_urllib_parse_urlencode,
  30     compat_urllib_request,
  31     compat_urlparse,
  32     compat_xml_parse_error,
  33 )
  34 from ..downloader import FileDownloader
  35 from ..downloader.f4m import (
  36     get_base_url,
  37     remove_encrypted_media,
  38 )
  39 from ..utils import (
  40     age_restricted,
  41     base_url,
  42     bug_reports_message,
  43     clean_html,
  44     compiled_regex_type,
  45     determine_ext,
  46     determine_protocol,
  47     dict_get,
  48     error_to_compat_str,
  49     extract_attributes,
  50     ExtractorError,
  51     fix_xml_ampersands,
  52     float_or_none,
  53     format_field,
  54     GeoRestrictedError,
  55     GeoUtils,
  56     int_or_none,
  57     js_to_json,
  58     JSON_LD_RE,
  59     mimetype2ext,
  60     network_exceptions,
  61     NO_DEFAULT,
  62     orderedSet,
  63     parse_bitrate,
  64     parse_codecs,
  65     parse_duration,
  66     parse_iso8601,
  67     parse_m3u8_attributes,
  68     parse_resolution,
  69     RegexNotFoundError,
  70     sanitize_filename,
  71     sanitized_Request,
  72     str_or_none,
  73     str_to_int,
  74     strip_or_none,
  75     traverse_obj,
  76     unescapeHTML,
  77     unified_strdate,
  78     unified_timestamp,
  79     update_Request,
  80     update_url_query,
  81     url_basename,
  82     url_or_none,
  83     urljoin,
  84     variadic,
  85     xpath_element,
  86     xpath_text,
  87     xpath_with_ns,
  88 )
  89
  90
  91 class InfoExtractor(object):
  92     """Information Extractor class.
  93
  94     Information extractors are the classes that, given a URL, extract
  95     information about the video (or videos) the URL refers to. This
  96     information includes the real video URL, the video title, author and
  97     others. The information is stored in a dictionary which is then
  98     passed to the YoutubeDL. The YoutubeDL processes this
  99     information possibly downloading the video to the file system, among
 100     other possible outcomes.
 101
 102     The type field determines the type of the result.
 103     By far the most common value (and the default if _type is missing) is
 104     "video", which indicates a single video.
 105
 106     For a video, the dictionaries must include the following fields:
 107
 108     id:             Video identifier.
 109     title:          Video title, unescaped.
 110
 111     Additionally, it must contain either a formats entry or a url one:
 112
 113     formats:        A list of dictionaries for each format available, ordered
 114                     from worst to best quality.
 115
 116                     Potential fields:
 117                     * url        The mandatory URL representing the media:
 118                                    for plain file media - HTTP URL of this file,
 119                                    for RTMP - RTMP URL,
 120                                    for HLS - URL of the M3U8 media playlist,
 121                                    for HDS - URL of the F4M manifest,
 122                                    for DASH
 123                                      - HTTP URL to plain file media (in case of
 124                                        unfragmented media)
 125                                      - URL of the MPD manifest or base URL
 126                                        representing the media if MPD manifest
 127                                        is parsed from a string (in case of
 128                                        fragmented media)
 129                                    for MSS - URL of the ISM manifest.
 130                     * manifest_url
 131                                  The URL of the manifest file in case of
 132                                  fragmented media:
 133                                    for HLS - URL of the M3U8 master playlist,
 134                                    for HDS - URL of the F4M manifest,
 135                                    for DASH - URL of the MPD manifest,
 136                                    for MSS - URL of the ISM manifest.
 137                     * ext        Will be calculated from URL if missing
 138                     * format     A human-readable description of the format
 139                                  ("mp4 container with h264/opus").
 140                                  Calculated from the format_id, width, height.
 141                                  and format_note fields if missing.
 142                     * format_id  A short description of the format
 143                                  ("mp4_h264_opus" or "19").
 144                                 Technically optional, but strongly recommended.
 145                     * format_note Additional info about the format
 146                                  ("3D" or "DASH video")
 147                     * width      Width of the video, if known
 148                     * height     Height of the video, if known
 149                     * resolution Textual description of width and height
 150                     * tbr        Average bitrate of audio and video in KBit/s
 151                     * abr        Average audio bitrate in KBit/s
 152                     * acodec     Name of the audio codec in use
 153                     * asr        Audio sampling rate in Hertz
 154                     * vbr        Average video bitrate in KBit/s
 155                     * fps        Frame rate
 156                     * vcodec     Name of the video codec in use
 157                     * container  Name of the container format
 158                     * filesize   The number of bytes, if known in advance
 159                     * filesize_approx  An estimate for the number of bytes
 160                     * player_url SWF Player URL (used for rtmpdump).
 161                     * protocol   The protocol that will be used for the actual
 162                                  download, lower-case.
 163                                  "http", "https", "rtsp", "rtmp", "rtmp_ffmpeg", "rtmpe",
 164                                  "m3u8", "m3u8_native" or "http_dash_segments".
 165                     * fragment_base_url
 166                                  Base URL for fragments. Each fragment's path
 167                                  value (if present) will be relative to
 168                                  this URL.
 169                     * fragments  A list of fragments of a fragmented media.
 170                                  Each fragment entry must contain either an url
 171                                  or a path. If an url is present it should be
 172                                  considered by a client. Otherwise both path and
 173                                  fragment_base_url must be present. Here is
 174                                  the list of all potential fields:
 175                                  * "url" - fragment's URL
 176                                  * "path" - fragment's path relative to
 177                                             fragment_base_url
 178                                  * "duration" (optional, int or float)
 179                                  * "filesize" (optional, int)
 180                     * preference Order number of this format. If this field is
 181                                  present and not None, the formats get sorted
 182                                  by this field, regardless of all other values.
 183                                  -1 for default (order by other properties),
 184                                  -2 or smaller for less than default.
 185                                  < -1000 to hide the format (if there is
 186                                     another one which is strictly better)
 187                     * language   Language code, e.g. "de" or "en-US".
 188                     * language_preference  Is this in the language mentioned in
 189                                  the URL?
 190                                  10 if it's what the URL is about,
 191                                  -1 for default (don't know),
 192                                  -10 otherwise, other values reserved for now.
 193                     * quality    Order number of the video quality of this
 194                                  format, irrespective of the file format.
 195                                  -1 for default (order by other properties),
 196                                  -2 or smaller for less than default.
 197                     * source_preference  Order number for this video source
 198                                   (quality takes higher priority)
 199                                  -1 for default (order by other properties),
 200                                  -2 or smaller for less than default.
 201                     * http_headers  A dictionary of additional HTTP headers
 202                                  to add to the request.
 203                     * stretched_ratio  If given and not 1, indicates that the
 204                                  video's pixels are not square.
 205                                  width : height ratio as float.
 206                     * no_resume  The server does not support resuming the
 207                                  (HTTP or RTMP) download. Boolean.
 208                     * has_drm    The format has DRM and cannot be downloaded. Boolean
 209                     * downloader_options  A dictionary of downloader options as
 210                                  described in FileDownloader
 211                     RTMP formats can also have the additional fields: page_url,
 212                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 213                     rtmp_protocol, rtmp_real_time
 214
 215     url:            Final video URL.
 216     ext:            Video filename extension.
 217     format:         The video format, defaults to ext (used for --get-format)
 218     player_url:     SWF Player URL (used for rtmpdump).
 219
 220     The following fields are optional:
 221
 222     alt_title:      A secondary title of the video.
 223     display_id      An alternative identifier for the video, not necessarily
 224                     unique, but available before title. Typically, id is
 225                     something like "4234987", title "Dancing naked mole rats",
 226                     and display_id "dancing-naked-mole-rats"
 227     thumbnails:     A list of dictionaries, with the following entries:
 228                         * "id" (optional, string) - Thumbnail format ID
 229                         * "url"
 230                         * "preference" (optional, int) - quality of the image
 231                         * "width" (optional, int)
 232                         * "height" (optional, int)
 233                         * "resolution" (optional, string "{width}x{height}",
 234                                         deprecated)
 235                         * "filesize" (optional, int)
 236     thumbnail:      Full URL to a video thumbnail image.
 237     description:    Full video description.
 238     uploader:       Full name of the video uploader.
 239     license:        License name the video is licensed under.
 240     creator:        The creator of the video.
 241     release_timestamp: UNIX timestamp of the moment the video was released.
 242     release_date:   The date (YYYYMMDD) when the video was released.
 243     timestamp:      UNIX timestamp of the moment the video was uploaded
 244     upload_date:    Video upload date (YYYYMMDD).
 245                     If not explicitly set, calculated from timestamp.
 246     uploader_id:    Nickname or id of the video uploader.
 247     uploader_url:   Full URL to a personal webpage of the video uploader.
 248     channel:        Full name of the channel the video is uploaded on.
 249                     Note that channel fields may or may not repeat uploader
 250                     fields. This depends on a particular extractor.
 251     channel_id:     Id of the channel.
 252     channel_url:    Full URL to a channel webpage.
 253     location:       Physical location where the video was filmed.
 254     subtitles:      The available subtitles as a dictionary in the format
 255                     {tag: subformats}. "tag" is usually a language code, and
 256                     "subformats" is a list sorted from lower to higher
 257                     preference, each element is a dictionary with the "ext"
 258                     entry and one of:
 259                         * "data": The subtitles file contents
 260                         * "url": A URL pointing to the subtitles file
 261                     It can optionally also have:
 262                         * "name": Name or description of the subtitles
 263                     "ext" will be calculated from URL if missing
 264     automatic_captions: Like 'subtitles'; contains automatically generated
 265                     captions instead of normal subtitles
 266     duration:       Length of the video in seconds, as an integer or float.
 267     view_count:     How many users have watched the video on the platform.
 268     like_count:     Number of positive ratings of the video
 269     dislike_count:  Number of negative ratings of the video
 270     repost_count:   Number of reposts of the video
 271     average_rating: Average rating given by users, the scale used depends on the webpage
 272     comment_count:  Number of comments on the video
 273     comments:       A list of comments, each with one or more of the following
 274                     properties (all but one of text or html optional):
 275                         * "author" - human-readable name of the comment author
 276                         * "author_id" - user ID of the comment author
 277                         * "author_thumbnail" - The thumbnail of the comment author
 278                         * "id" - Comment ID
 279                         * "html" - Comment as HTML
 280                         * "text" - Plain text of the comment
 281                         * "timestamp" - UNIX timestamp of comment
 282                         * "parent" - ID of the comment this one is replying to.
 283                                      Set to "root" to indicate that this is a
 284                                      comment to the original video.
 285                         * "like_count" - Number of positive ratings of the comment
 286                         * "dislike_count" - Number of negative ratings of the comment
 287                         * "is_favorited" - Whether the comment is marked as
 288                                            favorite by the video uploader
 289                         * "author_is_uploader" - Whether the comment is made by
 290                                                  the video uploader
 291     age_limit:      Age restriction for the video, as an integer (years)
 292     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 293                     should allow to get the same result again. (It will be set
 294                     by YoutubeDL if it's missing)
 295     categories:     A list of categories that the video falls in, for example
 296                     ["Sports", "Berlin"]
 297     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 298     cast:           A list of the video cast
 299     is_live:        True, False, or None (=unknown). Whether this video is a
 300                     live stream that goes on instead of a fixed-length video.
 301     was_live:       True, False, or None (=unknown). Whether this video was
 302                     originally a live stream.
 303     live_status:    'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
 304                     If absent, automatically set from is_live, was_live
 305     start_time:     Time in seconds where the reproduction should start, as
 306                     specified in the URL.
 307     end_time:       Time in seconds where the reproduction should end, as
 308                     specified in the URL.
 309     chapters:       A list of dictionaries, with the following entries:
 310                         * "start_time" - The start time of the chapter in seconds
 311                         * "end_time" - The end time of the chapter in seconds
 312                         * "title" (optional, string)
 313     playable_in_embed: Whether this video is allowed to play in embedded
 314                     players on other sites. Can be True (=always allowed),
 315                     False (=never allowed), None (=unknown), or a string
 316                     specifying the criteria for embedability (Eg: 'whitelist')
 317     availability:   Under what condition the video is available. One of
 318                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 319                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 320                     to set it
 321     __post_extractor: A function to be called just before the metadata is
 322                     written to either disk, logger or console. The function
 323                     must return a dict which will be added to the info_dict.
 324                     This is useful for additional information that is
 325                     time-consuming to extract. Note that the fields thus
 326                     extracted will not be available to output template and
 327                     match_filter. So, only "comments" and "comment_count" are
 328                     currently allowed to be extracted via this method.
 329
 330     The following fields should only be used when the video belongs to some logical
 331     chapter or section:
 332
 333     chapter:        Name or title of the chapter the video belongs to.
 334     chapter_number: Number of the chapter the video belongs to, as an integer.
 335     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 336
 337     The following fields should only be used when the video is an episode of some
 338     series, programme or podcast:
 339
 340     series:         Title of the series or programme the video episode belongs to.
 341     season:         Title of the season the video episode belongs to.
 342     season_number:  Number of the season the video episode belongs to, as an integer.
 343     season_id:      Id of the season the video episode belongs to, as a unicode string.
 344     episode:        Title of the video episode. Unlike mandatory video title field,
 345                     this field should denote the exact title of the video episode
 346                     without any kind of decoration.
 347     episode_number: Number of the video episode within a season, as an integer.
 348     episode_id:     Id of the video episode, as a unicode string.
 349
 350     The following fields should only be used when the media is a track or a part of
 351     a music album:
 352
 353     track:          Title of the track.
 354     track_number:   Number of the track within an album or a disc, as an integer.
 355     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 356                     as a unicode string.
 357     artist:         Artist(s) of the track.
 358     genre:          Genre(s) of the track.
 359     album:          Title of the album the track belongs to.
 360     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 361     album_artist:   List of all artists appeared on the album (e.g.
 362                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 363                     and compilations).
 364     disc_number:    Number of the disc or other physical medium the track belongs to,
 365                     as an integer.
 366     release_year:   Year (YYYY) when the album was released.
 367
 368     Unless mentioned otherwise, the fields should be Unicode strings.
 369
 370     Unless mentioned otherwise, None is equivalent to absence of information.
 371
 372
 373     _type "playlist" indicates multiple videos.
 374     There must be a key "entries", which is a list, an iterable, or a PagedList
 375     object, each element of which is a valid dictionary by this specification.
 376
 377     Additionally, playlists can have "id", "title", and any other relevant
 378     attributes with the same semantics as videos (see above).
 379
 380
 381     _type "multi_video" indicates that there are multiple videos that
 382     form a single show, for example multiple acts of an opera or TV episode.
 383     It must have an entries key like a playlist and contain all the keys
 384     required for a video at the same time.
 385
 386
 387     _type "url" indicates that the video must be extracted from another
 388     location, possibly by a different extractor. Its only required key is:
 389     "url" - the next URL to extract.
 390     The key "ie_key" can be set to the class name (minus the trailing "IE",
 391     e.g. "Youtube") if the extractor class is known in advance.
 392     Additionally, the dictionary may have any properties of the resolved entity
 393     known in advance, for example "title" if the title of the referred video is
 394     known ahead of time.
 395
 396
 397     _type "url_transparent" entities have the same specification as "url", but
 398     indicate that the given additional information is more precise than the one
 399     associated with the resolved URL.
 400     This is useful when a site employs a video service that hosts the video and
 401     its technical metadata, but that video service does not embed a useful
 402     title, description etc.
 403
 404
 405     Subclasses of this one should re-define the _real_initialize() and
 406     _real_extract() methods and define a _VALID_URL regexp.
 407     Probably, they should also be added to the list of extractors.
 408
 409     Subclasses may also override suitable() if necessary, but ensure the function
 410     signature is preserved and that this function imports everything it needs
 411     (except other extractors), so that lazy_extractors works correctly
 412
 413     _GEO_BYPASS attribute may be set to False in order to disable
 414     geo restriction bypass mechanisms for a particular extractor.
 415     Though it won't disable explicit geo restriction bypass based on
 416     country code provided with geo_bypass_country.
 417
 418     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 419     countries for this extractor. One of these countries will be used by
 420     geo restriction bypass mechanism right away in order to bypass
 421     geo restriction, of course, if the mechanism is not disabled.
 422
 423     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 424     IP blocks in CIDR notation for this extractor. One of these IP blocks
 425     will be used by geo restriction bypass mechanism similarly
 426     to _GEO_COUNTRIES.
 427
 428     The _WORKING attribute should be set to False for broken IEs
 429     in order to warn the users and skip the tests.
 430     """
 431
    # True once _real_initialize() has been run for the instance (see initialize())
    _ready = False
    # Downloader object bound through set_downloader(); None until one is set
    _downloader = None
    # Fake IP used for the X-Forwarded-For based geo bypass, once one was chosen
    _x_forwarded_for_ip = None
    # Geo bypass configuration; the meaning of each is given in the class docstring
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    # Set to False for broken IEs (see the class docstring)
    _WORKING = True

    # Hint messages about how to supply credentials, keyed by login method.
    # NOTE(review): the code that consumes these is not visible in this part
    # of the file - confirm the expected keys against it.
    _LOGIN_HINTS = {
        'any': 'Use --cookies, --username and --password or --netrc to provide account credentials',
        'cookies': (
            'Use --cookies-from-browser or --cookies for the authentication. '
            'See  https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl  for how to manually pass cookies'),
        'password': 'Use --username and --password or --netrc to provide account credentials',
    }
 447
 448     def __init__(self, downloader=None):
 449         """Constructor. Receives an optional downloader."""
 450         self._ready = False
 451         self._x_forwarded_for_ip = None
 452         self._printed_messages = set()
 453         self.set_downloader(downloader)
 454
 455     @classmethod
 456     def _match_valid_url(cls, url):
 457         # This does not use has/getattr intentionally - we want to know whether
 458         # we have cached the regexp for *this* class, whereas getattr would also
 459         # match the superclass
 460         if '_VALID_URL_RE' not in cls.__dict__:
 461             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 462         return cls._VALID_URL_RE.match(url)
 463
 464     @classmethod
 465     def suitable(cls, url):
 466         """Receives a URL and returns True if suitable for this IE."""
 467         # This function must import everything it needs (except other extractors),
 468         # so that lazy_extractors works correctly
 469         return cls._match_valid_url(url) is not None
 470
 471     @classmethod
 472     def _match_id(cls, url):
 473         return cls._match_valid_url(url).group('id')
 474
 475     @classmethod
 476     def get_temp_id(cls, url):
 477         try:
 478             return cls._match_id(url)
 479         except (IndexError, AttributeError):
 480             return None
 481
 482     @classmethod
 483     def working(cls):
 484         """Getter method for _WORKING."""
 485         return cls._WORKING
 486
 487     def initialize(self):
 488         """Initializes an instance (authentication, etc)."""
 489         self._printed_messages = set()
 490         self._initialize_geo_bypass({
 491             'countries': self._GEO_COUNTRIES,
 492             'ip_blocks': self._GEO_IP_BLOCKS,
 493         })
 494         if not self._ready:
 495             self._real_initialize()
 496             self._ready = True
 497
 498     def _initialize_geo_bypass(self, geo_bypass_context):
 499         """
 500         Initialize geo restriction bypass mechanism.
 501
 502         This method is used to initialize geo bypass mechanism based on faking
 503         X-Forwarded-For HTTP header. A random country from provided country list
 504         is selected and a random IP belonging to this country is generated. This
 505         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 506         HTTP requests.
 507
 508         This method will be used for initial geo bypass mechanism initialization
 509         during the instance initialization with _GEO_COUNTRIES and
 510         _GEO_IP_BLOCKS.
 511
 512         You may also manually call it from extractor's code if geo bypass
 513         information is not available beforehand (e.g. obtained during
 514         extraction) or due to some other reason. In this case you should pass
 515         this information in geo bypass context passed as first argument. It may
 516         contain following fields:
 517
 518         countries:  List of geo unrestricted countries (similar
 519                     to _GEO_COUNTRIES)
 520         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 521                     (similar to _GEO_IP_BLOCKS)
 522
 523         """
 524         if not self._x_forwarded_for_ip:
 525
 526             # Geo bypass mechanism is explicitly disabled by user
 527             if not self.get_param('geo_bypass', True):
 528                 return
 529
 530             if not geo_bypass_context:
 531                 geo_bypass_context = {}
 532
 533             # Backward compatibility: previously _initialize_geo_bypass
 534             # expected a list of countries, some 3rd party code may still use
 535             # it this way
 536             if isinstance(geo_bypass_context, (list, tuple)):
 537                 geo_bypass_context = {
 538                     'countries': geo_bypass_context,
 539                 }
 540
 541             # The whole point of geo bypass mechanism is to fake IP
 542             # as X-Forwarded-For HTTP header based on some IP block or
 543             # country code.
 544
 545             # Path 1: bypassing based on IP block in CIDR notation
 546
 547             # Explicit IP block specified by user, use it right away
 548             # regardless of whether extractor is geo bypassable or not
 549             ip_block = self.get_param('geo_bypass_ip_block', None)
 550
 551             # Otherwise use random IP block from geo bypass context but only
 552             # if extractor is known as geo bypassable
 553             if not ip_block:
 554                 ip_blocks = geo_bypass_context.get('ip_blocks')
 555                 if self._GEO_BYPASS and ip_blocks:
 556                     ip_block = random.choice(ip_blocks)
 557
 558             if ip_block:
 559                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 560                 self._downloader.write_debug(
 561                     '[debug] Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
 562                 return
 563
 564             # Path 2: bypassing based on country code
 565
 566             # Explicit country code specified by user, use it right away
 567             # regardless of whether extractor is geo bypassable or not
 568             country = self.get_param('geo_bypass_country', None)
 569
 570             # Otherwise use random country code from geo bypass context but
 571             # only if extractor is known as geo bypassable
 572             if not country:
 573                 countries = geo_bypass_context.get('countries')
 574                 if self._GEO_BYPASS and countries:
 575                     country = random.choice(countries)
 576
 577             if country:
 578                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 579                 self._downloader.write_debug(
 580                     'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))
 581
 582     def extract(self, url):
 583         """Extracts URL information and returns it in list of dicts."""
 584         try:
 585             for _ in range(2):
 586                 try:
 587                     self.initialize()
 588                     self.write_debug('Extracting URL: %s' % url)
 589                     ie_result = self._real_extract(url)
 590                     if ie_result is None:
 591                         return None
 592                     if self._x_forwarded_for_ip:
 593                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 594                     subtitles = ie_result.get('subtitles')
 595                     if (subtitles and 'live_chat' in subtitles
 596                             and 'no-live-chat' in self.get_param('compat_opts', [])):
 597                         del subtitles['live_chat']
 598                     return ie_result
 599                 except GeoRestrictedError as e:
 600                     if self.__maybe_fake_ip_and_retry(e.countries):
 601                         continue
 602                     raise
 603         except ExtractorError as e:
 604             video_id = e.video_id or self.get_temp_id(url)
 605             raise ExtractorError(
 606                 e.msg, video_id=video_id, ie=self.IE_NAME, tb=e.traceback, expected=e.expected, cause=e.cause)
 607         except compat_http_client.IncompleteRead as e:
 608             raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
 609         except (KeyError, StopIteration) as e:
 610             raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 611
 612     def __maybe_fake_ip_and_retry(self, countries):
 613         if (not self.get_param('geo_bypass_country', None)
 614                 and self._GEO_BYPASS
 615                 and self.get_param('geo_bypass', True)
 616                 and not self._x_forwarded_for_ip
 617                 and countries):
 618             country_code = random.choice(countries)
 619             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 620             if self._x_forwarded_for_ip:
 621                 self.report_warning(
 622                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 623                     % (self._x_forwarded_for_ip, country_code.upper()))
 624                 return True
 625         return False
 626
 627     def set_downloader(self, downloader):
 628         """Sets the downloader for this IE."""
 629         self._downloader = downloader
 630
 631     def _real_initialize(self):
 632         """Real initialization process. Redefine in subclasses."""
 633         pass
 634
 635     def _real_extract(self, url):
 636         """Real extraction process. Redefine in subclasses."""
 637         pass
 638
 639     @classmethod
 640     def ie_key(cls):
 641         """A string for getting the InfoExtractor with get_info_extractor"""
 642         return cls.__name__[:-2]
 643
 644     @property
 645     def IE_NAME(self):
 646         return compat_str(type(self).__name__[:-2])
 647
 648     @staticmethod
 649     def __can_accept_status_code(err, expected_status):
 650         assert isinstance(err, compat_urllib_error.HTTPError)
 651         if expected_status is None:
 652             return False
 653         elif callable(expected_status):
 654             return expected_status(err.code) is True
 655         else:
 656             return err.code in variadic(expected_status)
 657
 658     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
 659         """
 660         Return the response handle.
 661
 662         See _download_webpage docstring for arguments specification.
 663         """
 664         if not self._downloader._first_webpage_request:
 665             sleep_interval = float_or_none(self.get_param('sleep_interval_requests')) or 0
 666             if sleep_interval > 0:
 667                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 668                 time.sleep(sleep_interval)
 669         else:
 670             self._downloader._first_webpage_request = False
 671
 672         if note is None:
 673             self.report_download_webpage(video_id)
 674         elif note is not False:
 675             if video_id is None:
 676                 self.to_screen('%s' % (note,))
 677             else:
 678                 self.to_screen('%s: %s' % (video_id, note))
 679
 680         # Some sites check X-Forwarded-For HTTP header in order to figure out
 681         # the origin of the client behind proxy. This allows bypassing geo
 682         # restriction by faking this header's value to IP that belongs to some
 683         # geo unrestricted country. We will do so once we encounter any
 684         # geo restriction error.
 685         if self._x_forwarded_for_ip:
 686             if 'X-Forwarded-For' not in headers:
 687                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 688
 689         if isinstance(url_or_request, compat_urllib_request.Request):
 690             url_or_request = update_Request(
 691                 url_or_request, data=data, headers=headers, query=query)
 692         else:
 693             if query:
 694                 url_or_request = update_url_query(url_or_request, query)
 695             if data is not None or headers:
 696                 url_or_request = sanitized_Request(url_or_request, data, headers)
 697         try:
 698             return self._downloader.urlopen(url_or_request)
 699         except network_exceptions as err:
 700             if isinstance(err, compat_urllib_error.HTTPError):
 701                 if self.__can_accept_status_code(err, expected_status):
 702                     # Retain reference to error to prevent file object from
 703                     # being closed before it can be read. Works around the
 704                     # effects of <https://bugs.python.org/issue15002>
 705                     # introduced in Python 3.4.1.
 706                     err.fp._error = err
 707                     return err.fp
 708
 709             if errnote is False:
 710                 return False
 711             if errnote is None:
 712                 errnote = 'Unable to download webpage'
 713
 714             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 715             if fatal:
 716                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 717             else:
 718                 self.report_warning(errmsg)
 719                 return False
 720
 721     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
 722         """
 723         Return a tuple (page content as string, URL handle).
 724
 725         See _download_webpage docstring for arguments specification.
 726         """
 727         # Strip hashes from the URL (#1038)
 728         if isinstance(url_or_request, (compat_str, str)):
 729             url_or_request = url_or_request.partition('#')[0]
 730
 731         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 732         if urlh is False:
 733             assert not fatal
 734             return False
 735         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 736         return (content, urlh)
 737
 738     @staticmethod
 739     def _guess_encoding_from_content(content_type, webpage_bytes):
 740         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 741         if m:
 742             encoding = m.group(1)
 743         else:
 744             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 745                           webpage_bytes[:1024])
 746             if m:
 747                 encoding = m.group(1).decode('ascii')
 748             elif webpage_bytes.startswith(b'\xff\xfe'):
 749                 encoding = 'utf-16'
 750             else:
 751                 encoding = 'utf-8'
 752
 753         return encoding
 754
 755     def __check_blocked(self, content):
 756         first_block = content[:512]
 757         if ('<title>Access to this site is blocked</title>' in content
 758                 and 'Websense' in first_block):
 759             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 760             blocked_iframe = self._html_search_regex(
 761                 r'<iframe src="([^"]+)"', content,
 762                 'Websense information URL', default=None)
 763             if blocked_iframe:
 764                 msg += ' Visit %s for more details' % blocked_iframe
 765             raise ExtractorError(msg, expected=True)
 766         if '<title>The URL you requested has been blocked</title>' in first_block:
 767             msg = (
 768                 'Access to this webpage has been blocked by Indian censorship. '
 769                 'Use a VPN or proxy server (with --proxy) to route around it.')
 770             block_msg = self._html_search_regex(
 771                 r'</h1><p>(.*?)</p>',
 772                 content, 'block message', default=None)
 773             if block_msg:
 774                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 775             raise ExtractorError(msg, expected=True)
 776         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 777                 and 'blocklist.rkn.gov.ru' in content):
 778             raise ExtractorError(
 779                 'Access to this webpage has been blocked by decision of the Russian government. '
 780                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 781                 expected=True)
 782
 783     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 784         content_type = urlh.headers.get('Content-Type', '')
 785         webpage_bytes = urlh.read()
 786         if prefix is not None:
 787             webpage_bytes = prefix + webpage_bytes
 788         if not encoding:
 789             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 790         if self.get_param('dump_intermediate_pages', False):
 791             self.to_screen('Dumping request to ' + urlh.geturl())
 792             dump = base64.b64encode(webpage_bytes).decode('ascii')
 793             self._downloader.to_screen(dump)
 794         if self.get_param('write_pages', False):
 795             basen = '%s_%s' % (video_id, urlh.geturl())
 796             trim_length = self.get_param('trim_file_name') or 240
 797             if len(basen) > trim_length:
 798                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 799                 basen = basen[:trim_length - len(h)] + h
 800             raw_filename = basen + '.dump'
 801             filename = sanitize_filename(raw_filename, restricted=True)
 802             self.to_screen('Saving request to ' + filename)
 803             # Working around MAX_PATH limitation on Windows (see
 804             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 805             if compat_os_name == 'nt':
 806                 absfilepath = os.path.abspath(filename)
 807                 if len(absfilepath) > 259:
 808                     filename = '\\\\?\\' + absfilepath
 809             with open(filename, 'wb') as outf:
 810                 outf.write(webpage_bytes)
 811
 812         try:
 813             content = webpage_bytes.decode(encoding, 'replace')
 814         except LookupError:
 815             content = webpage_bytes.decode('utf-8', 'replace')
 816
 817         self.__check_blocked(content)
 818
 819         return content
 820
 821     def _download_webpage(
 822             self, url_or_request, video_id, note=None, errnote=None,
 823             fatal=True, tries=1, timeout=5, encoding=None, data=None,
 824             headers={}, query={}, expected_status=None):
 825         """
 826         Return the data of the page as a string.
 827
 828         Arguments:
 829         url_or_request -- plain text URL as a string or
 830             a compat_urllib_request.Requestobject
 831         video_id -- Video/playlist/item identifier (string)
 832
 833         Keyword arguments:
 834         note -- note printed before downloading (string)
 835         errnote -- note printed in case of an error (string)
 836         fatal -- flag denoting whether error should be considered fatal,
 837             i.e. whether it should cause ExtractionError to be raised,
 838             otherwise a warning will be reported and extraction continued
 839         tries -- number of tries
 840         timeout -- sleep interval between tries
 841         encoding -- encoding for a page content decoding, guessed automatically
 842             when not explicitly specified
 843         data -- POST data (bytes)
 844         headers -- HTTP headers (dict)
 845         query -- URL query (dict)
 846         expected_status -- allows to accept failed HTTP requests (non 2xx
 847             status code) by explicitly specifying a set of accepted status
 848             codes. Can be any of the following entities:
 849                 - an integer type specifying an exact failed status code to
 850                   accept
 851                 - a list or a tuple of integer types specifying a list of
 852                   failed status codes to accept
 853                 - a callable accepting an actual failed status code and
 854                   returning True if it should be accepted
 855             Note that this argument does not affect success status codes (2xx)
 856             which are always accepted.
 857         """
 858
 859         success = False
 860         try_count = 0
 861         while success is False:
 862             try:
 863                 res = self._download_webpage_handle(
 864                     url_or_request, video_id, note, errnote, fatal,
 865                     encoding=encoding, data=data, headers=headers, query=query,
 866                     expected_status=expected_status)
 867                 success = True
 868             except compat_http_client.IncompleteRead as e:
 869                 try_count += 1
 870                 if try_count >= tries:
 871                     raise e
 872                 self._sleep(timeout, video_id)
 873         if res is False:
 874             return res
 875         else:
 876             content, _ = res
 877             return content
 878
 879     def _download_xml_handle(
 880             self, url_or_request, video_id, note='Downloading XML',
 881             errnote='Unable to download XML', transform_source=None,
 882             fatal=True, encoding=None, data=None, headers={}, query={},
 883             expected_status=None):
 884         """
 885         Return a tuple (xml as an compat_etree_Element, URL handle).
 886
 887         See _download_webpage docstring for arguments specification.
 888         """
 889         res = self._download_webpage_handle(
 890             url_or_request, video_id, note, errnote, fatal=fatal,
 891             encoding=encoding, data=data, headers=headers, query=query,
 892             expected_status=expected_status)
 893         if res is False:
 894             return res
 895         xml_string, urlh = res
 896         return self._parse_xml(
 897             xml_string, video_id, transform_source=transform_source,
 898             fatal=fatal), urlh
 899
 900     def _download_xml(
 901             self, url_or_request, video_id,
 902             note='Downloading XML', errnote='Unable to download XML',
 903             transform_source=None, fatal=True, encoding=None,
 904             data=None, headers={}, query={}, expected_status=None):
 905         """
 906         Return the xml as an compat_etree_Element.
 907
 908         See _download_webpage docstring for arguments specification.
 909         """
 910         res = self._download_xml_handle(
 911             url_or_request, video_id, note=note, errnote=errnote,
 912             transform_source=transform_source, fatal=fatal, encoding=encoding,
 913             data=data, headers=headers, query=query,
 914             expected_status=expected_status)
 915         return res if res is False else res[0]
 916
 917     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 918         if transform_source:
 919             xml_string = transform_source(xml_string)
 920         try:
 921             return compat_etree_fromstring(xml_string.encode('utf-8'))
 922         except compat_xml_parse_error as ve:
 923             errmsg = '%s: Failed to parse XML ' % video_id
 924             if fatal:
 925                 raise ExtractorError(errmsg, cause=ve)
 926             else:
 927                 self.report_warning(errmsg + str(ve))
 928
 929     def _download_json_handle(
 930             self, url_or_request, video_id, note='Downloading JSON metadata',
 931             errnote='Unable to download JSON metadata', transform_source=None,
 932             fatal=True, encoding=None, data=None, headers={}, query={},
 933             expected_status=None):
 934         """
 935         Return a tuple (JSON object, URL handle).
 936
 937         See _download_webpage docstring for arguments specification.
 938         """
 939         res = self._download_webpage_handle(
 940             url_or_request, video_id, note, errnote, fatal=fatal,
 941             encoding=encoding, data=data, headers=headers, query=query,
 942             expected_status=expected_status)
 943         if res is False:
 944             return res
 945         json_string, urlh = res
 946         return self._parse_json(
 947             json_string, video_id, transform_source=transform_source,
 948             fatal=fatal), urlh
 949
 950     def _download_json(
 951             self, url_or_request, video_id, note='Downloading JSON metadata',
 952             errnote='Unable to download JSON metadata', transform_source=None,
 953             fatal=True, encoding=None, data=None, headers={}, query={},
 954             expected_status=None):
 955         """
 956         Return the JSON object as a dict.
 957
 958         See _download_webpage docstring for arguments specification.
 959         """
 960         res = self._download_json_handle(
 961             url_or_request, video_id, note=note, errnote=errnote,
 962             transform_source=transform_source, fatal=fatal, encoding=encoding,
 963             data=data, headers=headers, query=query,
 964             expected_status=expected_status)
 965         return res if res is False else res[0]
 966
 967     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 968         if transform_source:
 969             json_string = transform_source(json_string)
 970         try:
 971             return json.loads(json_string)
 972         except ValueError as ve:
 973             errmsg = '%s: Failed to parse JSON ' % video_id
 974             if fatal:
 975                 raise ExtractorError(errmsg, cause=ve)
 976             else:
 977                 self.report_warning(errmsg + str(ve))
 978
 979     def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
 980         return self._parse_json(
 981             data[data.find('{'):data.rfind('}') + 1],
 982             video_id, transform_source, fatal)
 983
 984     def _download_socket_json_handle(
 985             self, url_or_request, video_id, note='Polling socket',
 986             errnote='Unable to poll socket', transform_source=None,
 987             fatal=True, encoding=None, data=None, headers={}, query={},
 988             expected_status=None):
 989         """
 990         Return a tuple (JSON object, URL handle).
 991
 992         See _download_webpage docstring for arguments specification.
 993         """
 994         res = self._download_webpage_handle(
 995             url_or_request, video_id, note, errnote, fatal=fatal,
 996             encoding=encoding, data=data, headers=headers, query=query,
 997             expected_status=expected_status)
 998         if res is False:
 999             return res
1000         webpage, urlh = res
1001         return self._parse_socket_response_as_json(
1002             webpage, video_id, transform_source=transform_source,
1003             fatal=fatal), urlh
1004
1005     def _download_socket_json(
1006             self, url_or_request, video_id, note='Polling socket',
1007             errnote='Unable to poll socket', transform_source=None,
1008             fatal=True, encoding=None, data=None, headers={}, query={},
1009             expected_status=None):
1010         """
1011         Return the JSON object as a dict.
1012
1013         See _download_webpage docstring for arguments specification.
1014         """
1015         res = self._download_socket_json_handle(
1016             url_or_request, video_id, note=note, errnote=errnote,
1017             transform_source=transform_source, fatal=fatal, encoding=encoding,
1018             data=data, headers=headers, query=query,
1019             expected_status=expected_status)
1020         return res if res is False else res[0]
1021
1022     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1023         idstr = format_field(video_id, template='%s: ')
1024         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1025         if only_once:
1026             if f'WARNING: {msg}' in self._printed_messages:
1027                 return
1028             self._printed_messages.add(f'WARNING: {msg}')
1029         self._downloader.report_warning(msg, *args, **kwargs)
1030
1031     def to_screen(self, msg, *args, **kwargs):
1032         """Print msg to screen, prefixing it with '[ie_name]'"""
1033         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1034
1035     def write_debug(self, msg, *args, **kwargs):
1036         self._downloader.write_debug('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1037
1038     def get_param(self, name, default=None, *args, **kwargs):
1039         if self._downloader:
1040             return self._downloader.params.get(name, default, *args, **kwargs)
1041         return default
1042
1043     def report_drm(self, video_id, partial=False):
1044         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1045
1046     def report_extraction(self, id_or_name):
1047         """Report information extraction."""
1048         self.to_screen('%s: Extracting information' % id_or_name)
1049
1050     def report_download_webpage(self, video_id):
1051         """Report webpage download."""
1052         self.to_screen('%s: Downloading webpage' % video_id)
1053
1054     def report_age_confirmation(self):
1055         """Report attempt to confirm age."""
1056         self.to_screen('Confirming age')
1057
1058     def report_login(self):
1059         """Report attempt to log in."""
1060         self.to_screen('Logging in')
1061
1062     def raise_login_required(
1063             self, msg='This video is only available for registered users',
1064             metadata_available=False, method='any'):
1065         if metadata_available and self.get_param('ignore_no_formats_error'):
1066             self.report_warning(msg)
1067         if method is not None:
1068             msg = '%s. %s' % (msg, self._LOGIN_HINTS[method])
1069         raise ExtractorError(msg, expected=True)
1070
1071     def raise_geo_restricted(
1072             self, msg='This video is not available from your location due to geo restriction',
1073             countries=None, metadata_available=False):
1074         if metadata_available and self.get_param('ignore_no_formats_error'):
1075             self.report_warning(msg)
1076         else:
1077             raise GeoRestrictedError(msg, countries=countries)
1078
1079     def raise_no_formats(self, msg, expected=False, video_id=None):
1080         if expected and self.get_param('ignore_no_formats_error'):
1081             self.report_warning(msg, video_id)
1082         elif isinstance(msg, ExtractorError):
1083             raise msg
1084         else:
1085             raise ExtractorError(msg, expected=expected, video_id=video_id)
1086
1087     # Methods for following #608
1088     @staticmethod
1089     def url_result(url, ie=None, video_id=None, video_title=None, **kwargs):
1090         """Returns a URL that points to a page that should be processed"""
1091         # TODO: ie should be the class used for getting the info
1092         video_info = {'_type': 'url',
1093                       'url': url,
1094                       'ie_key': ie}
1095         video_info.update(kwargs)
1096         if video_id is not None:
1097             video_info['id'] = video_id
1098         if video_title is not None:
1099             video_info['title'] = video_title
1100         return video_info
1101
1102     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
1103         urls = orderedSet(
1104             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
1105             for m in matches)
1106         return self.playlist_result(
1107             urls, playlist_id=playlist_id, playlist_title=playlist_title)
1108
1109     @staticmethod
1110     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
1111         """Returns a playlist"""
1112         video_info = {'_type': 'playlist',
1113                       'entries': entries}
1114         video_info.update(kwargs)
1115         if playlist_id:
1116             video_info['id'] = playlist_id
1117         if playlist_title:
1118             video_info['title'] = playlist_title
1119         if playlist_description is not None:
1120             video_info['description'] = playlist_description
1121         return video_info
1122
1123     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1124         """
1125         Perform a regex search on the given string, using a single or a list of
1126         patterns returning the first matching group.
1127         In case of failure return a default value or raise a WARNING or a
1128         RegexNotFoundError, depending on fatal, specifying the field name.
1129         """
1130         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
1131             mobj = re.search(pattern, string, flags)
1132         else:
1133             for p in pattern:
1134                 mobj = re.search(p, string, flags)
1135                 if mobj:
1136                     break
1137
1138         _name = self._downloader._color_text(name, 'blue')
1139
1140         if mobj:
1141             if group is None:
1142                 # return the first matching group
1143                 return next(g for g in mobj.groups() if g is not None)
1144             elif isinstance(group, (list, tuple)):
1145                 return tuple(mobj.group(g) for g in group)
1146             else:
1147                 return mobj.group(group)
1148         elif default is not NO_DEFAULT:
1149             return default
1150         elif fatal:
1151             raise RegexNotFoundError('Unable to extract %s' % _name)
1152         else:
1153             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1154             return None
1155
1156     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1157         """
1158         Like _search_regex, but strips HTML tags and unescapes entities.
1159         """
1160         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1161         if res:
1162             return clean_html(res).strip()
1163         else:
1164             return res
1165
1166     def _get_netrc_login_info(self, netrc_machine=None):
1167         username = None
1168         password = None
1169         netrc_machine = netrc_machine or self._NETRC_MACHINE
1170
1171         if self.get_param('usenetrc', False):
1172             try:
1173                 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1174                 if os.path.isdir(netrc_file):
1175                     netrc_file = os.path.join(netrc_file, '.netrc')
1176                 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1177                 if info is not None:
1178                     username = info[0]
1179                     password = info[2]
1180                 else:
1181                     raise netrc.NetrcParseError(
1182                         'No authenticators for %s' % netrc_machine)
1183             except (IOError, netrc.NetrcParseError) as err:
1184                 self.report_warning(
1185                     'parsing .netrc: %s' % error_to_compat_str(err))
1186
1187         return username, password
1188
1189     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1190         """
1191         Get the login info as (username, password)
1192         First look for the manually specified credentials using username_option
1193         and password_option as keys in params dictionary. If no such credentials
1194         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1195         value.
1196         If there's no info available, return (None, None)
1197         """
1198
1199         # Attempt to use provided username and password or .netrc data
1200         username = self.get_param(username_option)
1201         if username is not None:
1202             password = self.get_param(password_option)
1203         else:
1204             username, password = self._get_netrc_login_info(netrc_machine)
1205
1206         return username, password
1207
1208     def _get_tfa_info(self, note='two-factor verification code'):
1209         """
1210         Get the two-factor authentication info
1211         TODO - asking the user will be required for sms/phone verify
1212         currently just uses the command line option
1213         If there's no info available, return None
1214         """
1215
1216         tfa = self.get_param('twofactor')
1217         if tfa is not None:
1218             return tfa
1219
1220         return compat_getpass('Type %s and press [Return]: ' % note)
1221
1222     # Helper functions for extracting OpenGraph info
1223     @staticmethod
1224     def _og_regexes(prop):
1225         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1226         property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
1227                        % {'prop': re.escape(prop)})
1228         template = r'<meta[^>]+?%s[^>]+?%s'
1229         return [
1230             template % (property_re, content_re),
1231             template % (content_re, property_re),
1232         ]
1233
1234     @staticmethod
1235     def _meta_regex(prop):
1236         return r'''(?isx)<meta
1237                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1238                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1239
1240     def _og_search_property(self, prop, html, name=None, **kargs):
1241         prop = variadic(prop)
1242         if name is None:
1243             name = 'OpenGraph %s' % prop[0]
1244         og_regexes = []
1245         for p in prop:
1246             og_regexes.extend(self._og_regexes(p))
1247         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1248         if escaped is None:
1249             return None
1250         return unescapeHTML(escaped)
1251
1252     def _og_search_thumbnail(self, html, **kargs):
1253         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1254
1255     def _og_search_description(self, html, **kargs):
1256         return self._og_search_property('description', html, fatal=False, **kargs)
1257
1258     def _og_search_title(self, html, **kargs):
1259         return self._og_search_property('title', html, **kargs)
1260
1261     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1262         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1263         if secure:
1264             regexes = self._og_regexes('video:secure_url') + regexes
1265         return self._html_search_regex(regexes, html, name, **kargs)
1266
1267     def _og_search_url(self, html, **kargs):
1268         return self._og_search_property('url', html, **kargs)
1269
1270     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1271         name = variadic(name)
1272         if display_name is None:
1273             display_name = name[0]
1274         return self._html_search_regex(
1275             [self._meta_regex(n) for n in name],
1276             html, display_name, fatal=fatal, group='content', **kwargs)
1277
1278     def _dc_search_uploader(self, html):
1279         return self._html_search_meta('dc.creator', html, 'uploader')
1280
1281     def _rta_search(self, html):
1282         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1283         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1284                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1285                      html):
1286             return 18
1287         return 0
1288
1289     def _media_rating_search(self, html):
1290         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1291         rating = self._html_search_meta('rating', html)
1292
1293         if not rating:
1294             return None
1295
1296         RATING_TABLE = {
1297             'safe for kids': 0,
1298             'general': 8,
1299             '14 years': 14,
1300             'mature': 17,
1301             'restricted': 19,
1302         }
1303         return RATING_TABLE.get(rating.lower())
1304
1305     def _family_friendly_search(self, html):
1306         # See http://schema.org/VideoObject
1307         family_friendly = self._html_search_meta(
1308             'isFamilyFriendly', html, default=None)
1309
1310         if not family_friendly:
1311             return None
1312
1313         RATING_TABLE = {
1314             '1': 0,
1315             'true': 0,
1316             '0': 18,
1317             'false': 18,
1318         }
1319         return RATING_TABLE.get(family_friendly.lower())
1320
1321     def _twitter_search_player(self, html):
1322         return self._html_search_meta('twitter:player', html,
1323                                       'twitter card player')
1324
1325     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1326         json_ld_list = list(re.finditer(JSON_LD_RE, html))
1327         default = kwargs.get('default', NO_DEFAULT)
1328         # JSON-LD may be malformed and thus `fatal` should be respected.
1329         # At the same time `default` may be passed that assumes `fatal=False`
1330         # for _search_regex. Let's simulate the same behavior here as well.
1331         fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
1332         json_ld = []
1333         for mobj in json_ld_list:
1334             json_ld_item = self._parse_json(
1335                 mobj.group('json_ld'), video_id, fatal=fatal)
1336             if not json_ld_item:
1337                 continue
1338             if isinstance(json_ld_item, dict):
1339                 json_ld.append(json_ld_item)
1340             elif isinstance(json_ld_item, (list, tuple)):
1341                 json_ld.extend(json_ld_item)
1342         if json_ld:
1343             json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1344         if json_ld:
1345             return json_ld
1346         if default is not NO_DEFAULT:
1347             return default
1348         elif fatal:
1349             raise RegexNotFoundError('Unable to extract JSON-LD')
1350         else:
1351             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1352             return {}
1353
1354     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1355         if isinstance(json_ld, compat_str):
1356             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1357         if not json_ld:
1358             return {}
1359         info = {}
1360         if not isinstance(json_ld, (list, tuple, dict)):
1361             return info
1362         if isinstance(json_ld, dict):
1363             json_ld = [json_ld]
1364
1365         INTERACTION_TYPE_MAP = {
1366             'CommentAction': 'comment',
1367             'AgreeAction': 'like',
1368             'DisagreeAction': 'dislike',
1369             'LikeAction': 'like',
1370             'DislikeAction': 'dislike',
1371             'ListenAction': 'view',
1372             'WatchAction': 'view',
1373             'ViewAction': 'view',
1374         }
1375
1376         def extract_interaction_type(e):
1377             interaction_type = e.get('interactionType')
1378             if isinstance(interaction_type, dict):
1379                 interaction_type = interaction_type.get('@type')
1380             return str_or_none(interaction_type)
1381
1382         def extract_interaction_statistic(e):
1383             interaction_statistic = e.get('interactionStatistic')
1384             if isinstance(interaction_statistic, dict):
1385                 interaction_statistic = [interaction_statistic]
1386             if not isinstance(interaction_statistic, list):
1387                 return
1388             for is_e in interaction_statistic:
1389                 if not isinstance(is_e, dict):
1390                     continue
1391                 if is_e.get('@type') != 'InteractionCounter':
1392                     continue
1393                 interaction_type = extract_interaction_type(is_e)
1394                 if not interaction_type:
1395                     continue
1396                 # For interaction count some sites provide string instead of
1397                 # an integer (as per spec) with non digit characters (e.g. ",")
1398                 # so extracting count with more relaxed str_to_int
1399                 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1400                 if interaction_count is None:
1401                     continue
1402                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1403                 if not count_kind:
1404                     continue
1405                 count_key = '%s_count' % count_kind
1406                 if info.get(count_key) is not None:
1407                     continue
1408                 info[count_key] = interaction_count
1409
1410         def extract_video_object(e):
1411             assert e['@type'] == 'VideoObject'
1412             author = e.get('author')
1413             info.update({
1414                 'url': url_or_none(e.get('contentUrl')),
1415                 'title': unescapeHTML(e.get('name')),
1416                 'description': unescapeHTML(e.get('description')),
1417                 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
1418                 'duration': parse_duration(e.get('duration')),
1419                 'timestamp': unified_timestamp(e.get('uploadDate')),
1420                 # author can be an instance of 'Organization' or 'Person' types.
1421                 # both types can have 'name' property(inherited from 'Thing' type). [1]
1422                 # however some websites are using 'Text' type instead.
1423                 # 1. https://schema.org/VideoObject
1424                 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
1425                 'filesize': float_or_none(e.get('contentSize')),
1426                 'tbr': int_or_none(e.get('bitrate')),
1427                 'width': int_or_none(e.get('width')),
1428                 'height': int_or_none(e.get('height')),
1429                 'view_count': int_or_none(e.get('interactionCount')),
1430             })
1431             extract_interaction_statistic(e)
1432
1433         for e in json_ld:
1434             if '@context' in e:
1435                 item_type = e.get('@type')
1436                 if expected_type is not None and expected_type != item_type:
1437                     continue
1438                 if item_type in ('TVEpisode', 'Episode'):
1439                     episode_name = unescapeHTML(e.get('name'))
1440                     info.update({
1441                         'episode': episode_name,
1442                         'episode_number': int_or_none(e.get('episodeNumber')),
1443                         'description': unescapeHTML(e.get('description')),
1444                     })
1445                     if not info.get('title') and episode_name:
1446                         info['title'] = episode_name
1447                     part_of_season = e.get('partOfSeason')
1448                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1449                         info.update({
1450                             'season': unescapeHTML(part_of_season.get('name')),
1451                             'season_number': int_or_none(part_of_season.get('seasonNumber')),
1452                         })
1453                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1454                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1455                         info['series'] = unescapeHTML(part_of_series.get('name'))
1456                 elif item_type == 'Movie':
1457                     info.update({
1458                         'title': unescapeHTML(e.get('name')),
1459                         'description': unescapeHTML(e.get('description')),
1460                         'duration': parse_duration(e.get('duration')),
1461                         'timestamp': unified_timestamp(e.get('dateCreated')),
1462                     })
1463                 elif item_type in ('Article', 'NewsArticle'):
1464                     info.update({
1465                         'timestamp': parse_iso8601(e.get('datePublished')),
1466                         'title': unescapeHTML(e.get('headline')),
1467                         'description': unescapeHTML(e.get('articleBody')),
1468                     })
1469                 elif item_type == 'VideoObject':
1470                     extract_video_object(e)
1471                     if expected_type is None:
1472                         continue
1473                     else:
1474                         break
1475                 video = e.get('video')
1476                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1477                     extract_video_object(video)
1478                 if expected_type is None:
1479                     continue
1480                 else:
1481                     break
1482         return dict((k, v) for k, v in info.items() if v is not None)
1483
1484     @staticmethod
1485     def _hidden_inputs(html):
1486         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1487         hidden_inputs = {}
1488         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1489             attrs = extract_attributes(input)
1490             if not input:
1491                 continue
1492             if attrs.get('type') not in ('hidden', 'submit'):
1493                 continue
1494             name = attrs.get('name') or attrs.get('id')
1495             value = attrs.get('value')
1496             if name and value is not None:
1497                 hidden_inputs[name] = value
1498         return hidden_inputs
1499
1500     def _form_hidden_inputs(self, form_id, html):
1501         form = self._search_regex(
1502             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1503             html, '%s form' % form_id, group='form')
1504         return self._hidden_inputs(form)
1505
1506     class FormatSort:
1507         regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
1508
1509         default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
1510                    'res', 'fps', 'codec:vp9.2', 'size', 'br', 'asr',
1511                    'proto', 'ext', 'hasaud', 'source', 'format_id')  # These must not be aliases
1512         ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
1513                         'height', 'width', 'proto', 'vext', 'abr', 'aext',
1514                         'fps', 'fs_approx', 'source', 'format_id')
1515
1516         settings = {
1517             'vcodec': {'type': 'ordered', 'regex': True,
1518                        'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1519             'acodec': {'type': 'ordered', 'regex': True,
1520                        'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']},
1521             'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1522                       'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', '.*dash', 'ws|websocket', '', 'mms|rtsp', 'none', 'f4']},
1523             'vext': {'type': 'ordered', 'field': 'video_ext',
1524                      'order': ('mp4', 'webm', 'flv', '', 'none'),
1525                      'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1526             'aext': {'type': 'ordered', 'field': 'audio_ext',
1527                      'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1528                      'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1529             'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1530             'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
1531                            'field': ('vcodec', 'acodec'),
1532                            'function': lambda it: int(any(v != 'none' for v in it))},
1533             'ie_pref': {'priority': True, 'type': 'extractor'},
1534             'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1535             'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1536             'lang': {'convert': 'ignore', 'field': 'language_preference'},
1537             'quality': {'convert': 'float_none', 'default': -1},
1538             'filesize': {'convert': 'bytes'},
1539             'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1540             'id': {'convert': 'string', 'field': 'format_id'},
1541             'height': {'convert': 'float_none'},
1542             'width': {'convert': 'float_none'},
1543             'fps': {'convert': 'float_none'},
1544             'tbr': {'convert': 'float_none'},
1545             'vbr': {'convert': 'float_none'},
1546             'abr': {'convert': 'float_none'},
1547             'asr': {'convert': 'float_none'},
1548             'source': {'convert': 'ignore', 'field': 'source_preference'},
1549
1550             'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1551             'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1552             'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1553             'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1554             'res': {'type': 'multiple', 'field': ('height', 'width'),
1555                     'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
1556
1557             # Most of these exist only for compatibility reasons
1558             'dimension': {'type': 'alias', 'field': 'res'},
1559             'resolution': {'type': 'alias', 'field': 'res'},
1560             'extension': {'type': 'alias', 'field': 'ext'},
1561             'bitrate': {'type': 'alias', 'field': 'br'},
1562             'total_bitrate': {'type': 'alias', 'field': 'tbr'},
1563             'video_bitrate': {'type': 'alias', 'field': 'vbr'},
1564             'audio_bitrate': {'type': 'alias', 'field': 'abr'},
1565             'framerate': {'type': 'alias', 'field': 'fps'},
1566             'language_preference': {'type': 'alias', 'field': 'lang'},  # not named as 'language' because such a field exists
1567             'protocol': {'type': 'alias', 'field': 'proto'},
1568             'source_preference': {'type': 'alias', 'field': 'source'},
1569             'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1570             'filesize_estimate': {'type': 'alias', 'field': 'size'},
1571             'samplerate': {'type': 'alias', 'field': 'asr'},
1572             'video_ext': {'type': 'alias', 'field': 'vext'},
1573             'audio_ext': {'type': 'alias', 'field': 'aext'},
1574             'video_codec': {'type': 'alias', 'field': 'vcodec'},
1575             'audio_codec': {'type': 'alias', 'field': 'acodec'},
1576             'video': {'type': 'alias', 'field': 'hasvid'},
1577             'has_video': {'type': 'alias', 'field': 'hasvid'},
1578             'audio': {'type': 'alias', 'field': 'hasaud'},
1579             'has_audio': {'type': 'alias', 'field': 'hasaud'},
1580             'extractor': {'type': 'alias', 'field': 'ie_pref'},
1581             'preference': {'type': 'alias', 'field': 'ie_pref'},
1582             'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
1583             'format_id': {'type': 'alias', 'field': 'id'},
1584         }
1585
1586         _order = []
1587
1588         def _get_field_setting(self, field, key):
1589             if field not in self.settings:
1590                 self.settings[field] = {}
1591             propObj = self.settings[field]
1592             if key not in propObj:
1593                 type = propObj.get('type')
1594                 if key == 'field':
1595                     default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1596                 elif key == 'convert':
1597                     default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1598                 else:
1599                     default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1600                 propObj[key] = default
1601             return propObj[key]
1602
1603         def _resolve_field_value(self, field, value, convertNone=False):
1604             if value is None:
1605                 if not convertNone:
1606                     return None
1607             else:
1608                 value = value.lower()
1609             conversion = self._get_field_setting(field, 'convert')
1610             if conversion == 'ignore':
1611                 return None
1612             if conversion == 'string':
1613                 return value
1614             elif conversion == 'float_none':
1615                 return float_or_none(value)
1616             elif conversion == 'bytes':
1617                 return FileDownloader.parse_bytes(value)
1618             elif conversion == 'order':
1619                 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1620                 use_regex = self._get_field_setting(field, 'regex')
1621                 list_length = len(order_list)
1622                 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1623                 if use_regex and value is not None:
1624                     for i, regex in enumerate(order_list):
1625                         if regex and re.match(regex, value):
1626                             return list_length - i
1627                     return list_length - empty_pos  # not in list
1628                 else:  # not regex or  value = None
1629                     return list_length - (order_list.index(value) if value in order_list else empty_pos)
1630             else:
1631                 if value.isnumeric():
1632                     return float(value)
1633                 else:
1634                     self.settings[field]['convert'] = 'string'
1635                     return value
1636
1637         def evaluate_params(self, params, sort_extractor):
1638             self._use_free_order = params.get('prefer_free_formats', False)
1639             self._sort_user = params.get('format_sort', [])
1640             self._sort_extractor = sort_extractor
1641
1642             def add_item(field, reverse, closest, limit_text):
1643                 field = field.lower()
1644                 if field in self._order:
1645                     return
1646                 self._order.append(field)
1647                 limit = self._resolve_field_value(field, limit_text)
1648                 data = {
1649                     'reverse': reverse,
1650                     'closest': False if limit is None else closest,
1651                     'limit_text': limit_text,
1652                     'limit': limit}
1653                 if field in self.settings:
1654                     self.settings[field].update(data)
1655                 else:
1656                     self.settings[field] = data
1657
1658             sort_list = (
1659                 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1660                 + (tuple() if params.get('format_sort_force', False)
1661                    else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1662                 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1663
1664             for item in sort_list:
1665                 match = re.match(self.regex, item)
1666                 if match is None:
1667                     raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1668                 field = match.group('field')
1669                 if field is None:
1670                     continue
1671                 if self._get_field_setting(field, 'type') == 'alias':
1672                     field = self._get_field_setting(field, 'field')
1673                 reverse = match.group('reverse') is not None
1674                 closest = match.group('separator') == '~'
1675                 limit_text = match.group('limit')
1676
1677                 has_limit = limit_text is not None
1678                 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1679                 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1680
1681                 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1682                 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
1683                 limit_count = len(limits)
1684                 for (i, f) in enumerate(fields):
1685                     add_item(f, reverse, closest,
1686                              limits[i] if i < limit_count
1687                              else limits[0] if has_limit and not has_multiple_limits
1688                              else None)
1689
1690         def print_verbose_info(self, write_debug):
1691             if self._sort_user:
1692                 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1693             if self._sort_extractor:
1694                 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1695             write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1696                 '+' if self._get_field_setting(field, 'reverse') else '', field,
1697                 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1698                               self._get_field_setting(field, 'limit_text'),
1699                               self._get_field_setting(field, 'limit'))
1700                 if self._get_field_setting(field, 'limit_text') is not None else '')
1701                 for field in self._order if self._get_field_setting(field, 'visible')]))
1702
1703         def _calculate_field_preference_from_value(self, format, field, type, value):
1704             reverse = self._get_field_setting(field, 'reverse')
1705             closest = self._get_field_setting(field, 'closest')
1706             limit = self._get_field_setting(field, 'limit')
1707
1708             if type == 'extractor':
1709                 maximum = self._get_field_setting(field, 'max')
1710                 if value is None or (maximum is not None and value >= maximum):
1711                     value = -1
1712             elif type == 'boolean':
1713                 in_list = self._get_field_setting(field, 'in_list')
1714                 not_in_list = self._get_field_setting(field, 'not_in_list')
1715                 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1716             elif type == 'ordered':
1717                 value = self._resolve_field_value(field, value, True)
1718
1719             # try to convert to number
1720             val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1721             is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1722             if is_num:
1723                 value = val_num
1724
1725             return ((-10, 0) if value is None
1726                     else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
1727                     else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1728                     else (0, value, 0) if not reverse and (limit is None or value <= limit)
1729                     else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1730                     else (-1, value, 0))
1731
1732         def _calculate_field_preference(self, format, field):
1733             type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
1734             get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1735             if type == 'multiple':
1736                 type = 'field'  # Only 'field' is allowed in multiple for now
1737                 actual_fields = self._get_field_setting(field, 'field')
1738
1739                 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1740             else:
1741                 value = get_value(field)
1742             return self._calculate_field_preference_from_value(format, field, type, value)
1743
1744         def calculate_preference(self, format):
1745             # Determine missing protocol
1746             if not format.get('protocol'):
1747                 format['protocol'] = determine_protocol(format)
1748
1749             # Determine missing ext
1750             if not format.get('ext') and 'url' in format:
1751                 format['ext'] = determine_ext(format['url'])
1752             if format.get('vcodec') == 'none':
1753                 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1754                 format['video_ext'] = 'none'
1755             else:
1756                 format['video_ext'] = format['ext']
1757                 format['audio_ext'] = 'none'
1758             # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
1759             #    format['preference'] = -1000
1760
1761             # Determine missing bitrates
1762             if format.get('tbr') is None:
1763                 if format.get('vbr') is not None and format.get('abr') is not None:
1764                     format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1765             else:
1766                 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1767                     format['vbr'] = format.get('tbr') - format.get('abr', 0)
1768                 if format.get('acodec') != 'none' and format.get('abr') is None:
1769                     format['abr'] = format.get('tbr') - format.get('vbr', 0)
1770
1771             return tuple(self._calculate_field_preference(format, field) for field in self._order)
1772
1773     def _sort_formats(self, formats, field_preference=[]):
1774         if not formats:
1775             return
1776         format_sort = self.FormatSort()  # params and to_screen are taken from the downloader
1777         format_sort.evaluate_params(self._downloader.params, field_preference)
1778         if self.get_param('verbose', False):
1779             format_sort.print_verbose_info(self._downloader.write_debug)
1780         formats.sort(key=lambda f: format_sort.calculate_preference(f))
1781
1782     def _check_formats(self, formats, video_id):
1783         if formats:
1784             formats[:] = filter(
1785                 lambda f: self._is_valid_url(
1786                     f['url'], video_id,
1787                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1788                 formats)
1789
1790     @staticmethod
1791     def _remove_duplicate_formats(formats):
1792         format_urls = set()
1793         unique_formats = []
1794         for f in formats:
1795             if f['url'] not in format_urls:
1796                 format_urls.add(f['url'])
1797                 unique_formats.append(f)
1798         formats[:] = unique_formats
1799
1800     def _is_valid_url(self, url, video_id, item='video', headers={}):
1801         url = self._proto_relative_url(url, scheme='http:')
1802         # For now assume non HTTP(S) URLs always valid
1803         if not (url.startswith('http://') or url.startswith('https://')):
1804             return True
1805         try:
1806             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1807             return True
1808         except ExtractorError as e:
1809             self.to_screen(
1810                 '%s: %s URL is invalid, skipping: %s'
1811                 % (video_id, item, error_to_compat_str(e.cause)))
1812             return False
1813
1814     def http_scheme(self):
1815         """ Either "http:" or "https:", depending on the user's preferences """
1816         return (
1817             'http:'
1818             if self.get_param('prefer_insecure', False)
1819             else 'https:')
1820
1821     def _proto_relative_url(self, url, scheme=None):
1822         if url is None:
1823             return url
1824         if url.startswith('//'):
1825             if scheme is None:
1826                 scheme = self.http_scheme()
1827             return scheme + url
1828         else:
1829             return url
1830
1831     def _sleep(self, timeout, video_id, msg_template=None):
1832         if msg_template is None:
1833             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1834         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1835         self.to_screen(msg)
1836         time.sleep(timeout)
1837
1838     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1839                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1840                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1841         manifest = self._download_xml(
1842             manifest_url, video_id, 'Downloading f4m manifest',
1843             'Unable to download f4m manifest',
1844             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1845             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1846             transform_source=transform_source,
1847             fatal=fatal, data=data, headers=headers, query=query)
1848
1849         if manifest is False:
1850             return []
1851
1852         return self._parse_f4m_formats(
1853             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1854             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1855
1856     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1857                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1858                            fatal=True, m3u8_id=None):
1859         if not isinstance(manifest, compat_etree_Element) and not fatal:
1860             return []
1861
1862         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1863         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1864         if akamai_pv is not None and ';' in akamai_pv.text:
1865             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1866             if playerVerificationChallenge.strip() != '':
1867                 return []
1868
1869         formats = []
1870         manifest_version = '1.0'
1871         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1872         if not media_nodes:
1873             manifest_version = '2.0'
1874             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1875         # Remove unsupported DRM protected media from final formats
1876         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1877         media_nodes = remove_encrypted_media(media_nodes)
1878         if not media_nodes:
1879             return formats
1880
1881         manifest_base_url = get_base_url(manifest)
1882
1883         bootstrap_info = xpath_element(
1884             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1885             'bootstrap info', default=None)
1886
1887         vcodec = None
1888         mime_type = xpath_text(
1889             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1890             'base URL', default=None)
1891         if mime_type and mime_type.startswith('audio/'):
1892             vcodec = 'none'
1893
1894         for i, media_el in enumerate(media_nodes):
1895             tbr = int_or_none(media_el.attrib.get('bitrate'))
1896             width = int_or_none(media_el.attrib.get('width'))
1897             height = int_or_none(media_el.attrib.get('height'))
1898             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1899             # If <bootstrapInfo> is present, the specified f4m is a
1900             # stream-level manifest, and only set-level manifests may refer to
1901             # external resources.  See section 11.4 and section 4 of F4M spec
1902             if bootstrap_info is None:
1903                 media_url = None
1904                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1905                 if manifest_version == '2.0':
1906                     media_url = media_el.attrib.get('href')
1907                 if media_url is None:
1908                     media_url = media_el.attrib.get('url')
1909                 if not media_url:
1910                     continue
1911                 manifest_url = (
1912                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1913                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1914                 # If media_url is itself a f4m manifest do the recursive extraction
1915                 # since bitrates in parent manifest (this one) and media_url manifest
1916                 # may differ leading to inability to resolve the format by requested
1917                 # bitrate in f4m downloader
1918                 ext = determine_ext(manifest_url)
1919                 if ext == 'f4m':
1920                     f4m_formats = self._extract_f4m_formats(
1921                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1922                         transform_source=transform_source, fatal=fatal)
1923                     # Sometimes stream-level manifest contains single media entry that
1924                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1925                     # At the same time parent's media entry in set-level manifest may
1926                     # contain it. We will copy it from parent in such cases.
1927                     if len(f4m_formats) == 1:
1928                         f = f4m_formats[0]
1929                         f.update({
1930                             'tbr': f.get('tbr') or tbr,
1931                             'width': f.get('width') or width,
1932                             'height': f.get('height') or height,
1933                             'format_id': f.get('format_id') if not tbr else format_id,
1934                             'vcodec': vcodec,
1935                         })
1936                     formats.extend(f4m_formats)
1937                     continue
1938                 elif ext == 'm3u8':
1939                     formats.extend(self._extract_m3u8_formats(
1940                         manifest_url, video_id, 'mp4', preference=preference,
1941                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1942                     continue
1943             formats.append({
1944                 'format_id': format_id,
1945                 'url': manifest_url,
1946                 'manifest_url': manifest_url,
1947                 'ext': 'flv' if bootstrap_info is not None else None,
1948                 'protocol': 'f4m',
1949                 'tbr': tbr,
1950                 'width': width,
1951                 'height': height,
1952                 'vcodec': vcodec,
1953                 'preference': preference,
1954                 'quality': quality,
1955             })
1956         return formats
1957
1958     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1959         return {
1960             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1961             'url': m3u8_url,
1962             'ext': ext,
1963             'protocol': 'm3u8',
1964             'preference': preference - 100 if preference else -100,
1965             'quality': quality,
1966             'resolution': 'multiple',
1967             'format_note': 'Quality selection URL',
1968         }
1969
1970     def _report_ignoring_subs(self, name):
1971         self.report_warning(bug_reports_message(
1972             f'Ignoring subtitle tracks found in the {name} manifest; '
1973             'if any subtitle tracks are missing,'
1974         ), only_once=True)
1975
1976     def _extract_m3u8_formats(self, *args, **kwargs):
1977         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
1978         if subs:
1979             self._report_ignoring_subs('HLS')
1980         return fmts
1981
1982     def _extract_m3u8_formats_and_subtitles(
1983             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
1984             preference=None, quality=None, m3u8_id=None, note=None,
1985             errnote=None, fatal=True, live=False, data=None, headers={},
1986             query={}):
1987
1988         res = self._download_webpage_handle(
1989             m3u8_url, video_id,
1990             note='Downloading m3u8 information' if note is None else note,
1991             errnote='Failed to download m3u8 information' if errnote is None else errnote,
1992             fatal=fatal, data=data, headers=headers, query=query)
1993
1994         if res is False:
1995             return [], {}
1996
1997         m3u8_doc, urlh = res
1998         m3u8_url = urlh.geturl()
1999
2000         return self._parse_m3u8_formats_and_subtitles(
2001             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2002             preference=preference, quality=quality, m3u8_id=m3u8_id,
2003             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2004             headers=headers, query=query, video_id=video_id)
2005
2006     def _parse_m3u8_formats_and_subtitles(
2007             self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8_native',
2008             preference=None, quality=None, m3u8_id=None, live=False, note=None,
2009             errnote=None, fatal=True, data=None, headers={}, query={},
2010             video_id=None):
2011         formats, subtitles = [], {}
2012
2013         if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
2014             return formats, subtitles
2015
2016         has_drm = re.search(r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', m3u8_doc)
2017
2018         def format_url(url):
2019             return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
2020
2021         if self.get_param('hls_split_discontinuity', False):
2022             def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2023                 if not m3u8_doc:
2024                     if not manifest_url:
2025                         return []
2026                     m3u8_doc = self._download_webpage(
2027                         manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2028                         note=False, errnote='Failed to download m3u8 playlist information')
2029                     if m3u8_doc is False:
2030                         return []
2031                 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2032
2033         else:
2034             def _extract_m3u8_playlist_indices(*args, **kwargs):
2035                 return [None]
2036
2037         # References:
2038         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2039         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2040         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2041
2042         # We should try extracting formats only from master playlists [1, 4.3.4],
2043         # i.e. playlists that describe available qualities. On the other hand
2044         # media playlists [1, 4.3.3] should be returned as is since they contain
2045         # just the media without qualities renditions.
2046         # Fortunately, master playlist can be easily distinguished from media
2047         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
2048         # master playlist tags MUST NOT appear in a media playlist and vice versa.
2049         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2050         # media playlist and MUST NOT appear in master playlist thus we can
2051         # clearly detect media playlist with this criterion.
2052
2053         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
2054             formats = [{
2055                 'format_id': '-'.join(map(str, filter(None, [m3u8_id, idx]))),
2056                 'format_index': idx,
2057                 'url': m3u8_url,
2058                 'ext': ext,
2059                 'protocol': entry_protocol,
2060                 'preference': preference,
2061                 'quality': quality,
2062                 'has_drm': has_drm,
2063             } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2064
2065             return formats, subtitles
2066
2067         groups = {}
2068         last_stream_inf = {}
2069
2070         def extract_media(x_media_line):
2071             media = parse_m3u8_attributes(x_media_line)
2072             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2073             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2074             if not (media_type and group_id and name):
2075                 return
2076             groups.setdefault(group_id, []).append(media)
2077             # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2078             if media_type == 'SUBTITLES':
2079                 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2080                 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2081                 # However, lack of URI has been spotted in the wild.
2082                 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2083                 if not media.get('URI'):
2084                     return
2085                 url = format_url(media['URI'])
2086                 sub_info = {
2087                     'url': url,
2088                     'ext': determine_ext(url),
2089                 }
2090                 if sub_info['ext'] == 'm3u8':
2091                     # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2092                     # files may contain is WebVTT:
2093                     # <https://tools.ietf.org/html/rfc8216#section-3.1>
2094                     sub_info['ext'] = 'vtt'
2095                     sub_info['protocol'] = 'm3u8_native'
2096                 lang = media.get('LANGUAGE') or 'und'
2097                 subtitles.setdefault(lang, []).append(sub_info)
2098             if media_type not in ('VIDEO', 'AUDIO'):
2099                 return
2100             media_url = media.get('URI')
2101             if media_url:
2102                 manifest_url = format_url(media_url)
2103                 formats.extend({
2104                     'format_id': '-'.join(map(str, filter(None, (m3u8_id, group_id, name, idx)))),
2105                     'format_note': name,
2106                     'format_index': idx,
2107                     'url': manifest_url,
2108                     'manifest_url': m3u8_url,
2109                     'language': media.get('LANGUAGE'),
2110                     'ext': ext,
2111                     'protocol': entry_protocol,
2112                     'preference': preference,
2113                     'quality': quality,
2114                     'vcodec': 'none' if media_type == 'AUDIO' else None,
2115                 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2116
2117         def build_stream_name():
2118             # Despite specification does not mention NAME attribute for
2119             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2120             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2121             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2122             stream_name = last_stream_inf.get('NAME')
2123             if stream_name:
2124                 return stream_name
2125             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2126             # from corresponding rendition group
2127             stream_group_id = last_stream_inf.get('VIDEO')
2128             if not stream_group_id:
2129                 return
2130             stream_group = groups.get(stream_group_id)
2131             if not stream_group:
2132                 return stream_group_id
2133             rendition = stream_group[0]
2134             return rendition.get('NAME') or stream_group_id
2135
2136         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2137         # chance to detect video only formats when EXT-X-STREAM-INF tags
2138         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2139         for line in m3u8_doc.splitlines():
2140             if line.startswith('#EXT-X-MEDIA:'):
2141                 extract_media(line)
2142
2143         for line in m3u8_doc.splitlines():
2144             if line.startswith('#EXT-X-STREAM-INF:'):
2145                 last_stream_inf = parse_m3u8_attributes(line)
2146             elif line.startswith('#') or not line.strip():
2147                 continue
2148             else:
2149                 tbr = float_or_none(
2150                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2151                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2152                 manifest_url = format_url(line.strip())
2153
2154                 for idx in _extract_m3u8_playlist_indices(manifest_url):
2155                     format_id = [m3u8_id, None, idx]
2156                     # Bandwidth of live streams may differ over time thus making
2157                     # format_id unpredictable. So it's better to keep provided
2158                     # format_id intact.
2159                     if not live:
2160                         stream_name = build_stream_name()
2161                         format_id[1] = stream_name if stream_name else '%d' % (tbr if tbr else len(formats))
2162                     f = {
2163                         'format_id': '-'.join(map(str, filter(None, format_id))),
2164                         'format_index': idx,
2165                         'url': manifest_url,
2166                         'manifest_url': m3u8_url,
2167                         'tbr': tbr,
2168                         'ext': ext,
2169                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2170                         'protocol': entry_protocol,
2171                         'preference': preference,
2172                         'quality': quality,
2173                     }
2174                     resolution = last_stream_inf.get('RESOLUTION')
2175                     if resolution:
2176                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2177                         if mobj:
2178                             f['width'] = int(mobj.group('width'))
2179                             f['height'] = int(mobj.group('height'))
2180                     # Unified Streaming Platform
2181                     mobj = re.search(
2182                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2183                     if mobj:
2184                         abr, vbr = mobj.groups()
2185                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2186                         f.update({
2187                             'vbr': vbr,
2188                             'abr': abr,
2189                         })
2190                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2191                     f.update(codecs)
2192                     audio_group_id = last_stream_inf.get('AUDIO')
2193                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2194                     # references a rendition group MUST have a CODECS attribute.
2195                     # However, this is not always respected, for example, [2]
2196                     # contains EXT-X-STREAM-INF tag which references AUDIO
2197                     # rendition group but does not have CODECS and despite
2198                     # referencing an audio group it represents a complete
2199                     # (with audio and video) format. So, for such cases we will
2200                     # ignore references to rendition groups and treat them
2201                     # as complete formats.
2202                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2203                         audio_group = groups.get(audio_group_id)
2204                         if audio_group and audio_group[0].get('URI'):
2205                             # TODO: update acodec for audio only formats with
2206                             # the same GROUP-ID
2207                             f['acodec'] = 'none'
2208                     if not f.get('ext'):
2209                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2210                     formats.append(f)
2211
2212                     # for DailyMotion
2213                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2214                     if progressive_uri:
2215                         http_f = f.copy()
2216                         del http_f['manifest_url']
2217                         http_f.update({
2218                             'format_id': f['format_id'].replace('hls-', 'http-'),
2219                             'protocol': 'http',
2220                             'url': progressive_uri,
2221                         })
2222                         formats.append(http_f)
2223
2224                 last_stream_inf = {}
2225         return formats, subtitles
2226
2227     def _extract_m3u8_vod_duration(
2228             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2229
2230         m3u8_vod = self._download_webpage(
2231             m3u8_vod_url, video_id,
2232             note='Downloading m3u8 VOD manifest' if note is None else note,
2233             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2234             fatal=False, data=data, headers=headers, query=query)
2235
2236         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2237
2238     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2239         if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2240             return None
2241
2242         return int(sum(
2243             float(line[len('#EXTINF:'):].split(',')[0])
2244             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2245
2246     @staticmethod
2247     def _xpath_ns(path, namespace=None):
2248         if not namespace:
2249             return path
2250         out = []
2251         for c in path.split('/'):
2252             if not c or c == '.':
2253                 out.append(c)
2254             else:
2255                 out.append('{%s}%s' % (namespace, c))
2256         return '/'.join(out)
2257
2258     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2259         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2260
2261         if smil is False:
2262             assert not fatal
2263             return []
2264
2265         namespace = self._parse_smil_namespace(smil)
2266
2267         fmts = self._parse_smil_formats(
2268             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2269         subs = self._parse_smil_subtitles(
2270             smil, namespace=namespace)
2271
2272         return fmts, subs
2273
2274     def _extract_smil_formats(self, *args, **kwargs):
2275         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2276         if subs:
2277             self._report_ignoring_subs('SMIL')
2278         return fmts
2279
2280     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2281         smil = self._download_smil(smil_url, video_id, fatal=fatal)
2282         if smil is False:
2283             return {}
2284         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2285
2286     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2287         return self._download_xml(
2288             smil_url, video_id, 'Downloading SMIL file',
2289             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2290
2291     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2292         namespace = self._parse_smil_namespace(smil)
2293
2294         formats = self._parse_smil_formats(
2295             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2296         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2297
2298         video_id = os.path.splitext(url_basename(smil_url))[0]
2299         title = None
2300         description = None
2301         upload_date = None
2302         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2303             name = meta.attrib.get('name')
2304             content = meta.attrib.get('content')
2305             if not name or not content:
2306                 continue
2307             if not title and name == 'title':
2308                 title = content
2309             elif not description and name in ('description', 'abstract'):
2310                 description = content
2311             elif not upload_date and name == 'date':
2312                 upload_date = unified_strdate(content)
2313
2314         thumbnails = [{
2315             'id': image.get('type'),
2316             'url': image.get('src'),
2317             'width': int_or_none(image.get('width')),
2318             'height': int_or_none(image.get('height')),
2319         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2320
2321         return {
2322             'id': video_id,
2323             'title': title or video_id,
2324             'description': description,
2325             'upload_date': upload_date,
2326             'thumbnails': thumbnails,
2327             'formats': formats,
2328             'subtitles': subtitles,
2329         }
2330
2331     def _parse_smil_namespace(self, smil):
2332         return self._search_regex(
2333             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2334
2335     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2336         base = smil_url
2337         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2338             b = meta.get('base') or meta.get('httpBase')
2339             if b:
2340                 base = b
2341                 break
2342
2343         formats = []
2344         rtmp_count = 0
2345         http_count = 0
2346         m3u8_count = 0
2347         imgs_count = 0
2348
2349         srcs = set()
2350         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2351         for medium in media:
2352             src = medium.get('src')
2353             if not src or src in srcs:
2354                 continue
2355             srcs.add(src)
2356
2357             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2358             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2359             width = int_or_none(medium.get('width'))
2360             height = int_or_none(medium.get('height'))
2361             proto = medium.get('proto')
2362             ext = medium.get('ext')
2363             src_ext = determine_ext(src)
2364             streamer = medium.get('streamer') or base
2365
2366             if proto == 'rtmp' or streamer.startswith('rtmp'):
2367                 rtmp_count += 1
2368                 formats.append({
2369                     'url': streamer,
2370                     'play_path': src,
2371                     'ext': 'flv',
2372                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2373                     'tbr': bitrate,
2374                     'filesize': filesize,
2375                     'width': width,
2376                     'height': height,
2377                 })
2378                 if transform_rtmp_url:
2379                     streamer, src = transform_rtmp_url(streamer, src)
2380                     formats[-1].update({
2381                         'url': streamer,
2382                         'play_path': src,
2383                     })
2384                 continue
2385
2386             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2387             src_url = src_url.strip()
2388
2389             if proto == 'm3u8' or src_ext == 'm3u8':
2390                 m3u8_formats = self._extract_m3u8_formats(
2391                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2392                 if len(m3u8_formats) == 1:
2393                     m3u8_count += 1
2394                     m3u8_formats[0].update({
2395                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2396                         'tbr': bitrate,
2397                         'width': width,
2398                         'height': height,
2399                     })
2400                 formats.extend(m3u8_formats)
2401             elif src_ext == 'f4m':
2402                 f4m_url = src_url
2403                 if not f4m_params:
2404                     f4m_params = {
2405                         'hdcore': '3.2.0',
2406                         'plugin': 'flowplayer-3.2.0.1',
2407                     }
2408                 f4m_url += '&' if '?' in f4m_url else '?'
2409                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2410                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2411             elif src_ext == 'mpd':
2412                 formats.extend(self._extract_mpd_formats(
2413                     src_url, video_id, mpd_id='dash', fatal=False))
2414             elif re.search(r'\.ism/[Mm]anifest', src_url):
2415                 formats.extend(self._extract_ism_formats(
2416                     src_url, video_id, ism_id='mss', fatal=False))
2417             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2418                 http_count += 1
2419                 formats.append({
2420                     'url': src_url,
2421                     'ext': ext or src_ext or 'flv',
2422                     'format_id': 'http-%d' % (bitrate or http_count),
2423                     'tbr': bitrate,
2424                     'filesize': filesize,
2425                     'width': width,
2426                     'height': height,
2427                 })
2428
2429         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2430             src = medium.get('src')
2431             if not src or src in srcs:
2432                 continue
2433             srcs.add(src)
2434
2435             imgs_count += 1
2436             formats.append({
2437                 'format_id': 'imagestream-%d' % (imgs_count),
2438                 'url': src,
2439                 'ext': mimetype2ext(medium.get('type')),
2440                 'acodec': 'none',
2441                 'vcodec': 'none',
2442                 'width': int_or_none(medium.get('width')),
2443                 'height': int_or_none(medium.get('height')),
2444                 'format_note': 'SMIL storyboards',
2445             })
2446
2447         return formats
2448
2449     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2450         urls = []
2451         subtitles = {}
2452         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2453             src = textstream.get('src')
2454             if not src or src in urls:
2455                 continue
2456             urls.append(src)
2457             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2458             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2459             subtitles.setdefault(lang, []).append({
2460                 'url': src,
2461                 'ext': ext,
2462             })
2463         return subtitles
2464
2465     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2466         xspf = self._download_xml(
2467             xspf_url, playlist_id, 'Downloading xpsf playlist',
2468             'Unable to download xspf manifest', fatal=fatal)
2469         if xspf is False:
2470             return []
2471         return self._parse_xspf(
2472             xspf, playlist_id, xspf_url=xspf_url,
2473             xspf_base_url=base_url(xspf_url))
2474
2475     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2476         NS_MAP = {
2477             'xspf': 'http://xspf.org/ns/0/',
2478             's1': 'http://static.streamone.nl/player/ns/0',
2479         }
2480
2481         entries = []
2482         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2483             title = xpath_text(
2484                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2485             description = xpath_text(
2486                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2487             thumbnail = xpath_text(
2488                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2489             duration = float_or_none(
2490                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2491
2492             formats = []
2493             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2494                 format_url = urljoin(xspf_base_url, location.text)
2495                 if not format_url:
2496                     continue
2497                 formats.append({
2498                     'url': format_url,
2499                     'manifest_url': xspf_url,
2500                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2501                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2502                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2503                 })
2504             self._sort_formats(formats)
2505
2506             entries.append({
2507                 'id': playlist_id,
2508                 'title': title,
2509                 'description': description,
2510                 'thumbnail': thumbnail,
2511                 'duration': duration,
2512                 'formats': formats,
2513             })
2514         return entries
2515
2516     def _extract_mpd_formats(self, *args, **kwargs):
2517         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2518         if subs:
2519             self._report_ignoring_subs('DASH')
2520         return fmts
2521
2522     def _extract_mpd_formats_and_subtitles(
2523             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2524             fatal=True, data=None, headers={}, query={}):
2525         res = self._download_xml_handle(
2526             mpd_url, video_id,
2527             note='Downloading MPD manifest' if note is None else note,
2528             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2529             fatal=fatal, data=data, headers=headers, query=query)
2530         if res is False:
2531             return [], {}
2532         mpd_doc, urlh = res
2533         if mpd_doc is None:
2534             return [], {}
2535         mpd_base_url = base_url(urlh.geturl())
2536
2537         return self._parse_mpd_formats_and_subtitles(
2538             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2539
2540     def _parse_mpd_formats(self, *args, **kwargs):
2541         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2542         if subs:
2543             self._report_ignoring_subs('DASH')
2544         return fmts
2545
2546     def _parse_mpd_formats_and_subtitles(
2547             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2548         """
2549         Parse formats from MPD manifest.
2550         References:
2551          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2552             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2553          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2554         """
2555         if not self.get_param('dynamic_mpd', True):
2556             if mpd_doc.get('type') == 'dynamic':
2557                 return [], {}
2558
2559         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2560
2561         def _add_ns(path):
2562             return self._xpath_ns(path, namespace)
2563
2564         def is_drm_protected(element):
2565             return element.find(_add_ns('ContentProtection')) is not None
2566
2567         def extract_multisegment_info(element, ms_parent_info):
2568             ms_info = ms_parent_info.copy()
2569
2570             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2571             # common attributes and elements.  We will only extract relevant
2572             # for us.
2573             def extract_common(source):
2574                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2575                 if segment_timeline is not None:
2576                     s_e = segment_timeline.findall(_add_ns('S'))
2577                     if s_e:
2578                         ms_info['total_number'] = 0
2579                         ms_info['s'] = []
2580                         for s in s_e:
2581                             r = int(s.get('r', 0))
2582                             ms_info['total_number'] += 1 + r
2583                             ms_info['s'].append({
2584                                 't': int(s.get('t', 0)),
2585                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2586                                 'd': int(s.attrib['d']),
2587                                 'r': r,
2588                             })
2589                 start_number = source.get('startNumber')
2590                 if start_number:
2591                     ms_info['start_number'] = int(start_number)
2592                 timescale = source.get('timescale')
2593                 if timescale:
2594                     ms_info['timescale'] = int(timescale)
2595                 segment_duration = source.get('duration')
2596                 if segment_duration:
2597                     ms_info['segment_duration'] = float(segment_duration)
2598
2599             def extract_Initialization(source):
2600                 initialization = source.find(_add_ns('Initialization'))
2601                 if initialization is not None:
2602                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2603
2604             segment_list = element.find(_add_ns('SegmentList'))
2605             if segment_list is not None:
2606                 extract_common(segment_list)
2607                 extract_Initialization(segment_list)
2608                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2609                 if segment_urls_e:
2610                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2611             else:
2612                 segment_template = element.find(_add_ns('SegmentTemplate'))
2613                 if segment_template is not None:
2614                     extract_common(segment_template)
2615                     media = segment_template.get('media')
2616                     if media:
2617                         ms_info['media'] = media
2618                     initialization = segment_template.get('initialization')
2619                     if initialization:
2620                         ms_info['initialization'] = initialization
2621                     else:
2622                         extract_Initialization(segment_template)
2623             return ms_info
2624
2625         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2626         formats, subtitles = [], {}
2627         stream_numbers = {'audio': 0, 'video': 0}
2628         for period in mpd_doc.findall(_add_ns('Period')):
2629             period_duration = parse_duration(period.get('duration')) or mpd_duration
2630             period_ms_info = extract_multisegment_info(period, {
2631                 'start_number': 1,
2632                 'timescale': 1,
2633             })
2634             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2635                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2636                 for representation in adaptation_set.findall(_add_ns('Representation')):
2637                     representation_attrib = adaptation_set.attrib.copy()
2638                     representation_attrib.update(representation.attrib)
2639                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2640                     mime_type = representation_attrib['mimeType']
2641                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2642
2643                     codecs = representation_attrib.get('codecs', '')
2644                     if content_type not in ('video', 'audio', 'text'):
2645                         if mime_type == 'image/jpeg':
2646                             content_type = mime_type
2647                         elif codecs.split('.')[0] == 'stpp':
2648                             content_type = 'text'
2649                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2650                             content_type = 'text'
2651                         else:
2652                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2653                             continue
2654
2655                     base_url = ''
2656                     for element in (representation, adaptation_set, period, mpd_doc):
2657                         base_url_e = element.find(_add_ns('BaseURL'))
2658                         if base_url_e is not None:
2659                             base_url = base_url_e.text + base_url
2660                             if re.match(r'^https?://', base_url):
2661                                 break
2662                     if mpd_base_url and base_url.startswith('/'):
2663                         base_url = compat_urlparse.urljoin(mpd_base_url, base_url)
2664                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2665                         if not mpd_base_url.endswith('/'):
2666                             mpd_base_url += '/'
2667                         base_url = mpd_base_url + base_url
2668                     representation_id = representation_attrib.get('id')
2669                     lang = representation_attrib.get('lang')
2670                     url_el = representation.find(_add_ns('BaseURL'))
2671                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2672                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2673                     if representation_id is not None:
2674                         format_id = representation_id
2675                     else:
2676                         format_id = content_type
2677                     if mpd_id:
2678                         format_id = mpd_id + '-' + format_id
2679                     if content_type in ('video', 'audio'):
2680                         f = {
2681                             'format_id': format_id,
2682                             'manifest_url': mpd_url,
2683                             'ext': mimetype2ext(mime_type),
2684                             'width': int_or_none(representation_attrib.get('width')),
2685                             'height': int_or_none(representation_attrib.get('height')),
2686                             'tbr': float_or_none(bandwidth, 1000),
2687                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2688                             'fps': int_or_none(representation_attrib.get('frameRate')),
2689                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2690                             'format_note': 'DASH %s' % content_type,
2691                             'filesize': filesize,
2692                             'container': mimetype2ext(mime_type) + '_dash',
2693                             'manifest_stream_number': stream_numbers[content_type]
2694                         }
2695                         f.update(parse_codecs(codecs))
2696                         stream_numbers[content_type] += 1
2697                     elif content_type == 'text':
2698                         f = {
2699                             'ext': mimetype2ext(mime_type),
2700                             'manifest_url': mpd_url,
2701                             'filesize': filesize,
2702                         }
2703                     elif content_type == 'image/jpeg':
2704                         # See test case in VikiIE
2705                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2706                         f = {
2707                             'format_id': format_id,
2708                             'ext': 'mhtml',
2709                             'manifest_url': mpd_url,
2710                             'format_note': 'DASH storyboards (jpeg)',
2711                             'acodec': 'none',
2712                             'vcodec': 'none',
2713                         }
2714                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2715                         f['has_drm'] = True
2716                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2717
2718                     def prepare_template(template_name, identifiers):
2719                         tmpl = representation_ms_info[template_name]
2720                         # First of, % characters outside $...$ templates
2721                         # must be escaped by doubling for proper processing
2722                         # by % operator string formatting used further (see
2723                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2724                         t = ''
2725                         in_template = False
2726                         for c in tmpl:
2727                             t += c
2728                             if c == '$':
2729                                 in_template = not in_template
2730                             elif c == '%' and not in_template:
2731                                 t += c
2732                         # Next, $...$ templates are translated to their
2733                         # %(...) counterparts to be used with % operator
2734                         if representation_id is not None:
2735                             t = t.replace('$RepresentationID$', representation_id)
2736                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2737                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2738                         t.replace('$$', '$')
2739                         return t
2740
2741                     # @initialization is a regular template like @media one
2742                     # so it should be handled just the same way (see
2743                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2744                     if 'initialization' in representation_ms_info:
2745                         initialization_template = prepare_template(
2746                             'initialization',
2747                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2748                             # $Time$ shall not be included for @initialization thus
2749                             # only $Bandwidth$ remains
2750                             ('Bandwidth', ))
2751                         representation_ms_info['initialization_url'] = initialization_template % {
2752                             'Bandwidth': bandwidth,
2753                         }
2754
2755                     def location_key(location):
2756                         return 'url' if re.match(r'^https?://', location) else 'path'
2757
2758                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2759
2760                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2761                         media_location_key = location_key(media_template)
2762
2763                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2764                         # can't be used at the same time
2765                         if '%(Number' in media_template and 's' not in representation_ms_info:
2766                             segment_duration = None
2767                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2768                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2769                                 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2770                             representation_ms_info['fragments'] = [{
2771                                 media_location_key: media_template % {
2772                                     'Number': segment_number,
2773                                     'Bandwidth': bandwidth,
2774                                 },
2775                                 'duration': segment_duration,
2776                             } for segment_number in range(
2777                                 representation_ms_info['start_number'],
2778                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2779                         else:
2780                             # $Number*$ or $Time$ in media template with S list available
2781                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2782                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2783                             representation_ms_info['fragments'] = []
2784                             segment_time = 0
2785                             segment_d = None
2786                             segment_number = representation_ms_info['start_number']
2787
2788                             def add_segment_url():
2789                                 segment_url = media_template % {
2790                                     'Time': segment_time,
2791                                     'Bandwidth': bandwidth,
2792                                     'Number': segment_number,
2793                                 }
2794                                 representation_ms_info['fragments'].append({
2795                                     media_location_key: segment_url,
2796                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2797                                 })
2798
2799                             for num, s in enumerate(representation_ms_info['s']):
2800                                 segment_time = s.get('t') or segment_time
2801                                 segment_d = s['d']
2802                                 add_segment_url()
2803                                 segment_number += 1
2804                                 for r in range(s.get('r', 0)):
2805                                     segment_time += segment_d
2806                                     add_segment_url()
2807                                     segment_number += 1
2808                                 segment_time += segment_d
2809                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2810                         # No media template
2811                         # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2812                         # or any YouTube dashsegments video
2813                         fragments = []
2814                         segment_index = 0
2815                         timescale = representation_ms_info['timescale']
2816                         for s in representation_ms_info['s']:
2817                             duration = float_or_none(s['d'], timescale)
2818                             for r in range(s.get('r', 0) + 1):
2819                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
2820                                 fragments.append({
2821                                     location_key(segment_uri): segment_uri,
2822                                     'duration': duration,
2823                                 })
2824                                 segment_index += 1
2825                         representation_ms_info['fragments'] = fragments
2826                     elif 'segment_urls' in representation_ms_info:
2827                         # Segment URLs with no SegmentTimeline
2828                         # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2829                         # https://github.com/ytdl-org/youtube-dl/pull/14844
2830                         fragments = []
2831                         segment_duration = float_or_none(
2832                             representation_ms_info['segment_duration'],
2833                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2834                         for segment_url in representation_ms_info['segment_urls']:
2835                             fragment = {
2836                                 location_key(segment_url): segment_url,
2837                             }
2838                             if segment_duration:
2839                                 fragment['duration'] = segment_duration
2840                             fragments.append(fragment)
2841                         representation_ms_info['fragments'] = fragments
2842                     # If there is a fragments key available then we correctly recognized fragmented media.
2843                     # Otherwise we will assume unfragmented media with direct access. Technically, such
2844                     # assumption is not necessarily correct since we may simply have no support for
2845                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2846                     if 'fragments' in representation_ms_info:
2847                         f.update({
2848                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
2849                             'url': mpd_url or base_url,
2850                             'fragment_base_url': base_url,
2851                             'fragments': [],
2852                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2853                         })
2854                         if 'initialization_url' in representation_ms_info:
2855                             initialization_url = representation_ms_info['initialization_url']
2856                             if not f.get('url'):
2857                                 f['url'] = initialization_url
2858                             f['fragments'].append({location_key(initialization_url): initialization_url})
2859                         f['fragments'].extend(representation_ms_info['fragments'])
2860                     else:
2861                         # Assuming direct URL to unfragmented media.
2862                         f['url'] = base_url
2863                     if content_type in ('video', 'audio') or mime_type == 'image/jpeg':
2864                         formats.append(f)
2865                     elif content_type == 'text':
2866                         subtitles.setdefault(lang or 'und', []).append(f)
2867
2868         return formats, subtitles
2869
2870     def _extract_ism_formats(self, *args, **kwargs):
2871         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2872         if subs:
2873             self._report_ignoring_subs('ISM')
2874         return fmts
2875
2876     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2877         res = self._download_xml_handle(
2878             ism_url, video_id,
2879             note='Downloading ISM manifest' if note is None else note,
2880             errnote='Failed to download ISM manifest' if errnote is None else errnote,
2881             fatal=fatal, data=data, headers=headers, query=query)
2882         if res is False:
2883             return [], {}
2884         ism_doc, urlh = res
2885         if ism_doc is None:
2886             return [], {}
2887
2888         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
2889
2890     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
2891         """
2892         Parse formats from ISM manifest.
2893         References:
2894          1. [MS-SSTR]: Smooth Streaming Protocol,
2895             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2896         """
2897         if ism_doc.get('IsLive') == 'TRUE':
2898             return [], {}
2899
2900         duration = int(ism_doc.attrib['Duration'])
2901         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2902
2903         formats = []
2904         subtitles = {}
2905         for stream in ism_doc.findall('StreamIndex'):
2906             stream_type = stream.get('Type')
2907             if stream_type not in ('video', 'audio', 'text'):
2908                 continue
2909             url_pattern = stream.attrib['Url']
2910             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2911             stream_name = stream.get('Name')
2912             stream_language = stream.get('Language', 'und')
2913             for track in stream.findall('QualityLevel'):
2914                 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
2915                 # TODO: add support for WVC1 and WMAP
2916                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
2917                     self.report_warning('%s is not a supported codec' % fourcc)
2918                     continue
2919                 tbr = int(track.attrib['Bitrate']) // 1000
2920                 # [1] does not mention Width and Height attributes. However,
2921                 # they're often present while MaxWidth and MaxHeight are
2922                 # missing, so should be used as fallbacks
2923                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2924                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2925                 sampling_rate = int_or_none(track.get('SamplingRate'))
2926
2927                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2928                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2929
2930                 fragments = []
2931                 fragment_ctx = {
2932                     'time': 0,
2933                 }
2934                 stream_fragments = stream.findall('c')
2935                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2936                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2937                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2938                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2939                     if not fragment_ctx['duration']:
2940                         try:
2941                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2942                         except IndexError:
2943                             next_fragment_time = duration
2944                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2945                     for _ in range(fragment_repeat):
2946                         fragments.append({
2947                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2948                             'duration': fragment_ctx['duration'] / stream_timescale,
2949                         })
2950                         fragment_ctx['time'] += fragment_ctx['duration']
2951
2952                 format_id = []
2953                 if ism_id:
2954                     format_id.append(ism_id)
2955                 if stream_name:
2956                     format_id.append(stream_name)
2957                 format_id.append(compat_str(tbr))
2958
2959                 if stream_type == 'text':
2960                     subtitles.setdefault(stream_language, []).append({
2961                         'ext': 'ismt',
2962                         'protocol': 'ism',
2963                         'url': ism_url,
2964                         'manifest_url': ism_url,
2965                         'fragments': fragments,
2966                         '_download_params': {
2967                             'stream_type': stream_type,
2968                             'duration': duration,
2969                             'timescale': stream_timescale,
2970                             'fourcc': fourcc,
2971                             'language': stream_language,
2972                             'codec_private_data': track.get('CodecPrivateData'),
2973                         }
2974                     })
2975                 elif stream_type in ('video', 'audio'):
2976                     formats.append({
2977                         'format_id': '-'.join(format_id),
2978                         'url': ism_url,
2979                         'manifest_url': ism_url,
2980                         'ext': 'ismv' if stream_type == 'video' else 'isma',
2981                         'width': width,
2982                         'height': height,
2983                         'tbr': tbr,
2984                         'asr': sampling_rate,
2985                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
2986                         'acodec': 'none' if stream_type == 'video' else fourcc,
2987                         'protocol': 'ism',
2988                         'fragments': fragments,
2989                         'has_drm': ism_doc.find('Protection') is not None,
2990                         '_download_params': {
2991                             'stream_type': stream_type,
2992                             'duration': duration,
2993                             'timescale': stream_timescale,
2994                             'width': width or 0,
2995                             'height': height or 0,
2996                             'fourcc': fourcc,
2997                             'language': stream_language,
2998                             'codec_private_data': track.get('CodecPrivateData'),
2999                             'sampling_rate': sampling_rate,
3000                             'channels': int_or_none(track.get('Channels', 2)),
3001                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3002                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3003                         },
3004                     })
3005         return formats, subtitles
3006
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
        """
        Build media entries from the HTML5 <video>/<audio> tags (and their
        amp-* / dl8-* variants) found in webpage.

        Each returned entry is a dict with the keys 'formats', 'subtitles'
        and 'thumbnail'. Formats come from the tag's own src attribute and
        from nested <source> tags; subtitles come from nested <track> tags.
        Tags that yield neither formats nor subtitles are left out.

        m3u8_id, m3u8_entry_protocol, preference and quality are passed to
        _extract_m3u8_formats; mpd_id is passed to _extract_mpd_formats.
        Relative URLs are resolved against base_url, which is also set as
        the Referer header of every format.
        """
        def absolute_url(item_url):
            # Resolve a (possibly relative) URL against the page's base URL
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            # Turn a type attribute such as 'video/mp4; codecs="avc1, mp4a"'
            # into a partial format dict (codec fields from parse_codecs plus
            # 'ext'). Returns {} when the value is missing or does not match
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type, type_info={}):
            # Returns (is_plain_url, formats). HLS and DASH manifests are
            # expanded with the manifest extractors (non-fatal); anything else
            # is treated as a direct media URL and yields exactly one format.
            # type_info is only read here, never modified
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, quality=quality, fatal=False)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                }]
            return is_plain_url, formats

        entries = []
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we will include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
        # Self-closing tags first; they have no content, hence the trailing ''
        media_tags = [(media_tag, media_tag_name, media_type, '')
                      for media_tag, media_tag_name, media_type
                      in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
        # Then tags with an explicit closing tag, capturing their inner content
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
        for media_tag, _, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            # A src attribute directly on the media tag
            src = strip_or_none(media_attributes.get('src'))
            if src:
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            if media_content:
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    s_attr = extract_attributes(source_tag)
                    # data-video-src and data-src are non standard but seen
                    # several times in the wild
                    src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
                    if not src:
                        continue
                    f = parse_content_type(s_attr.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                    if is_plain_url:
                        # width, height, res, label and title attributes are
                        # all not standard but seen several times in the wild
                        labels = [
                            s_attr.get(lbl)
                            for lbl in ('label', 'title')
                            if str_or_none(s_attr.get(lbl))
                        ]
                        width = int_or_none(s_attr.get('width'))
                        height = (int_or_none(s_attr.get('height'))
                                  or int_or_none(s_attr.get('res')))
                        # Fall back to a resolution parsed from label/title
                        if not width or not height:
                            for lbl in labels:
                                resolution = parse_resolution(lbl)
                                if not resolution:
                                    continue
                                width = width or resolution.get('width')
                                height = height or resolution.get('height')
                        # Use the first label that contains a bitrate; the
                        # else branch runs only if no label had one
                        for lbl in labels:
                            tbr = parse_bitrate(lbl)
                            if tbr:
                                break
                        else:
                            tbr = None
                        f.update({
                            'width': width,
                            'height': height,
                            'tbr': tbr,
                            'format_id': s_attr.get('label') or s_attr.get('title'),
                        })
                        # Fields from _media_formats (url, vcodec) are applied
                        # last and so override same-named keys already in f
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    # Tracks without a kind are accepted as subtitles as well
                    if not kind or kind in ('subtitles', 'captions'):
                        src = strip_or_none(track_attributes.get('src'))
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            for f in media_info['formats']:
                f.setdefault('http_headers', {})['Referer'] = base_url
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
3129
3130     def _extract_akamai_formats(self, *args, **kwargs):
3131         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3132         if subs:
3133             self._report_ignoring_subs('akamai')
3134         return fmts
3135
3136     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3137         signed = 'hdnea=' in manifest_url
3138         if not signed:
3139             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3140             manifest_url = re.sub(
3141                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3142                 '', manifest_url).strip('?')
3143
3144         formats = []
3145         subtitles = {}
3146
3147         hdcore_sign = 'hdcore=3.7.0'
3148         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3149         hds_host = hosts.get('hds')
3150         if hds_host:
3151             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3152         if 'hdcore=' not in f4m_url:
3153             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3154         f4m_formats = self._extract_f4m_formats(
3155             f4m_url, video_id, f4m_id='hds', fatal=False)
3156         for entry in f4m_formats:
3157             entry.update({'extra_param_to_segment_url': hdcore_sign})
3158         formats.extend(f4m_formats)
3159
3160         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3161         hls_host = hosts.get('hls')
3162         if hls_host:
3163             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3164         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3165             m3u8_url, video_id, 'mp4', 'm3u8_native',
3166             m3u8_id='hls', fatal=False)
3167         formats.extend(m3u8_formats)
3168         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3169
3170         http_host = hosts.get('http')
3171         if http_host and m3u8_formats and not signed:
3172             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3173             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3174             qualities_length = len(qualities)
3175             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3176                 i = 0
3177                 for f in m3u8_formats:
3178                     if f['vcodec'] != 'none':
3179                         for protocol in ('http', 'https'):
3180                             http_f = f.copy()
3181                             del http_f['manifest_url']
3182                             http_url = re.sub(
3183                                 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
3184                             http_f.update({
3185                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3186                                 'url': http_url,
3187                                 'protocol': protocol,
3188                             })
3189                             formats.append(http_f)
3190                         i += 1
3191
3192         return formats, subtitles
3193
3194     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3195         query = compat_urlparse.urlparse(url).query
3196         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3197         mobj = re.search(
3198             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3199         url_base = mobj.group('url')
3200         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3201         formats = []
3202
3203         def manifest_url(manifest):
3204             m_url = '%s/%s' % (http_base_url, manifest)
3205             if query:
3206                 m_url += '?%s' % query
3207             return m_url
3208
3209         if 'm3u8' not in skip_protocols:
3210             formats.extend(self._extract_m3u8_formats(
3211                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3212                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3213         if 'f4m' not in skip_protocols:
3214             formats.extend(self._extract_f4m_formats(
3215                 manifest_url('manifest.f4m'),
3216                 video_id, f4m_id='hds', fatal=False))
3217         if 'dash' not in skip_protocols:
3218             formats.extend(self._extract_mpd_formats(
3219                 manifest_url('manifest.mpd'),
3220                 video_id, mpd_id='dash', fatal=False))
3221         if re.search(r'(?:/smil:|\.smil)', url_base):
3222             if 'smil' not in skip_protocols:
3223                 rtmp_formats = self._extract_smil_formats(
3224                     manifest_url('jwplayer.smil'),
3225                     video_id, fatal=False)
3226                 for rtmp_format in rtmp_formats:
3227                     rtsp_format = rtmp_format.copy()
3228                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3229                     del rtsp_format['play_path']
3230                     del rtsp_format['ext']
3231                     rtsp_format.update({
3232                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3233                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3234                         'protocol': 'rtsp',
3235                     })
3236                     formats.extend([rtmp_format, rtsp_format])
3237         else:
3238             for protocol in ('rtmp', 'rtsp'):
3239                 if protocol not in skip_protocols:
3240                     formats.append({
3241                         'url': '%s:%s' % (protocol, url_base),
3242                         'format_id': protocol,
3243                         'protocol': protocol,
3244                     })
3245         return formats
3246
3247     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3248         mobj = re.search(
3249             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3250             webpage)
3251         if mobj:
3252             try:
3253                 jwplayer_data = self._parse_json(mobj.group('options'),
3254                                                  video_id=video_id,
3255                                                  transform_source=transform_source)
3256             except ExtractorError:
3257                 pass
3258             else:
3259                 if isinstance(jwplayer_data, dict):
3260                     return jwplayer_data
3261
3262     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3263         jwplayer_data = self._find_jwplayer_data(
3264             webpage, video_id, transform_source=js_to_json)
3265         return self._parse_jwplayer_data(
3266             jwplayer_data, video_id, *args, **kwargs)
3267
3268     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3269                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3270         # JWPlayer backward compatibility: flattened playlists
3271         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3272         if 'playlist' not in jwplayer_data:
3273             jwplayer_data = {'playlist': [jwplayer_data]}
3274
3275         entries = []
3276
3277         # JWPlayer backward compatibility: single playlist item
3278         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3279         if not isinstance(jwplayer_data['playlist'], list):
3280             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3281
3282         for video_data in jwplayer_data['playlist']:
3283             # JWPlayer backward compatibility: flattened sources
3284             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3285             if 'sources' not in video_data:
3286                 video_data['sources'] = [video_data]
3287
3288             this_video_id = video_id or video_data['mediaid']
3289
3290             formats = self._parse_jwplayer_formats(
3291                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3292                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3293
3294             subtitles = {}
3295             tracks = video_data.get('tracks')
3296             if tracks and isinstance(tracks, list):
3297                 for track in tracks:
3298                     if not isinstance(track, dict):
3299                         continue
3300                     track_kind = track.get('kind')
3301                     if not track_kind or not isinstance(track_kind, compat_str):
3302                         continue
3303                     if track_kind.lower() not in ('captions', 'subtitles'):
3304                         continue
3305                     track_url = urljoin(base_url, track.get('file'))
3306                     if not track_url:
3307                         continue
3308                     subtitles.setdefault(track.get('label') or 'en', []).append({
3309                         'url': self._proto_relative_url(track_url)
3310                     })
3311
3312             entry = {
3313                 'id': this_video_id,
3314                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3315                 'description': clean_html(video_data.get('description')),
3316                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3317                 'timestamp': int_or_none(video_data.get('pubdate')),
3318                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3319                 'subtitles': subtitles,
3320             }
3321             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3322             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3323                 entry.update({
3324                     '_type': 'url_transparent',
3325                     'url': formats[0]['url'],
3326                 })
3327             else:
3328                 self._sort_formats(formats)
3329                 entry['formats'] = formats
3330             entries.append(entry)
3331         if len(entries) == 1:
3332             return entries[0]
3333         else:
3334             return self.playlist_result(entries)
3335
3336     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3337                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3338         urls = []
3339         formats = []
3340         for source in jwplayer_sources_data:
3341             if not isinstance(source, dict):
3342                 continue
3343             source_url = urljoin(
3344                 base_url, self._proto_relative_url(source.get('file')))
3345             if not source_url or source_url in urls:
3346                 continue
3347             urls.append(source_url)
3348             source_type = source.get('type') or ''
3349             ext = mimetype2ext(source_type) or determine_ext(source_url)
3350             if source_type == 'hls' or ext == 'm3u8':
3351                 formats.extend(self._extract_m3u8_formats(
3352                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3353                     m3u8_id=m3u8_id, fatal=False))
3354             elif source_type == 'dash' or ext == 'mpd':
3355                 formats.extend(self._extract_mpd_formats(
3356                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3357             elif ext == 'smil':
3358                 formats.extend(self._extract_smil_formats(
3359                     source_url, video_id, fatal=False))
3360             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3361             elif source_type.startswith('audio') or ext in (
3362                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3363                 formats.append({
3364                     'url': source_url,
3365                     'vcodec': 'none',
3366                     'ext': ext,
3367                 })
3368             else:
3369                 height = int_or_none(source.get('height'))
3370                 if height is None:
3371                     # Often no height is provided but there is a label in
3372                     # format like "1080p", "720p SD", or 1080.
3373                     height = int_or_none(self._search_regex(
3374                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3375                         'height', default=None))
3376                 a_format = {
3377                     'url': source_url,
3378                     'width': int_or_none(source.get('width')),
3379                     'height': height,
3380                     'tbr': int_or_none(source.get('bitrate')),
3381                     'ext': ext,
3382                 }
3383                 if source_url.startswith('rtmp'):
3384                     a_format['ext'] = 'flv'
3385                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3386                     # of jwplayer.flash.swf
3387                     rtmp_url_parts = re.split(
3388                         r'((?:mp4|mp3|flv):)', source_url, 1)
3389                     if len(rtmp_url_parts) == 3:
3390                         rtmp_url, prefix, play_path = rtmp_url_parts
3391                         a_format.update({
3392                             'url': rtmp_url,
3393                             'play_path': prefix + play_path,
3394                         })
3395                     if rtmp_params:
3396                         a_format.update(rtmp_params)
3397                 formats.append(a_format)
3398         return formats
3399
3400     def _live_title(self, name):
3401         """ Generate the title for a live video """
3402         now = datetime.datetime.now()
3403         now_str = now.strftime('%Y-%m-%d %H:%M')
3404         return name + ' ' + now_str
3405
3406     def _int(self, v, name, fatal=False, **kwargs):
3407         res = int_or_none(v, **kwargs)
3408         if 'get_attr' in kwargs:
3409             print(getattr(v, kwargs['get_attr']))
3410         if res is None:
3411             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3412             if fatal:
3413                 raise ExtractorError(msg)
3414             else:
3415                 self.report_warning(msg)
3416         return res
3417
3418     def _float(self, v, name, fatal=False, **kwargs):
3419         res = float_or_none(v, **kwargs)
3420         if res is None:
3421             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3422             if fatal:
3423                 raise ExtractorError(msg)
3424             else:
3425                 self.report_warning(msg)
3426         return res
3427
3428     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3429                     path='/', secure=False, discard=False, rest={}, **kwargs):
3430         cookie = compat_cookiejar_Cookie(
3431             0, name, value, port, port is not None, domain, True,
3432             domain.startswith('.'), path, True, secure, expire_time,
3433             discard, None, None, rest)
3434         self._downloader.cookiejar.set_cookie(cookie)
3435
3436     def _get_cookies(self, url):
3437         """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3438         req = sanitized_Request(url)
3439         self._downloader.cookiejar.add_cookie_header(req)
3440         return compat_cookies_SimpleCookie(req.get_header('Cookie'))
3441
3442     def _apply_first_set_cookie_header(self, url_handle, cookie):
3443         """
3444         Apply first Set-Cookie header instead of the last. Experimental.
3445
3446         Some sites (e.g. [1-3]) may serve two cookies under the same name
3447         in Set-Cookie header and expect the first (old) one to be set rather
3448         than second (new). However, as of RFC6265 the newer one cookie
3449         should be set into cookie store what actually happens.
3450         We will workaround this issue by resetting the cookie to
3451         the first one manually.
3452         1. https://new.vk.com/
3453         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3454         3. https://learning.oreilly.com/
3455         """
3456         for header, cookies in url_handle.headers.items():
3457             if header.lower() != 'set-cookie':
3458                 continue
3459             if sys.version_info[0] >= 3:
3460                 cookies = cookies.encode('iso-8859-1')
3461             cookies = cookies.decode('utf-8')
3462             cookie_value = re.search(
3463                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3464             if cookie_value:
3465                 value, domain = cookie_value.groups()
3466                 self._set_cookie(domain, cookie, value)
3467                 break
3468
3469     def get_testcases(self, include_onlymatching=False):
3470         t = getattr(self, '_TEST', None)
3471         if t:
3472             assert not hasattr(self, '_TESTS'), \
3473                 '%s has _TEST and _TESTS' % type(self).__name__
3474             tests = [t]
3475         else:
3476             tests = getattr(self, '_TESTS', [])
3477         for t in tests:
3478             if not include_onlymatching and t.get('only_matching', False):
3479                 continue
3480             t['name'] = type(self).__name__[:-len('IE')]
3481             yield t
3482
3483     def is_suitable(self, age_limit):
3484         """ Test whether the extractor is generally suitable for the given
3485         age limit (i.e. pornographic sites are not, all others usually are) """
3486
3487         any_restricted = False
3488         for tc in self.get_testcases(include_onlymatching=False):
3489             if tc.get('playlist', []):
3490                 tc = tc['playlist'][0]
3491             is_restricted = age_restricted(
3492                 tc.get('info_dict', {}).get('age_limit'), age_limit)
3493             if not is_restricted:
3494                 return True
3495             any_restricted = any_restricted or is_restricted
3496         return not any_restricted
3497
3498     def extract_subtitles(self, *args, **kwargs):
3499         if (self.get_param('writesubtitles', False)
3500                 or self.get_param('listsubtitles')):
3501             return self._get_subtitles(*args, **kwargs)
3502         return {}
3503
3504     def _get_subtitles(self, *args, **kwargs):
3505         raise NotImplementedError('This method must be implemented by subclasses')
3506
3507     def extract_comments(self, *args, **kwargs):
3508         if not self.get_param('getcomments'):
3509             return None
3510         generator = self._get_comments(*args, **kwargs)
3511
3512         def extractor():
3513             comments = []
3514             try:
3515                 while True:
3516                     comments.append(next(generator))
3517             except KeyboardInterrupt:
3518                 interrupted = True
3519                 self.to_screen('Interrupted by user')
3520             except StopIteration:
3521                 interrupted = False
3522             comment_count = len(comments)
3523             self.to_screen(f'Extracted {comment_count} comments')
3524             return {
3525                 'comments': comments,
3526                 'comment_count': None if interrupted else comment_count
3527             }
3528         return extractor
3529
3530     def _get_comments(self, *args, **kwargs):
3531         raise NotImplementedError('This method must be implemented by subclasses')
3532
3533     @staticmethod
3534     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3535         """ Merge subtitle items for one language. Items with duplicated URLs
3536         will be dropped. """
3537         list1_urls = set([item['url'] for item in subtitle_list1])
3538         ret = list(subtitle_list1)
3539         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
3540         return ret
3541
3542     @classmethod
3543     def _merge_subtitles(cls, *dicts, target=None):
3544         """ Merge subtitle dictionaries, language by language. """
3545         if target is None:
3546             target = {}
3547         for d in dicts:
3548             for lang, subs in d.items():
3549                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3550         return target
3551
3552     def extract_automatic_captions(self, *args, **kwargs):
3553         if (self.get_param('writeautomaticsub', False)
3554                 or self.get_param('listsubtitles')):
3555             return self._get_automatic_captions(*args, **kwargs)
3556         return {}
3557
3558     def _get_automatic_captions(self, *args, **kwargs):
3559         raise NotImplementedError('This method must be implemented by subclasses')
3560
3561     def mark_watched(self, *args, **kwargs):
3562         if not self.get_param('mark_watched', False):
3563             return
3564         if (self._get_login_info()[0] is not None
3565                 or self.get_param('cookiefile')
3566                 or self.get_param('cookiesfrombrowser')):
3567             self._mark_watched(*args, **kwargs)
3568
3569     def _mark_watched(self, *args, **kwargs):
3570         raise NotImplementedError('This method must be implemented by subclasses')
3571
3572     def geo_verification_headers(self):
3573         headers = {}
3574         geo_verification_proxy = self.get_param('geo_verification_proxy')
3575         if geo_verification_proxy:
3576             headers['Ytdl-request-proxy'] = geo_verification_proxy
3577         return headers
3578
3579     def _generic_id(self, url):
3580         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3581
3582     def _generic_title(self, url):
3583         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3584
3585     @staticmethod
3586     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3587         all_known = all(map(
3588             lambda x: x is not None,
3589             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3590         return (
3591             'private' if is_private
3592             else 'premium_only' if needs_premium
3593             else 'subscriber_only' if needs_subscription
3594             else 'needs_auth' if needs_auth
3595             else 'unlisted' if is_unlisted
3596             else 'public' if all_known
3597             else None)
3598
3599     def _configuration_arg(self, key, default=NO_DEFAULT, casesense=False):
3600         '''
3601         @returns            A list of values for the extractor argument given by "key"
3602                             or "default" if no such key is present
3603         @param default      The default value to return when the key is not present (default: [])
3604         @param casesense    When false, the values are converted to lower case
3605         '''
3606         val = traverse_obj(
3607             self._downloader.params, ('extractor_args', self.ie_key().lower(), key))
3608         if val is None:
3609             return [] if default is NO_DEFAULT else default
3610         return list(val) if casesense else [x.lower() for x in val]
3611
3612
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept queries of the form _SEARCH_KEY<prefix>:{query} where the
    prefix is empty (first result only), a positive number of results, or
    "all" (up to _MAX_RESULTS results).
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex that search queries for this extractor must match"""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query for this extractor"""
        return bool(re.match(cls._make_valid_url(), url))

    def _real_extract(self, query):
        """Parse the search query and fetch the requested number of results"""
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = mobj.group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            # Cap the request at what the site can deliver
            self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        # An infinite n means "no limit" for islice
        limit = None if n == float('inf') else n
        results = itertools.islice(self._search_results(query), 0, limit)
        return self.playlist_result(results, query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        """The search key (query prefix) of this extractor"""
        return self._SEARCH_KEY