yt_dlp/extractor/common.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import base64
   5 import datetime
   6 import hashlib
   7 import json
   8 import netrc
   9 import os
  10 import random
  11 import re
  12 import sys
  13 import time
  14 import math
  15
  16 from ..compat import (
  17     compat_cookiejar_Cookie,
  18     compat_cookies_SimpleCookie,
  19     compat_etree_Element,
  20     compat_etree_fromstring,
  21     compat_expanduser,
  22     compat_getpass,
  23     compat_http_client,
  24     compat_os_name,
  25     compat_str,
  26     compat_urllib_error,
  27     compat_urllib_parse_unquote,
  28     compat_urllib_parse_urlencode,
  29     compat_urllib_request,
  30     compat_urlparse,
  31     compat_xml_parse_error,
  32 )
  33 from ..downloader import FileDownloader
  34 from ..downloader.f4m import (
  35     get_base_url,
  36     remove_encrypted_media,
  37 )
  38 from ..utils import (
  39     age_restricted,
  40     base_url,
  41     bug_reports_message,
  42     clean_html,
  43     compiled_regex_type,
  44     determine_ext,
  45     determine_protocol,
  46     dict_get,
  47     error_to_compat_str,
  48     extract_attributes,
  49     ExtractorError,
  50     fix_xml_ampersands,
  51     float_or_none,
  52     format_field,
  53     GeoRestrictedError,
  54     GeoUtils,
  55     int_or_none,
  56     js_to_json,
  57     JSON_LD_RE,
  58     mimetype2ext,
  59     network_exceptions,
  60     NO_DEFAULT,
  61     orderedSet,
  62     parse_bitrate,
  63     parse_codecs,
  64     parse_duration,
  65     parse_iso8601,
  66     parse_m3u8_attributes,
  67     parse_resolution,
  68     RegexNotFoundError,
  69     sanitize_filename,
  70     sanitized_Request,
  71     str_or_none,
  72     str_to_int,
  73     strip_or_none,
  74     traverse_obj,
  75     unescapeHTML,
  76     unified_strdate,
  77     unified_timestamp,
  78     update_Request,
  79     update_url_query,
  80     url_basename,
  81     url_or_none,
  82     urljoin,
  83     variadic,
  84     xpath_element,
  85     xpath_text,
  86     xpath_with_ns,
  87 )
  88
  89
  90 class InfoExtractor(object):
  91     """Information Extractor class.
  92
  93     Information extractors are the classes that, given a URL, extract
  94     information about the video (or videos) the URL refers to. This
  95     information includes the real video URL, the video title, author and
  96     others. The information is stored in a dictionary which is then
  97     passed to the YoutubeDL. The YoutubeDL processes this
  98     information possibly downloading the video to the file system, among
  99     other possible outcomes.
 100
 101     The type field determines the type of the result.
 102     By far the most common value (and the default if _type is missing) is
 103     "video", which indicates a single video.
 104
 105     For a video, the dictionaries must include the following fields:
 106
 107     id:             Video identifier.
 108     title:          Video title, unescaped.
 109
 110     Additionally, it must contain either a formats entry or a url one:
 111
 112     formats:        A list of dictionaries for each format available, ordered
 113                     from worst to best quality.
 114
 115                     Potential fields:
 116                     * url        The mandatory URL representing the media:
 117                                    for plain file media - HTTP URL of this file,
 118                                    for RTMP - RTMP URL,
 119                                    for HLS - URL of the M3U8 media playlist,
 120                                    for HDS - URL of the F4M manifest,
 121                                    for DASH
 122                                      - HTTP URL to plain file media (in case of
 123                                        unfragmented media)
 124                                      - URL of the MPD manifest or base URL
 125                                        representing the media if MPD manifest
 126                                        is parsed from a string (in case of
 127                                        fragmented media)
 128                                    for MSS - URL of the ISM manifest.
 129                     * manifest_url
 130                                  The URL of the manifest file in case of
 131                                  fragmented media:
 132                                    for HLS - URL of the M3U8 master playlist,
 133                                    for HDS - URL of the F4M manifest,
 134                                    for DASH - URL of the MPD manifest,
 135                                    for MSS - URL of the ISM manifest.
 136                     * ext        Will be calculated from URL if missing
 137                     * format     A human-readable description of the format
 138                                  ("mp4 container with h264/opus").
 139                                  Calculated from the format_id, width, height.
 140                                  and format_note fields if missing.
 141                     * format_id  A short description of the format
 142                                  ("mp4_h264_opus" or "19").
 143                                 Technically optional, but strongly recommended.
 144                     * format_note Additional info about the format
 145                                  ("3D" or "DASH video")
 146                     * width      Width of the video, if known
 147                     * height     Height of the video, if known
 148                     * resolution Textual description of width and height
 149                     * tbr        Average bitrate of audio and video in KBit/s
 150                     * abr        Average audio bitrate in KBit/s
 151                     * acodec     Name of the audio codec in use
 152                     * asr        Audio sampling rate in Hertz
 153                     * vbr        Average video bitrate in KBit/s
 154                     * fps        Frame rate
 155                     * vcodec     Name of the video codec in use
 156                     * container  Name of the container format
 157                     * filesize   The number of bytes, if known in advance
 158                     * filesize_approx  An estimate for the number of bytes
 159                     * player_url SWF Player URL (used for rtmpdump).
 160                     * protocol   The protocol that will be used for the actual
 161                                  download, lower-case.
 162                                  "http", "https", "rtsp", "rtmp", "rtmp_ffmpeg", "rtmpe",
 163                                  "m3u8", "m3u8_native" or "http_dash_segments".
 164                     * fragment_base_url
 165                                  Base URL for fragments. Each fragment's path
 166                                  value (if present) will be relative to
 167                                  this URL.
 168                     * fragments  A list of fragments of a fragmented media.
 169                                  Each fragment entry must contain either an url
 170                                  or a path. If an url is present it should be
 171                                  considered by a client. Otherwise both path and
 172                                  fragment_base_url must be present. Here is
 173                                  the list of all potential fields:
 174                                  * "url" - fragment's URL
 175                                  * "path" - fragment's path relative to
 176                                             fragment_base_url
 177                                  * "duration" (optional, int or float)
 178                                  * "filesize" (optional, int)
 179                     * preference Order number of this format. If this field is
 180                                  present and not None, the formats get sorted
 181                                  by this field, regardless of all other values.
 182                                  -1 for default (order by other properties),
 183                                  -2 or smaller for less than default.
 184                                  < -1000 to hide the format (if there is
 185                                     another one which is strictly better)
 186                     * language   Language code, e.g. "de" or "en-US".
 187                     * language_preference  Is this in the language mentioned in
 188                                  the URL?
 189                                  10 if it's what the URL is about,
 190                                  -1 for default (don't know),
 191                                  -10 otherwise, other values reserved for now.
 192                     * quality    Order number of the video quality of this
 193                                  format, irrespective of the file format.
 194                                  -1 for default (order by other properties),
 195                                  -2 or smaller for less than default.
 196                     * source_preference  Order number for this video source
 197                                   (quality takes higher priority)
 198                                  -1 for default (order by other properties),
 199                                  -2 or smaller for less than default.
 200                     * http_headers  A dictionary of additional HTTP headers
 201                                  to add to the request.
 202                     * stretched_ratio  If given and not 1, indicates that the
 203                                  video's pixels are not square.
 204                                  width : height ratio as float.
 205                     * no_resume  The server does not support resuming the
 206                                  (HTTP or RTMP) download. Boolean.
 207                     * has_drm    The format has DRM and cannot be downloaded. Boolean
 208                     * downloader_options  A dictionary of downloader options as
 209                                  described in FileDownloader
 210                     RTMP formats can also have the additional fields: page_url,
 211                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 212                     rtmp_protocol, rtmp_real_time
 213
 214     url:            Final video URL.
 215     ext:            Video filename extension.
 216     format:         The video format, defaults to ext (used for --get-format)
 217     player_url:     SWF Player URL (used for rtmpdump).
 218
 219     The following fields are optional:
 220
 221     alt_title:      A secondary title of the video.
 222     display_id      An alternative identifier for the video, not necessarily
 223                     unique, but available before title. Typically, id is
 224                     something like "4234987", title "Dancing naked mole rats",
 225                     and display_id "dancing-naked-mole-rats"
 226     thumbnails:     A list of dictionaries, with the following entries:
 227                         * "id" (optional, string) - Thumbnail format ID
 228                         * "url"
 229                         * "preference" (optional, int) - quality of the image
 230                         * "width" (optional, int)
 231                         * "height" (optional, int)
 232                         * "resolution" (optional, string "{width}x{height}",
 233                                         deprecated)
 234                         * "filesize" (optional, int)
 235                         * "_test_url" (optional, bool) - If true, test the URL
 236     thumbnail:      Full URL to a video thumbnail image.
 237     description:    Full video description.
 238     uploader:       Full name of the video uploader.
 239     license:        License name the video is licensed under.
 240     creator:        The creator of the video.
 241     release_timestamp: UNIX timestamp of the moment the video was released.
 242     release_date:   The date (YYYYMMDD) when the video was released.
 243     timestamp:      UNIX timestamp of the moment the video was uploaded
 244     upload_date:    Video upload date (YYYYMMDD).
 245                     If not explicitly set, calculated from timestamp.
 246     uploader_id:    Nickname or id of the video uploader.
 247     uploader_url:   Full URL to a personal webpage of the video uploader.
 248     channel:        Full name of the channel the video is uploaded on.
 249                     Note that channel fields may or may not repeat uploader
 250                     fields. This depends on a particular extractor.
 251     channel_id:     Id of the channel.
 252     channel_url:    Full URL to a channel webpage.
 253     location:       Physical location where the video was filmed.
 254     subtitles:      The available subtitles as a dictionary in the format
 255                     {tag: subformats}. "tag" is usually a language code, and
 256                     "subformats" is a list sorted from lower to higher
 257                     preference, each element is a dictionary with the "ext"
 258                     entry and one of:
 259                         * "data": The subtitles file contents
 260                         * "url": A URL pointing to the subtitles file
 261                     It can optionally also have:
 262                         * "name": Name or description of the subtitles
 263                     "ext" will be calculated from URL if missing
 264     automatic_captions: Like 'subtitles'; contains automatically generated
 265                     captions instead of normal subtitles
 266     duration:       Length of the video in seconds, as an integer or float.
 267     view_count:     How many users have watched the video on the platform.
 268     like_count:     Number of positive ratings of the video
 269     dislike_count:  Number of negative ratings of the video
 270     repost_count:   Number of reposts of the video
 271     average_rating: Average rating give by users, the scale used depends on the webpage
 272     comment_count:  Number of comments on the video
 273     comments:       A list of comments, each with one or more of the following
 274                     properties (all but one of text or html optional):
 275                         * "author" - human-readable name of the comment author
 276                         * "author_id" - user ID of the comment author
 277                         * "author_thumbnail" - The thumbnail of the comment author
 278                         * "id" - Comment ID
 279                         * "html" - Comment as HTML
 280                         * "text" - Plain text of the comment
 281                         * "timestamp" - UNIX timestamp of comment
 282                         * "parent" - ID of the comment this one is replying to.
 283                                      Set to "root" to indicate that this is a
 284                                      comment to the original video.
 285                         * "like_count" - Number of positive ratings of the comment
 286                         * "dislike_count" - Number of negative ratings of the comment
 287                         * "is_favorited" - Whether the comment is marked as
 288                                            favorite by the video uploader
 289                         * "author_is_uploader" - Whether the comment is made by
 290                                                  the video uploader
 291     age_limit:      Age restriction for the video, as an integer (years)
 292     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 293                     should allow to get the same result again. (It will be set
 294                     by YoutubeDL if it's missing)
 295     categories:     A list of categories that the video falls in, for example
 296                     ["Sports", "Berlin"]
 297     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 298     cast:           A list of the video cast
 299     is_live:        True, False, or None (=unknown). Whether this video is a
 300                     live stream that goes on instead of a fixed-length video.
 301     was_live:       True, False, or None (=unknown). Whether this video was
 302                     originally a live stream.
 303     live_status:    'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
 304                     If absent, automatically set from is_live, was_live
 305     start_time:     Time in seconds where the reproduction should start, as
 306                     specified in the URL.
 307     end_time:       Time in seconds where the reproduction should end, as
 308                     specified in the URL.
 309     chapters:       A list of dictionaries, with the following entries:
 310                         * "start_time" - The start time of the chapter in seconds
 311                         * "end_time" - The end time of the chapter in seconds
 312                         * "title" (optional, string)
 313     playable_in_embed: Whether this video is allowed to play in embedded
 314                     players on other sites. Can be True (=always allowed),
 315                     False (=never allowed), None (=unknown), or a string
 316                     specifying the criteria for embedability (Eg: 'whitelist')
 317     availability:   Under what condition the video is available. One of
 318                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 319                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 320                     to set it
 321     __post_extractor: A function to be called just before the metadata is
 322                     written to either disk, logger or console. The function
 323                     must return a dict which will be added to the info_dict.
 324                     This is usefull for additional information that is
 325                     time-consuming to extract. Note that the fields thus
 326                     extracted will not be available to output template and
 327                     match_filter. So, only "comments" and "comment_count" are
 328                     currently allowed to be extracted via this method.
 329
 330     The following fields should only be used when the video belongs to some logical
 331     chapter or section:
 332
 333     chapter:        Name or title of the chapter the video belongs to.
 334     chapter_number: Number of the chapter the video belongs to, as an integer.
 335     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 336
 337     The following fields should only be used when the video is an episode of some
 338     series, programme or podcast:
 339
 340     series:         Title of the series or programme the video episode belongs to.
 341     season:         Title of the season the video episode belongs to.
 342     season_number:  Number of the season the video episode belongs to, as an integer.
 343     season_id:      Id of the season the video episode belongs to, as a unicode string.
 344     episode:        Title of the video episode. Unlike mandatory video title field,
 345                     this field should denote the exact title of the video episode
 346                     without any kind of decoration.
 347     episode_number: Number of the video episode within a season, as an integer.
 348     episode_id:     Id of the video episode, as a unicode string.
 349
 350     The following fields should only be used when the media is a track or a part of
 351     a music album:
 352
 353     track:          Title of the track.
 354     track_number:   Number of the track within an album or a disc, as an integer.
 355     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 356                     as a unicode string.
 357     artist:         Artist(s) of the track.
 358     genre:          Genre(s) of the track.
 359     album:          Title of the album the track belongs to.
 360     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 361     album_artist:   List of all artists appeared on the album (e.g.
 362                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 363                     and compilations).
 364     disc_number:    Number of the disc or other physical medium the track belongs to,
 365                     as an integer.
 366     release_year:   Year (YYYY) when the album was released.
 367
 368     Unless mentioned otherwise, the fields should be Unicode strings.
 369
 370     Unless mentioned otherwise, None is equivalent to absence of information.
 371
 372
 373     _type "playlist" indicates multiple videos.
 374     There must be a key "entries", which is a list, an iterable, or a PagedList
 375     object, each element of which is a valid dictionary by this specification.
 376
 377     Additionally, playlists can have "id", "title", and any other relevent
 378     attributes with the same semantics as videos (see above).
 379
 380
 381     _type "multi_video" indicates that there are multiple videos that
 382     form a single show, for examples multiple acts of an opera or TV episode.
 383     It must have an entries key like a playlist and contain all the keys
 384     required for a video at the same time.
 385
 386
 387     _type "url" indicates that the video must be extracted from another
 388     location, possibly by a different extractor. Its only required key is:
 389     "url" - the next URL to extract.
 390     The key "ie_key" can be set to the class name (minus the trailing "IE",
 391     e.g. "Youtube") if the extractor class is known in advance.
 392     Additionally, the dictionary may have any properties of the resolved entity
 393     known in advance, for example "title" if the title of the referred video is
 394     known ahead of time.
 395
 396
 397     _type "url_transparent" entities have the same specification as "url", but
 398     indicate that the given additional information is more precise than the one
 399     associated with the resolved URL.
 400     This is useful when a site employs a video service that hosts the video and
 401     its technical metadata, but that video service does not embed a useful
 402     title, description etc.
 403
 404
 405     Subclasses of this one should re-define the _real_initialize() and
 406     _real_extract() methods and define a _VALID_URL regexp.
 407     Probably, they should also be added to the list of extractors.
 408
 409     Subclasses may also override suitable() if necessary, but ensure the function
 410     signature is preserved and that this function imports everything it needs
 411     (except other extractors), so that lazy_extractors works correctly
 412
 413     _GEO_BYPASS attribute may be set to False in order to disable
 414     geo restriction bypass mechanisms for a particular extractor.
 415     Though it won't disable explicit geo restriction bypass based on
 416     country code provided with geo_bypass_country.
 417
 418     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 419     countries for this extractor. One of these countries will be used by
 420     geo restriction bypass mechanism right away in order to bypass
 421     geo restriction, of course, if the mechanism is not disabled.
 422
 423     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 424     IP blocks in CIDR notation for this extractor. One of these IP blocks
 425     will be used by geo restriction bypass mechanism similarly
 426     to _GEO_COUNTRIES.
 427
 428     The _WORKING attribute should be set to False for broken IEs
 429     in order to warn the users and skip the tests.
 430     """
 431
 432     _ready = False
 433     _downloader = None
 434     _x_forwarded_for_ip = None
 435     _GEO_BYPASS = True
 436     _GEO_COUNTRIES = None
 437     _GEO_IP_BLOCKS = None
 438     _WORKING = True
 439
 440     _LOGIN_HINTS = {
 441         'any': 'Use --cookies, --username and --password or --netrc to provide account credentials',
 442         'cookies': (
 443             'Use --cookies for the authentication. '
 444             'See  https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl  for how to pass cookies'),
 445         'password': 'Use --username and --password or --netrc to provide account credentials',
 446     }
 447
 448     def __init__(self, downloader=None):
 449         """Constructor. Receives an optional downloader."""
 450         self._ready = False
 451         self._x_forwarded_for_ip = None
 452         self._printed_messages = set()
 453         self.set_downloader(downloader)
 454
 455     @classmethod
 456     def _match_valid_url(cls, url):
 457         # This does not use has/getattr intentionally - we want to know whether
 458         # we have cached the regexp for *this* class, whereas getattr would also
 459         # match the superclass
 460         if '_VALID_URL_RE' not in cls.__dict__:
 461             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 462         return cls._VALID_URL_RE.match(url)
 463
 464     @classmethod
 465     def suitable(cls, url):
 466         """Receives a URL and returns True if suitable for this IE."""
 467         # This function must import everything it needs (except other extractors),
 468         # so that lazy_extractors works correctly
 469         return cls._match_valid_url(url) is not None
 470
 471     @classmethod
 472     def _match_id(cls, url):
 473         return cls._match_valid_url(url).group('id')
 474
 475     @classmethod
 476     def get_temp_id(cls, url):
 477         try:
 478             return cls._match_id(url)
 479         except (IndexError, AttributeError):
 480             return None
 481
 482     @classmethod
 483     def working(cls):
 484         """Getter method for _WORKING."""
 485         return cls._WORKING
 486
 487     def initialize(self):
 488         """Initializes an instance (authentication, etc)."""
 489         self._printed_messages = set()
 490         self._initialize_geo_bypass({
 491             'countries': self._GEO_COUNTRIES,
 492             'ip_blocks': self._GEO_IP_BLOCKS,
 493         })
 494         if not self._ready:
 495             self._real_initialize()
 496             self._ready = True
 497
 498     def _initialize_geo_bypass(self, geo_bypass_context):
 499         """
 500         Initialize geo restriction bypass mechanism.
 501
 502         This method is used to initialize geo bypass mechanism based on faking
 503         X-Forwarded-For HTTP header. A random country from provided country list
 504         is selected and a random IP belonging to this country is generated. This
 505         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 506         HTTP requests.
 507
 508         This method will be used for initial geo bypass mechanism initialization
 509         during the instance initialization with _GEO_COUNTRIES and
 510         _GEO_IP_BLOCKS.
 511
 512         You may also manually call it from extractor's code if geo bypass
 513         information is not available beforehand (e.g. obtained during
 514         extraction) or due to some other reason. In this case you should pass
 515         this information in geo bypass context passed as first argument. It may
 516         contain following fields:
 517
 518         countries:  List of geo unrestricted countries (similar
 519                     to _GEO_COUNTRIES)
 520         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 521                     (similar to _GEO_IP_BLOCKS)
 522
 523         """
 524         if not self._x_forwarded_for_ip:
 525
 526             # Geo bypass mechanism is explicitly disabled by user
 527             if not self.get_param('geo_bypass', True):
 528                 return
 529
 530             if not geo_bypass_context:
 531                 geo_bypass_context = {}
 532
 533             # Backward compatibility: previously _initialize_geo_bypass
 534             # expected a list of countries, some 3rd party code may still use
 535             # it this way
 536             if isinstance(geo_bypass_context, (list, tuple)):
 537                 geo_bypass_context = {
 538                     'countries': geo_bypass_context,
 539                 }
 540
 541             # The whole point of geo bypass mechanism is to fake IP
 542             # as X-Forwarded-For HTTP header based on some IP block or
 543             # country code.
 544
 545             # Path 1: bypassing based on IP block in CIDR notation
 546
 547             # Explicit IP block specified by user, use it right away
 548             # regardless of whether extractor is geo bypassable or not
 549             ip_block = self.get_param('geo_bypass_ip_block', None)
 550
 551             # Otherwise use random IP block from geo bypass context but only
 552             # if extractor is known as geo bypassable
 553             if not ip_block:
 554                 ip_blocks = geo_bypass_context.get('ip_blocks')
 555                 if self._GEO_BYPASS and ip_blocks:
 556                     ip_block = random.choice(ip_blocks)
 557
 558             if ip_block:
 559                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 560                 self._downloader.write_debug(
 561                     '[debug] Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
 562                 return
 563
 564             # Path 2: bypassing based on country code
 565
 566             # Explicit country code specified by user, use it right away
 567             # regardless of whether extractor is geo bypassable or not
 568             country = self.get_param('geo_bypass_country', None)
 569
 570             # Otherwise use random country code from geo bypass context but
 571             # only if extractor is known as geo bypassable
 572             if not country:
 573                 countries = geo_bypass_context.get('countries')
 574                 if self._GEO_BYPASS and countries:
 575                     country = random.choice(countries)
 576
 577             if country:
 578                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 579                 self._downloader.write_debug(
 580                     'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))
 581
 582     def extract(self, url):
 583         """Extracts URL information and returns it in list of dicts."""
 584         try:
 585             for _ in range(2):
 586                 try:
 587                     self.initialize()
 588                     self.write_debug('Extracting URL: %s' % url)
 589                     ie_result = self._real_extract(url)
 590                     if ie_result is None:
 591                         return None
 592                     if self._x_forwarded_for_ip:
 593                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 594                     subtitles = ie_result.get('subtitles')
 595                     if (subtitles and 'live_chat' in subtitles
 596                             and 'no-live-chat' in self.get_param('compat_opts', [])):
 597                         del subtitles['live_chat']
 598                     return ie_result
 599                 except GeoRestrictedError as e:
 600                     if self.__maybe_fake_ip_and_retry(e.countries):
 601                         continue
 602                     raise
 603         except ExtractorError as e:
 604             video_id = e.video_id or self.get_temp_id(url)
 605             raise ExtractorError(
 606                 e.msg, video_id=video_id, ie=self.IE_NAME, tb=e.traceback, expected=e.expected, cause=e.cause)
 607         except compat_http_client.IncompleteRead as e:
 608             raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
 609         except (KeyError, StopIteration) as e:
 610             raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 611
 612     def __maybe_fake_ip_and_retry(self, countries):
 613         if (not self.get_param('geo_bypass_country', None)
 614                 and self._GEO_BYPASS
 615                 and self.get_param('geo_bypass', True)
 616                 and not self._x_forwarded_for_ip
 617                 and countries):
 618             country_code = random.choice(countries)
 619             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 620             if self._x_forwarded_for_ip:
 621                 self.report_warning(
 622                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 623                     % (self._x_forwarded_for_ip, country_code.upper()))
 624                 return True
 625         return False
 626
 627     def set_downloader(self, downloader):
 628         """Sets the downloader for this IE."""
 629         self._downloader = downloader
 630
 631     def _real_initialize(self):
 632         """Real initialization process. Redefine in subclasses."""
 633         pass
 634
 635     def _real_extract(self, url):
 636         """Real extraction process. Redefine in subclasses."""
 637         pass
 638
 639     @classmethod
 640     def ie_key(cls):
 641         """A string for getting the InfoExtractor with get_info_extractor"""
 642         return cls.__name__[:-2]
 643
 644     @property
 645     def IE_NAME(self):
 646         return compat_str(type(self).__name__[:-2])
 647
 648     @staticmethod
 649     def __can_accept_status_code(err, expected_status):
 650         assert isinstance(err, compat_urllib_error.HTTPError)
 651         if expected_status is None:
 652             return False
 653         elif callable(expected_status):
 654             return expected_status(err.code) is True
 655         else:
 656             return err.code in variadic(expected_status)
 657
 658     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
 659         """
 660         Return the response handle.
 661
 662         See _download_webpage docstring for arguments specification.
 663         """
 664         if not self._downloader._first_webpage_request:
 665             sleep_interval = float_or_none(self.get_param('sleep_interval_requests')) or 0
 666             if sleep_interval > 0:
 667                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 668                 time.sleep(sleep_interval)
 669         else:
 670             self._downloader._first_webpage_request = False
 671
 672         if note is None:
 673             self.report_download_webpage(video_id)
 674         elif note is not False:
 675             if video_id is None:
 676                 self.to_screen('%s' % (note,))
 677             else:
 678                 self.to_screen('%s: %s' % (video_id, note))
 679
 680         # Some sites check X-Forwarded-For HTTP header in order to figure out
 681         # the origin of the client behind proxy. This allows bypassing geo
 682         # restriction by faking this header's value to IP that belongs to some
 683         # geo unrestricted country. We will do so once we encounter any
 684         # geo restriction error.
 685         if self._x_forwarded_for_ip:
 686             if 'X-Forwarded-For' not in headers:
 687                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 688
 689         if isinstance(url_or_request, compat_urllib_request.Request):
 690             url_or_request = update_Request(
 691                 url_or_request, data=data, headers=headers, query=query)
 692         else:
 693             if query:
 694                 url_or_request = update_url_query(url_or_request, query)
 695             if data is not None or headers:
 696                 url_or_request = sanitized_Request(url_or_request, data, headers)
 697         try:
 698             return self._downloader.urlopen(url_or_request)
 699         except network_exceptions as err:
 700             if isinstance(err, compat_urllib_error.HTTPError):
 701                 if self.__can_accept_status_code(err, expected_status):
 702                     # Retain reference to error to prevent file object from
 703                     # being closed before it can be read. Works around the
 704                     # effects of <https://bugs.python.org/issue15002>
 705                     # introduced in Python 3.4.1.
 706                     err.fp._error = err
 707                     return err.fp
 708
 709             if errnote is False:
 710                 return False
 711             if errnote is None:
 712                 errnote = 'Unable to download webpage'
 713
 714             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 715             if fatal:
 716                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 717             else:
 718                 self.report_warning(errmsg)
 719                 return False
 720
 721     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
 722         """
 723         Return a tuple (page content as string, URL handle).
 724
 725         See _download_webpage docstring for arguments specification.
 726         """
 727         # Strip hashes from the URL (#1038)
 728         if isinstance(url_or_request, (compat_str, str)):
 729             url_or_request = url_or_request.partition('#')[0]
 730
 731         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 732         if urlh is False:
 733             assert not fatal
 734             return False
 735         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 736         return (content, urlh)
 737
 738     @staticmethod
 739     def _guess_encoding_from_content(content_type, webpage_bytes):
 740         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 741         if m:
 742             encoding = m.group(1)
 743         else:
 744             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 745                           webpage_bytes[:1024])
 746             if m:
 747                 encoding = m.group(1).decode('ascii')
 748             elif webpage_bytes.startswith(b'\xff\xfe'):
 749                 encoding = 'utf-16'
 750             else:
 751                 encoding = 'utf-8'
 752
 753         return encoding
 754
 755     def __check_blocked(self, content):
 756         first_block = content[:512]
 757         if ('<title>Access to this site is blocked</title>' in content
 758                 and 'Websense' in first_block):
 759             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 760             blocked_iframe = self._html_search_regex(
 761                 r'<iframe src="([^"]+)"', content,
 762                 'Websense information URL', default=None)
 763             if blocked_iframe:
 764                 msg += ' Visit %s for more details' % blocked_iframe
 765             raise ExtractorError(msg, expected=True)
 766         if '<title>The URL you requested has been blocked</title>' in first_block:
 767             msg = (
 768                 'Access to this webpage has been blocked by Indian censorship. '
 769                 'Use a VPN or proxy server (with --proxy) to route around it.')
 770             block_msg = self._html_search_regex(
 771                 r'</h1><p>(.*?)</p>',
 772                 content, 'block message', default=None)
 773             if block_msg:
 774                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 775             raise ExtractorError(msg, expected=True)
 776         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 777                 and 'blocklist.rkn.gov.ru' in content):
 778             raise ExtractorError(
 779                 'Access to this webpage has been blocked by decision of the Russian government. '
 780                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 781                 expected=True)
 782
 783     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 784         content_type = urlh.headers.get('Content-Type', '')
 785         webpage_bytes = urlh.read()
 786         if prefix is not None:
 787             webpage_bytes = prefix + webpage_bytes
 788         if not encoding:
 789             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 790         if self.get_param('dump_intermediate_pages', False):
 791             self.to_screen('Dumping request to ' + urlh.geturl())
 792             dump = base64.b64encode(webpage_bytes).decode('ascii')
 793             self._downloader.to_screen(dump)
 794         if self.get_param('write_pages', False):
 795             basen = '%s_%s' % (video_id, urlh.geturl())
 796             trim_length = self.get_param('trim_file_name') or 240
 797             if len(basen) > trim_length:
 798                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 799                 basen = basen[:trim_length - len(h)] + h
 800             raw_filename = basen + '.dump'
 801             filename = sanitize_filename(raw_filename, restricted=True)
 802             self.to_screen('Saving request to ' + filename)
 803             # Working around MAX_PATH limitation on Windows (see
 804             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 805             if compat_os_name == 'nt':
 806                 absfilepath = os.path.abspath(filename)
 807                 if len(absfilepath) > 259:
 808                     filename = '\\\\?\\' + absfilepath
 809             with open(filename, 'wb') as outf:
 810                 outf.write(webpage_bytes)
 811
 812         try:
 813             content = webpage_bytes.decode(encoding, 'replace')
 814         except LookupError:
 815             content = webpage_bytes.decode('utf-8', 'replace')
 816
 817         self.__check_blocked(content)
 818
 819         return content
 820
 821     def _download_webpage(
 822             self, url_or_request, video_id, note=None, errnote=None,
 823             fatal=True, tries=1, timeout=5, encoding=None, data=None,
 824             headers={}, query={}, expected_status=None):
 825         """
 826         Return the data of the page as a string.
 827
 828         Arguments:
 829         url_or_request -- plain text URL as a string or
 830             a compat_urllib_request.Requestobject
 831         video_id -- Video/playlist/item identifier (string)
 832
 833         Keyword arguments:
 834         note -- note printed before downloading (string)
 835         errnote -- note printed in case of an error (string)
 836         fatal -- flag denoting whether error should be considered fatal,
 837             i.e. whether it should cause ExtractionError to be raised,
 838             otherwise a warning will be reported and extraction continued
 839         tries -- number of tries
 840         timeout -- sleep interval between tries
 841         encoding -- encoding for a page content decoding, guessed automatically
 842             when not explicitly specified
 843         data -- POST data (bytes)
 844         headers -- HTTP headers (dict)
 845         query -- URL query (dict)
 846         expected_status -- allows to accept failed HTTP requests (non 2xx
 847             status code) by explicitly specifying a set of accepted status
 848             codes. Can be any of the following entities:
 849                 - an integer type specifying an exact failed status code to
 850                   accept
 851                 - a list or a tuple of integer types specifying a list of
 852                   failed status codes to accept
 853                 - a callable accepting an actual failed status code and
 854                   returning True if it should be accepted
 855             Note that this argument does not affect success status codes (2xx)
 856             which are always accepted.
 857         """
 858
 859         success = False
 860         try_count = 0
 861         while success is False:
 862             try:
 863                 res = self._download_webpage_handle(
 864                     url_or_request, video_id, note, errnote, fatal,
 865                     encoding=encoding, data=data, headers=headers, query=query,
 866                     expected_status=expected_status)
 867                 success = True
 868             except compat_http_client.IncompleteRead as e:
 869                 try_count += 1
 870                 if try_count >= tries:
 871                     raise e
 872                 self._sleep(timeout, video_id)
 873         if res is False:
 874             return res
 875         else:
 876             content, _ = res
 877             return content
 878
 879     def _download_xml_handle(
 880             self, url_or_request, video_id, note='Downloading XML',
 881             errnote='Unable to download XML', transform_source=None,
 882             fatal=True, encoding=None, data=None, headers={}, query={},
 883             expected_status=None):
 884         """
 885         Return a tuple (xml as an compat_etree_Element, URL handle).
 886
 887         See _download_webpage docstring for arguments specification.
 888         """
 889         res = self._download_webpage_handle(
 890             url_or_request, video_id, note, errnote, fatal=fatal,
 891             encoding=encoding, data=data, headers=headers, query=query,
 892             expected_status=expected_status)
 893         if res is False:
 894             return res
 895         xml_string, urlh = res
 896         return self._parse_xml(
 897             xml_string, video_id, transform_source=transform_source,
 898             fatal=fatal), urlh
 899
 900     def _download_xml(
 901             self, url_or_request, video_id,
 902             note='Downloading XML', errnote='Unable to download XML',
 903             transform_source=None, fatal=True, encoding=None,
 904             data=None, headers={}, query={}, expected_status=None):
 905         """
 906         Return the xml as an compat_etree_Element.
 907
 908         See _download_webpage docstring for arguments specification.
 909         """
 910         res = self._download_xml_handle(
 911             url_or_request, video_id, note=note, errnote=errnote,
 912             transform_source=transform_source, fatal=fatal, encoding=encoding,
 913             data=data, headers=headers, query=query,
 914             expected_status=expected_status)
 915         return res if res is False else res[0]
 916
 917     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 918         if transform_source:
 919             xml_string = transform_source(xml_string)
 920         try:
 921             return compat_etree_fromstring(xml_string.encode('utf-8'))
 922         except compat_xml_parse_error as ve:
 923             errmsg = '%s: Failed to parse XML ' % video_id
 924             if fatal:
 925                 raise ExtractorError(errmsg, cause=ve)
 926             else:
 927                 self.report_warning(errmsg + str(ve))
 928
 929     def _download_json_handle(
 930             self, url_or_request, video_id, note='Downloading JSON metadata',
 931             errnote='Unable to download JSON metadata', transform_source=None,
 932             fatal=True, encoding=None, data=None, headers={}, query={},
 933             expected_status=None):
 934         """
 935         Return a tuple (JSON object, URL handle).
 936
 937         See _download_webpage docstring for arguments specification.
 938         """
 939         res = self._download_webpage_handle(
 940             url_or_request, video_id, note, errnote, fatal=fatal,
 941             encoding=encoding, data=data, headers=headers, query=query,
 942             expected_status=expected_status)
 943         if res is False:
 944             return res
 945         json_string, urlh = res
 946         return self._parse_json(
 947             json_string, video_id, transform_source=transform_source,
 948             fatal=fatal), urlh
 949
 950     def _download_json(
 951             self, url_or_request, video_id, note='Downloading JSON metadata',
 952             errnote='Unable to download JSON metadata', transform_source=None,
 953             fatal=True, encoding=None, data=None, headers={}, query={},
 954             expected_status=None):
 955         """
 956         Return the JSON object as a dict.
 957
 958         See _download_webpage docstring for arguments specification.
 959         """
 960         res = self._download_json_handle(
 961             url_or_request, video_id, note=note, errnote=errnote,
 962             transform_source=transform_source, fatal=fatal, encoding=encoding,
 963             data=data, headers=headers, query=query,
 964             expected_status=expected_status)
 965         return res if res is False else res[0]
 966
 967     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 968         if transform_source:
 969             json_string = transform_source(json_string)
 970         try:
 971             return json.loads(json_string)
 972         except ValueError as ve:
 973             errmsg = '%s: Failed to parse JSON ' % video_id
 974             if fatal:
 975                 raise ExtractorError(errmsg, cause=ve)
 976             else:
 977                 self.report_warning(errmsg + str(ve))
 978
 979     def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
 980         return self._parse_json(
 981             data[data.find('{'):data.rfind('}') + 1],
 982             video_id, transform_source, fatal)
 983
 984     def _download_socket_json_handle(
 985             self, url_or_request, video_id, note='Polling socket',
 986             errnote='Unable to poll socket', transform_source=None,
 987             fatal=True, encoding=None, data=None, headers={}, query={},
 988             expected_status=None):
 989         """
 990         Return a tuple (JSON object, URL handle).
 991
 992         See _download_webpage docstring for arguments specification.
 993         """
 994         res = self._download_webpage_handle(
 995             url_or_request, video_id, note, errnote, fatal=fatal,
 996             encoding=encoding, data=data, headers=headers, query=query,
 997             expected_status=expected_status)
 998         if res is False:
 999             return res
1000         webpage, urlh = res
1001         return self._parse_socket_response_as_json(
1002             webpage, video_id, transform_source=transform_source,
1003             fatal=fatal), urlh
1004
1005     def _download_socket_json(
1006             self, url_or_request, video_id, note='Polling socket',
1007             errnote='Unable to poll socket', transform_source=None,
1008             fatal=True, encoding=None, data=None, headers={}, query={},
1009             expected_status=None):
1010         """
1011         Return the JSON object as a dict.
1012
1013         See _download_webpage docstring for arguments specification.
1014         """
1015         res = self._download_socket_json_handle(
1016             url_or_request, video_id, note=note, errnote=errnote,
1017             transform_source=transform_source, fatal=fatal, encoding=encoding,
1018             data=data, headers=headers, query=query,
1019             expected_status=expected_status)
1020         return res if res is False else res[0]
1021
1022     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1023         idstr = format_field(video_id, template='%s: ')
1024         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1025         if only_once:
1026             if f'WARNING: {msg}' in self._printed_messages:
1027                 return
1028             self._printed_messages.add(f'WARNING: {msg}')
1029         self._downloader.report_warning(msg, *args, **kwargs)
1030
1031     def to_screen(self, msg, *args, **kwargs):
1032         """Print msg to screen, prefixing it with '[ie_name]'"""
1033         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1034
1035     def write_debug(self, msg, *args, **kwargs):
1036         self._downloader.write_debug('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1037
1038     def get_param(self, name, default=None, *args, **kwargs):
1039         if self._downloader:
1040             return self._downloader.params.get(name, default, *args, **kwargs)
1041         return default
1042
1043     def report_drm(self, video_id, partial=False):
1044         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1045
1046     def report_extraction(self, id_or_name):
1047         """Report information extraction."""
1048         self.to_screen('%s: Extracting information' % id_or_name)
1049
1050     def report_download_webpage(self, video_id):
1051         """Report webpage download."""
1052         self.to_screen('%s: Downloading webpage' % video_id)
1053
1054     def report_age_confirmation(self):
1055         """Report attempt to confirm age."""
1056         self.to_screen('Confirming age')
1057
1058     def report_login(self):
1059         """Report attempt to log in."""
1060         self.to_screen('Logging in')
1061
1062     def raise_login_required(
1063             self, msg='This video is only available for registered users',
1064             metadata_available=False, method='any'):
1065         if metadata_available and self.get_param('ignore_no_formats_error'):
1066             self.report_warning(msg)
1067         if method is not None:
1068             msg = '%s. %s' % (msg, self._LOGIN_HINTS[method])
1069         raise ExtractorError(msg, expected=True)
1070
1071     def raise_geo_restricted(
1072             self, msg='This video is not available from your location due to geo restriction',
1073             countries=None, metadata_available=False):
1074         if metadata_available and self.get_param('ignore_no_formats_error'):
1075             self.report_warning(msg)
1076         else:
1077             raise GeoRestrictedError(msg, countries=countries)
1078
1079     def raise_no_formats(self, msg, expected=False, video_id=None):
1080         if expected and self.get_param('ignore_no_formats_error'):
1081             self.report_warning(msg, video_id)
1082         elif isinstance(msg, ExtractorError):
1083             raise msg
1084         else:
1085             raise ExtractorError(msg, expected=expected, video_id=video_id)
1086
1087     # Methods for following #608
1088     @staticmethod
1089     def url_result(url, ie=None, video_id=None, video_title=None):
1090         """Returns a URL that points to a page that should be processed"""
1091         # TODO: ie should be the class used for getting the info
1092         video_info = {'_type': 'url',
1093                       'url': url,
1094                       'ie_key': ie}
1095         if video_id is not None:
1096             video_info['id'] = video_id
1097         if video_title is not None:
1098             video_info['title'] = video_title
1099         return video_info
1100
1101     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
1102         urls = orderedSet(
1103             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
1104             for m in matches)
1105         return self.playlist_result(
1106             urls, playlist_id=playlist_id, playlist_title=playlist_title)
1107
1108     @staticmethod
1109     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
1110         """Returns a playlist"""
1111         video_info = {'_type': 'playlist',
1112                       'entries': entries}
1113         video_info.update(kwargs)
1114         if playlist_id:
1115             video_info['id'] = playlist_id
1116         if playlist_title:
1117             video_info['title'] = playlist_title
1118         if playlist_description is not None:
1119             video_info['description'] = playlist_description
1120         return video_info
1121
1122     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1123         """
1124         Perform a regex search on the given string, using a single or a list of
1125         patterns returning the first matching group.
1126         In case of failure return a default value or raise a WARNING or a
1127         RegexNotFoundError, depending on fatal, specifying the field name.
1128         """
1129         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
1130             mobj = re.search(pattern, string, flags)
1131         else:
1132             for p in pattern:
1133                 mobj = re.search(p, string, flags)
1134                 if mobj:
1135                     break
1136
1137         _name = self._downloader._color_text(name, 'blue')
1138
1139         if mobj:
1140             if group is None:
1141                 # return the first matching group
1142                 return next(g for g in mobj.groups() if g is not None)
1143             elif isinstance(group, (list, tuple)):
1144                 return tuple(mobj.group(g) for g in group)
1145             else:
1146                 return mobj.group(group)
1147         elif default is not NO_DEFAULT:
1148             return default
1149         elif fatal:
1150             raise RegexNotFoundError('Unable to extract %s' % _name)
1151         else:
1152             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1153             return None
1154
1155     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1156         """
1157         Like _search_regex, but strips HTML tags and unescapes entities.
1158         """
1159         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1160         if res:
1161             return clean_html(res).strip()
1162         else:
1163             return res
1164
1165     def _get_netrc_login_info(self, netrc_machine=None):
1166         username = None
1167         password = None
1168         netrc_machine = netrc_machine or self._NETRC_MACHINE
1169
1170         if self.get_param('usenetrc', False):
1171             try:
1172                 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1173                 if os.path.isdir(netrc_file):
1174                     netrc_file = os.path.join(netrc_file, '.netrc')
1175                 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1176                 if info is not None:
1177                     username = info[0]
1178                     password = info[2]
1179                 else:
1180                     raise netrc.NetrcParseError(
1181                         'No authenticators for %s' % netrc_machine)
1182             except (IOError, netrc.NetrcParseError) as err:
1183                 self.report_warning(
1184                     'parsing .netrc: %s' % error_to_compat_str(err))
1185
1186         return username, password
1187
1188     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1189         """
1190         Get the login info as (username, password)
1191         First look for the manually specified credentials using username_option
1192         and password_option as keys in params dictionary. If no such credentials
1193         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1194         value.
1195         If there's no info available, return (None, None)
1196         """
1197
1198         # Attempt to use provided username and password or .netrc data
1199         username = self.get_param(username_option)
1200         if username is not None:
1201             password = self.get_param(password_option)
1202         else:
1203             username, password = self._get_netrc_login_info(netrc_machine)
1204
1205         return username, password
1206
1207     def _get_tfa_info(self, note='two-factor verification code'):
1208         """
1209         Get the two-factor authentication info
1210         TODO - asking the user will be required for sms/phone verify
1211         currently just uses the command line option
1212         If there's no info available, return None
1213         """
1214
1215         tfa = self.get_param('twofactor')
1216         if tfa is not None:
1217             return tfa
1218
1219         return compat_getpass('Type %s and press [Return]: ' % note)
1220
1221     # Helper functions for extracting OpenGraph info
1222     @staticmethod
1223     def _og_regexes(prop):
1224         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1225         property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
1226                        % {'prop': re.escape(prop)})
1227         template = r'<meta[^>]+?%s[^>]+?%s'
1228         return [
1229             template % (property_re, content_re),
1230             template % (content_re, property_re),
1231         ]
1232
1233     @staticmethod
1234     def _meta_regex(prop):
1235         return r'''(?isx)<meta
1236                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1237                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1238
1239     def _og_search_property(self, prop, html, name=None, **kargs):
1240         prop = variadic(prop)
1241         if name is None:
1242             name = 'OpenGraph %s' % prop[0]
1243         og_regexes = []
1244         for p in prop:
1245             og_regexes.extend(self._og_regexes(p))
1246         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1247         if escaped is None:
1248             return None
1249         return unescapeHTML(escaped)
1250
1251     def _og_search_thumbnail(self, html, **kargs):
1252         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1253
1254     def _og_search_description(self, html, **kargs):
1255         return self._og_search_property('description', html, fatal=False, **kargs)
1256
1257     def _og_search_title(self, html, **kargs):
1258         return self._og_search_property('title', html, **kargs)
1259
1260     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1261         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1262         if secure:
1263             regexes = self._og_regexes('video:secure_url') + regexes
1264         return self._html_search_regex(regexes, html, name, **kargs)
1265
1266     def _og_search_url(self, html, **kargs):
1267         return self._og_search_property('url', html, **kargs)
1268
1269     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1270         name = variadic(name)
1271         if display_name is None:
1272             display_name = name[0]
1273         return self._html_search_regex(
1274             [self._meta_regex(n) for n in name],
1275             html, display_name, fatal=fatal, group='content', **kwargs)
1276
1277     def _dc_search_uploader(self, html):
1278         return self._html_search_meta('dc.creator', html, 'uploader')
1279
1280     def _rta_search(self, html):
1281         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1282         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1283                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1284                      html):
1285             return 18
1286         return 0
1287
1288     def _media_rating_search(self, html):
1289         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1290         rating = self._html_search_meta('rating', html)
1291
1292         if not rating:
1293             return None
1294
1295         RATING_TABLE = {
1296             'safe for kids': 0,
1297             'general': 8,
1298             '14 years': 14,
1299             'mature': 17,
1300             'restricted': 19,
1301         }
1302         return RATING_TABLE.get(rating.lower())
1303
1304     def _family_friendly_search(self, html):
1305         # See http://schema.org/VideoObject
1306         family_friendly = self._html_search_meta(
1307             'isFamilyFriendly', html, default=None)
1308
1309         if not family_friendly:
1310             return None
1311
1312         RATING_TABLE = {
1313             '1': 0,
1314             'true': 0,
1315             '0': 18,
1316             'false': 18,
1317         }
1318         return RATING_TABLE.get(family_friendly.lower())
1319
1320     def _twitter_search_player(self, html):
1321         return self._html_search_meta('twitter:player', html,
1322                                       'twitter card player')
1323
1324     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1325         json_ld_list = list(re.finditer(JSON_LD_RE, html))
1326         default = kwargs.get('default', NO_DEFAULT)
1327         # JSON-LD may be malformed and thus `fatal` should be respected.
1328         # At the same time `default` may be passed that assumes `fatal=False`
1329         # for _search_regex. Let's simulate the same behavior here as well.
1330         fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
1331         json_ld = []
1332         for mobj in json_ld_list:
1333             json_ld_item = self._parse_json(
1334                 mobj.group('json_ld'), video_id, fatal=fatal)
1335             if not json_ld_item:
1336                 continue
1337             if isinstance(json_ld_item, dict):
1338                 json_ld.append(json_ld_item)
1339             elif isinstance(json_ld_item, (list, tuple)):
1340                 json_ld.extend(json_ld_item)
1341         if json_ld:
1342             json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1343         if json_ld:
1344             return json_ld
1345         if default is not NO_DEFAULT:
1346             return default
1347         elif fatal:
1348             raise RegexNotFoundError('Unable to extract JSON-LD')
1349         else:
1350             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1351             return {}
1352
1353     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1354         if isinstance(json_ld, compat_str):
1355             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1356         if not json_ld:
1357             return {}
1358         info = {}
1359         if not isinstance(json_ld, (list, tuple, dict)):
1360             return info
1361         if isinstance(json_ld, dict):
1362             json_ld = [json_ld]
1363
1364         INTERACTION_TYPE_MAP = {
1365             'CommentAction': 'comment',
1366             'AgreeAction': 'like',
1367             'DisagreeAction': 'dislike',
1368             'LikeAction': 'like',
1369             'DislikeAction': 'dislike',
1370             'ListenAction': 'view',
1371             'WatchAction': 'view',
1372             'ViewAction': 'view',
1373         }
1374
1375         def extract_interaction_type(e):
1376             interaction_type = e.get('interactionType')
1377             if isinstance(interaction_type, dict):
1378                 interaction_type = interaction_type.get('@type')
1379             return str_or_none(interaction_type)
1380
1381         def extract_interaction_statistic(e):
1382             interaction_statistic = e.get('interactionStatistic')
1383             if isinstance(interaction_statistic, dict):
1384                 interaction_statistic = [interaction_statistic]
1385             if not isinstance(interaction_statistic, list):
1386                 return
1387             for is_e in interaction_statistic:
1388                 if not isinstance(is_e, dict):
1389                     continue
1390                 if is_e.get('@type') != 'InteractionCounter':
1391                     continue
1392                 interaction_type = extract_interaction_type(is_e)
1393                 if not interaction_type:
1394                     continue
1395                 # For interaction count some sites provide string instead of
1396                 # an integer (as per spec) with non digit characters (e.g. ",")
1397                 # so extracting count with more relaxed str_to_int
1398                 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1399                 if interaction_count is None:
1400                     continue
1401                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1402                 if not count_kind:
1403                     continue
1404                 count_key = '%s_count' % count_kind
1405                 if info.get(count_key) is not None:
1406                     continue
1407                 info[count_key] = interaction_count
1408
1409         def extract_video_object(e):
1410             assert e['@type'] == 'VideoObject'
1411             author = e.get('author')
1412             info.update({
1413                 'url': url_or_none(e.get('contentUrl')),
1414                 'title': unescapeHTML(e.get('name')),
1415                 'description': unescapeHTML(e.get('description')),
1416                 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
1417                 'duration': parse_duration(e.get('duration')),
1418                 'timestamp': unified_timestamp(e.get('uploadDate')),
1419                 # author can be an instance of 'Organization' or 'Person' types.
1420                 # both types can have 'name' property(inherited from 'Thing' type). [1]
1421                 # however some websites are using 'Text' type instead.
1422                 # 1. https://schema.org/VideoObject
1423                 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
1424                 'filesize': float_or_none(e.get('contentSize')),
1425                 'tbr': int_or_none(e.get('bitrate')),
1426                 'width': int_or_none(e.get('width')),
1427                 'height': int_or_none(e.get('height')),
1428                 'view_count': int_or_none(e.get('interactionCount')),
1429             })
1430             extract_interaction_statistic(e)
1431
1432         for e in json_ld:
1433             if '@context' in e:
1434                 item_type = e.get('@type')
1435                 if expected_type is not None and expected_type != item_type:
1436                     continue
1437                 if item_type in ('TVEpisode', 'Episode'):
1438                     episode_name = unescapeHTML(e.get('name'))
1439                     info.update({
1440                         'episode': episode_name,
1441                         'episode_number': int_or_none(e.get('episodeNumber')),
1442                         'description': unescapeHTML(e.get('description')),
1443                     })
1444                     if not info.get('title') and episode_name:
1445                         info['title'] = episode_name
1446                     part_of_season = e.get('partOfSeason')
1447                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1448                         info.update({
1449                             'season': unescapeHTML(part_of_season.get('name')),
1450                             'season_number': int_or_none(part_of_season.get('seasonNumber')),
1451                         })
1452                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1453                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1454                         info['series'] = unescapeHTML(part_of_series.get('name'))
1455                 elif item_type == 'Movie':
1456                     info.update({
1457                         'title': unescapeHTML(e.get('name')),
1458                         'description': unescapeHTML(e.get('description')),
1459                         'duration': parse_duration(e.get('duration')),
1460                         'timestamp': unified_timestamp(e.get('dateCreated')),
1461                     })
1462                 elif item_type in ('Article', 'NewsArticle'):
1463                     info.update({
1464                         'timestamp': parse_iso8601(e.get('datePublished')),
1465                         'title': unescapeHTML(e.get('headline')),
1466                         'description': unescapeHTML(e.get('articleBody')),
1467                     })
1468                 elif item_type == 'VideoObject':
1469                     extract_video_object(e)
1470                     if expected_type is None:
1471                         continue
1472                     else:
1473                         break
1474                 video = e.get('video')
1475                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1476                     extract_video_object(video)
1477                 if expected_type is None:
1478                     continue
1479                 else:
1480                     break
1481         return dict((k, v) for k, v in info.items() if v is not None)
1482
1483     @staticmethod
1484     def _hidden_inputs(html):
1485         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1486         hidden_inputs = {}
1487         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1488             attrs = extract_attributes(input)
1489             if not input:
1490                 continue
1491             if attrs.get('type') not in ('hidden', 'submit'):
1492                 continue
1493             name = attrs.get('name') or attrs.get('id')
1494             value = attrs.get('value')
1495             if name and value is not None:
1496                 hidden_inputs[name] = value
1497         return hidden_inputs
1498
1499     def _form_hidden_inputs(self, form_id, html):
1500         form = self._search_regex(
1501             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1502             html, '%s form' % form_id, group='form')
1503         return self._hidden_inputs(form)
1504
1505     class FormatSort:
1506         regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
1507
1508         default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
1509                    'res', 'fps', 'codec:vp9.2', 'size', 'br', 'asr',
1510                    'proto', 'ext', 'hasaud', 'source', 'format_id')  # These must not be aliases
1511         ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
1512                         'height', 'width', 'proto', 'vext', 'abr', 'aext',
1513                         'fps', 'fs_approx', 'source', 'format_id')
1514
1515         settings = {
1516             'vcodec': {'type': 'ordered', 'regex': True,
1517                        'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1518             'acodec': {'type': 'ordered', 'regex': True,
1519                        'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']},
1520             'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1521                       'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', '.*dash', 'ws|websocket', '', 'mms|rtsp', 'none', 'f4']},
1522             'vext': {'type': 'ordered', 'field': 'video_ext',
1523                      'order': ('mp4', 'webm', 'flv', '', 'none'),
1524                      'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1525             'aext': {'type': 'ordered', 'field': 'audio_ext',
1526                      'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1527                      'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1528             'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1529             'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
1530                            'field': ('vcodec', 'acodec'),
1531                            'function': lambda it: int(any(v != 'none' for v in it))},
1532             'ie_pref': {'priority': True, 'type': 'extractor'},
1533             'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1534             'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1535             'lang': {'convert': 'ignore', 'field': 'language_preference'},
1536             'quality': {'convert': 'float_none', 'default': -1},
1537             'filesize': {'convert': 'bytes'},
1538             'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1539             'id': {'convert': 'string', 'field': 'format_id'},
1540             'height': {'convert': 'float_none'},
1541             'width': {'convert': 'float_none'},
1542             'fps': {'convert': 'float_none'},
1543             'tbr': {'convert': 'float_none'},
1544             'vbr': {'convert': 'float_none'},
1545             'abr': {'convert': 'float_none'},
1546             'asr': {'convert': 'float_none'},
1547             'source': {'convert': 'ignore', 'field': 'source_preference'},
1548
1549             'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1550             'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1551             'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1552             'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1553             'res': {'type': 'multiple', 'field': ('height', 'width'),
1554                     'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
1555
1556             # Most of these exist only for compatibility reasons
1557             'dimension': {'type': 'alias', 'field': 'res'},
1558             'resolution': {'type': 'alias', 'field': 'res'},
1559             'extension': {'type': 'alias', 'field': 'ext'},
1560             'bitrate': {'type': 'alias', 'field': 'br'},
1561             'total_bitrate': {'type': 'alias', 'field': 'tbr'},
1562             'video_bitrate': {'type': 'alias', 'field': 'vbr'},
1563             'audio_bitrate': {'type': 'alias', 'field': 'abr'},
1564             'framerate': {'type': 'alias', 'field': 'fps'},
1565             'language_preference': {'type': 'alias', 'field': 'lang'},  # not named as 'language' because such a field exists
1566             'protocol': {'type': 'alias', 'field': 'proto'},
1567             'source_preference': {'type': 'alias', 'field': 'source'},
1568             'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1569             'filesize_estimate': {'type': 'alias', 'field': 'size'},
1570             'samplerate': {'type': 'alias', 'field': 'asr'},
1571             'video_ext': {'type': 'alias', 'field': 'vext'},
1572             'audio_ext': {'type': 'alias', 'field': 'aext'},
1573             'video_codec': {'type': 'alias', 'field': 'vcodec'},
1574             'audio_codec': {'type': 'alias', 'field': 'acodec'},
1575             'video': {'type': 'alias', 'field': 'hasvid'},
1576             'has_video': {'type': 'alias', 'field': 'hasvid'},
1577             'audio': {'type': 'alias', 'field': 'hasaud'},
1578             'has_audio': {'type': 'alias', 'field': 'hasaud'},
1579             'extractor': {'type': 'alias', 'field': 'ie_pref'},
1580             'preference': {'type': 'alias', 'field': 'ie_pref'},
1581             'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
1582             'format_id': {'type': 'alias', 'field': 'id'},
1583         }
1584
1585         _order = []
1586
1587         def _get_field_setting(self, field, key):
1588             if field not in self.settings:
1589                 self.settings[field] = {}
1590             propObj = self.settings[field]
1591             if key not in propObj:
1592                 type = propObj.get('type')
1593                 if key == 'field':
1594                     default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1595                 elif key == 'convert':
1596                     default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1597                 else:
1598                     default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1599                 propObj[key] = default
1600             return propObj[key]
1601
1602         def _resolve_field_value(self, field, value, convertNone=False):
1603             if value is None:
1604                 if not convertNone:
1605                     return None
1606             else:
1607                 value = value.lower()
1608             conversion = self._get_field_setting(field, 'convert')
1609             if conversion == 'ignore':
1610                 return None
1611             if conversion == 'string':
1612                 return value
1613             elif conversion == 'float_none':
1614                 return float_or_none(value)
1615             elif conversion == 'bytes':
1616                 return FileDownloader.parse_bytes(value)
1617             elif conversion == 'order':
1618                 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1619                 use_regex = self._get_field_setting(field, 'regex')
1620                 list_length = len(order_list)
1621                 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1622                 if use_regex and value is not None:
1623                     for i, regex in enumerate(order_list):
1624                         if regex and re.match(regex, value):
1625                             return list_length - i
1626                     return list_length - empty_pos  # not in list
1627                 else:  # not regex or  value = None
1628                     return list_length - (order_list.index(value) if value in order_list else empty_pos)
1629             else:
1630                 if value.isnumeric():
1631                     return float(value)
1632                 else:
1633                     self.settings[field]['convert'] = 'string'
1634                     return value
1635
1636         def evaluate_params(self, params, sort_extractor):
1637             self._use_free_order = params.get('prefer_free_formats', False)
1638             self._sort_user = params.get('format_sort', [])
1639             self._sort_extractor = sort_extractor
1640
1641             def add_item(field, reverse, closest, limit_text):
1642                 field = field.lower()
1643                 if field in self._order:
1644                     return
1645                 self._order.append(field)
1646                 limit = self._resolve_field_value(field, limit_text)
1647                 data = {
1648                     'reverse': reverse,
1649                     'closest': False if limit is None else closest,
1650                     'limit_text': limit_text,
1651                     'limit': limit}
1652                 if field in self.settings:
1653                     self.settings[field].update(data)
1654                 else:
1655                     self.settings[field] = data
1656
1657             sort_list = (
1658                 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1659                 + (tuple() if params.get('format_sort_force', False)
1660                    else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1661                 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1662
1663             for item in sort_list:
1664                 match = re.match(self.regex, item)
1665                 if match is None:
1666                     raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1667                 field = match.group('field')
1668                 if field is None:
1669                     continue
1670                 if self._get_field_setting(field, 'type') == 'alias':
1671                     field = self._get_field_setting(field, 'field')
1672                 reverse = match.group('reverse') is not None
1673                 closest = match.group('separator') == '~'
1674                 limit_text = match.group('limit')
1675
1676                 has_limit = limit_text is not None
1677                 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1678                 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1679
1680                 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1681                 limits = limit_text.split(":") if has_multiple_limits else (limit_text,) if has_limit else tuple()
1682                 limit_count = len(limits)
1683                 for (i, f) in enumerate(fields):
1684                     add_item(f, reverse, closest,
1685                              limits[i] if i < limit_count
1686                              else limits[0] if has_limit and not has_multiple_limits
1687                              else None)
1688
1689         def print_verbose_info(self, write_debug):
1690             if self._sort_user:
1691                 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1692             if self._sort_extractor:
1693                 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1694             write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1695                 '+' if self._get_field_setting(field, 'reverse') else '', field,
1696                 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1697                               self._get_field_setting(field, 'limit_text'),
1698                               self._get_field_setting(field, 'limit'))
1699                 if self._get_field_setting(field, 'limit_text') is not None else '')
1700                 for field in self._order if self._get_field_setting(field, 'visible')]))
1701
1702         def _calculate_field_preference_from_value(self, format, field, type, value):
1703             reverse = self._get_field_setting(field, 'reverse')
1704             closest = self._get_field_setting(field, 'closest')
1705             limit = self._get_field_setting(field, 'limit')
1706
1707             if type == 'extractor':
1708                 maximum = self._get_field_setting(field, 'max')
1709                 if value is None or (maximum is not None and value >= maximum):
1710                     value = -1
1711             elif type == 'boolean':
1712                 in_list = self._get_field_setting(field, 'in_list')
1713                 not_in_list = self._get_field_setting(field, 'not_in_list')
1714                 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1715             elif type == 'ordered':
1716                 value = self._resolve_field_value(field, value, True)
1717
1718             # try to convert to number
1719             val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1720             is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1721             if is_num:
1722                 value = val_num
1723
1724             return ((-10, 0) if value is None
1725                     else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
1726                     else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1727                     else (0, value, 0) if not reverse and (limit is None or value <= limit)
1728                     else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1729                     else (-1, value, 0))
1730
1731         def _calculate_field_preference(self, format, field):
1732             type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
1733             get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1734             if type == 'multiple':
1735                 type = 'field'  # Only 'field' is allowed in multiple for now
1736                 actual_fields = self._get_field_setting(field, 'field')
1737
1738                 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1739             else:
1740                 value = get_value(field)
1741             return self._calculate_field_preference_from_value(format, field, type, value)
1742
1743         def calculate_preference(self, format):
1744             # Determine missing protocol
1745             if not format.get('protocol'):
1746                 format['protocol'] = determine_protocol(format)
1747
1748             # Determine missing ext
1749             if not format.get('ext') and 'url' in format:
1750                 format['ext'] = determine_ext(format['url'])
1751             if format.get('vcodec') == 'none':
1752                 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1753                 format['video_ext'] = 'none'
1754             else:
1755                 format['video_ext'] = format['ext']
1756                 format['audio_ext'] = 'none'
1757             # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
1758             #    format['preference'] = -1000
1759
1760             # Determine missing bitrates
1761             if format.get('tbr') is None:
1762                 if format.get('vbr') is not None and format.get('abr') is not None:
1763                     format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1764             else:
1765                 if format.get('vcodec') != "none" and format.get('vbr') is None:
1766                     format['vbr'] = format.get('tbr') - format.get('abr', 0)
1767                 if format.get('acodec') != "none" and format.get('abr') is None:
1768                     format['abr'] = format.get('tbr') - format.get('vbr', 0)
1769
1770             return tuple(self._calculate_field_preference(format, field) for field in self._order)
1771
1772     def _sort_formats(self, formats, field_preference=[]):
1773         if not formats:
1774             return
1775         format_sort = self.FormatSort()  # params and to_screen are taken from the downloader
1776         format_sort.evaluate_params(self._downloader.params, field_preference)
1777         if self.get_param('verbose', False):
1778             format_sort.print_verbose_info(self._downloader.write_debug)
1779         formats.sort(key=lambda f: format_sort.calculate_preference(f))
1780
1781     def _check_formats(self, formats, video_id):
1782         if formats:
1783             formats[:] = filter(
1784                 lambda f: self._is_valid_url(
1785                     f['url'], video_id,
1786                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1787                 formats)
1788
1789     @staticmethod
1790     def _remove_duplicate_formats(formats):
1791         format_urls = set()
1792         unique_formats = []
1793         for f in formats:
1794             if f['url'] not in format_urls:
1795                 format_urls.add(f['url'])
1796                 unique_formats.append(f)
1797         formats[:] = unique_formats
1798
1799     def _is_valid_url(self, url, video_id, item='video', headers={}):
1800         url = self._proto_relative_url(url, scheme='http:')
1801         # For now assume non HTTP(S) URLs always valid
1802         if not (url.startswith('http://') or url.startswith('https://')):
1803             return True
1804         try:
1805             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1806             return True
1807         except ExtractorError as e:
1808             self.to_screen(
1809                 '%s: %s URL is invalid, skipping: %s'
1810                 % (video_id, item, error_to_compat_str(e.cause)))
1811             return False
1812
1813     def http_scheme(self):
1814         """ Either "http:" or "https:", depending on the user's preferences """
1815         return (
1816             'http:'
1817             if self.get_param('prefer_insecure', False)
1818             else 'https:')
1819
1820     def _proto_relative_url(self, url, scheme=None):
1821         if url is None:
1822             return url
1823         if url.startswith('//'):
1824             if scheme is None:
1825                 scheme = self.http_scheme()
1826             return scheme + url
1827         else:
1828             return url
1829
1830     def _sleep(self, timeout, video_id, msg_template=None):
1831         if msg_template is None:
1832             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1833         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1834         self.to_screen(msg)
1835         time.sleep(timeout)
1836
1837     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1838                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1839                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1840         manifest = self._download_xml(
1841             manifest_url, video_id, 'Downloading f4m manifest',
1842             'Unable to download f4m manifest',
1843             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1844             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1845             transform_source=transform_source,
1846             fatal=fatal, data=data, headers=headers, query=query)
1847
1848         if manifest is False:
1849             return []
1850
1851         return self._parse_f4m_formats(
1852             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1853             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1854
1855     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1856                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1857                            fatal=True, m3u8_id=None):
1858         if not isinstance(manifest, compat_etree_Element) and not fatal:
1859             return []
1860
1861         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1862         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1863         if akamai_pv is not None and ';' in akamai_pv.text:
1864             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1865             if playerVerificationChallenge.strip() != '':
1866                 return []
1867
1868         formats = []
1869         manifest_version = '1.0'
1870         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1871         if not media_nodes:
1872             manifest_version = '2.0'
1873             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1874         # Remove unsupported DRM protected media from final formats
1875         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1876         media_nodes = remove_encrypted_media(media_nodes)
1877         if not media_nodes:
1878             return formats
1879
1880         manifest_base_url = get_base_url(manifest)
1881
1882         bootstrap_info = xpath_element(
1883             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1884             'bootstrap info', default=None)
1885
1886         vcodec = None
1887         mime_type = xpath_text(
1888             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1889             'base URL', default=None)
1890         if mime_type and mime_type.startswith('audio/'):
1891             vcodec = 'none'
1892
1893         for i, media_el in enumerate(media_nodes):
1894             tbr = int_or_none(media_el.attrib.get('bitrate'))
1895             width = int_or_none(media_el.attrib.get('width'))
1896             height = int_or_none(media_el.attrib.get('height'))
1897             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1898             # If <bootstrapInfo> is present, the specified f4m is a
1899             # stream-level manifest, and only set-level manifests may refer to
1900             # external resources.  See section 11.4 and section 4 of F4M spec
1901             if bootstrap_info is None:
1902                 media_url = None
1903                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1904                 if manifest_version == '2.0':
1905                     media_url = media_el.attrib.get('href')
1906                 if media_url is None:
1907                     media_url = media_el.attrib.get('url')
1908                 if not media_url:
1909                     continue
1910                 manifest_url = (
1911                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1912                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1913                 # If media_url is itself a f4m manifest do the recursive extraction
1914                 # since bitrates in parent manifest (this one) and media_url manifest
1915                 # may differ leading to inability to resolve the format by requested
1916                 # bitrate in f4m downloader
1917                 ext = determine_ext(manifest_url)
1918                 if ext == 'f4m':
1919                     f4m_formats = self._extract_f4m_formats(
1920                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1921                         transform_source=transform_source, fatal=fatal)
1922                     # Sometimes stream-level manifest contains single media entry that
1923                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1924                     # At the same time parent's media entry in set-level manifest may
1925                     # contain it. We will copy it from parent in such cases.
1926                     if len(f4m_formats) == 1:
1927                         f = f4m_formats[0]
1928                         f.update({
1929                             'tbr': f.get('tbr') or tbr,
1930                             'width': f.get('width') or width,
1931                             'height': f.get('height') or height,
1932                             'format_id': f.get('format_id') if not tbr else format_id,
1933                             'vcodec': vcodec,
1934                         })
1935                     formats.extend(f4m_formats)
1936                     continue
1937                 elif ext == 'm3u8':
1938                     formats.extend(self._extract_m3u8_formats(
1939                         manifest_url, video_id, 'mp4', preference=preference,
1940                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1941                     continue
1942             formats.append({
1943                 'format_id': format_id,
1944                 'url': manifest_url,
1945                 'manifest_url': manifest_url,
1946                 'ext': 'flv' if bootstrap_info is not None else None,
1947                 'protocol': 'f4m',
1948                 'tbr': tbr,
1949                 'width': width,
1950                 'height': height,
1951                 'vcodec': vcodec,
1952                 'preference': preference,
1953                 'quality': quality,
1954             })
1955         return formats
1956
1957     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1958         return {
1959             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1960             'url': m3u8_url,
1961             'ext': ext,
1962             'protocol': 'm3u8',
1963             'preference': preference - 100 if preference else -100,
1964             'quality': quality,
1965             'resolution': 'multiple',
1966             'format_note': 'Quality selection URL',
1967         }
1968
1969     def _extract_m3u8_formats(self, *args, **kwargs):
1970         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
1971         if subs:
1972             self.report_warning(bug_reports_message(
1973                 "Ignoring subtitle tracks found in the HLS manifest; "
1974                 "if any subtitle tracks are missing,"
1975             ), only_once=True)
1976         return fmts
1977
1978     def _extract_m3u8_formats_and_subtitles(
1979             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
1980             preference=None, quality=None, m3u8_id=None, note=None,
1981             errnote=None, fatal=True, live=False, data=None, headers={},
1982             query={}):
1983
1984         res = self._download_webpage_handle(
1985             m3u8_url, video_id,
1986             note='Downloading m3u8 information' if note is None else note,
1987             errnote='Failed to download m3u8 information' if errnote is None else errnote,
1988             fatal=fatal, data=data, headers=headers, query=query)
1989
1990         if res is False:
1991             return [], {}
1992
1993         m3u8_doc, urlh = res
1994         m3u8_url = urlh.geturl()
1995
1996         return self._parse_m3u8_formats_and_subtitles(
1997             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1998             preference=preference, quality=quality, m3u8_id=m3u8_id,
1999             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2000             headers=headers, query=query, video_id=video_id)
2001
2002     def _parse_m3u8_formats_and_subtitles(
2003             self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8_native',
2004             preference=None, quality=None, m3u8_id=None, live=False, note=None,
2005             errnote=None, fatal=True, data=None, headers={}, query={},
2006             video_id=None):
2007         formats, subtitles = [], {}
2008
2009         if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
2010             return formats, subtitles
2011
2012         has_drm = re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc)
2013
2014         def format_url(url):
2015             return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
2016
2017         if self.get_param('hls_split_discontinuity', False):
2018             def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2019                 if not m3u8_doc:
2020                     if not manifest_url:
2021                         return []
2022                     m3u8_doc = self._download_webpage(
2023                         manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2024                         note=False, errnote='Failed to download m3u8 playlist information')
2025                     if m3u8_doc is False:
2026                         return []
2027                 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2028
2029         else:
2030             def _extract_m3u8_playlist_indices(*args, **kwargs):
2031                 return [None]
2032
2033         # References:
2034         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2035         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2036         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2037
2038         # We should try extracting formats only from master playlists [1, 4.3.4],
2039         # i.e. playlists that describe available qualities. On the other hand
2040         # media playlists [1, 4.3.3] should be returned as is since they contain
2041         # just the media without qualities renditions.
2042         # Fortunately, master playlist can be easily distinguished from media
2043         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
2044         # master playlist tags MUST NOT appear in a media playlist and vice versa.
2045         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2046         # media playlist and MUST NOT appear in master playlist thus we can
2047         # clearly detect media playlist with this criterion.
2048
2049         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
2050             formats = [{
2051                 'format_id': '-'.join(map(str, filter(None, [m3u8_id, idx]))),
2052                 'format_index': idx,
2053                 'url': m3u8_url,
2054                 'ext': ext,
2055                 'protocol': entry_protocol,
2056                 'preference': preference,
2057                 'quality': quality,
2058                 'has_drm': has_drm,
2059             } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2060
2061             return formats, subtitles
2062
2063         groups = {}
2064         last_stream_inf = {}
2065
2066         def extract_media(x_media_line):
2067             media = parse_m3u8_attributes(x_media_line)
2068             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2069             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2070             if not (media_type and group_id and name):
2071                 return
2072             groups.setdefault(group_id, []).append(media)
2073             # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2074             if media_type == 'SUBTITLES':
2075                 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2076                 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2077                 # However, lack of URI has been spotted in the wild.
2078                 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2079                 if not media.get('URI'):
2080                     return
2081                 url = format_url(media['URI'])
2082                 sub_info = {
2083                     'url': url,
2084                     'ext': determine_ext(url),
2085                 }
2086                 if sub_info['ext'] == 'm3u8':
2087                     # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2088                     # files may contain is WebVTT:
2089                     # <https://tools.ietf.org/html/rfc8216#section-3.1>
2090                     sub_info['ext'] = 'vtt'
2091                     sub_info['protocol'] = 'm3u8_native'
2092                 lang = media.get('LANGUAGE') or 'und'
2093                 subtitles.setdefault(lang, []).append(sub_info)
2094             if media_type not in ('VIDEO', 'AUDIO'):
2095                 return
2096             media_url = media.get('URI')
2097             if media_url:
2098                 manifest_url = format_url(media_url)
2099                 formats.extend({
2100                     'format_id': '-'.join(map(str, filter(None, (m3u8_id, group_id, name, idx)))),
2101                     'format_note': name,
2102                     'format_index': idx,
2103                     'url': manifest_url,
2104                     'manifest_url': m3u8_url,
2105                     'language': media.get('LANGUAGE'),
2106                     'ext': ext,
2107                     'protocol': entry_protocol,
2108                     'preference': preference,
2109                     'quality': quality,
2110                     'vcodec': 'none' if media_type == 'AUDIO' else None,
2111                 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2112
2113         def build_stream_name():
2114             # Despite specification does not mention NAME attribute for
2115             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2116             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2117             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2118             stream_name = last_stream_inf.get('NAME')
2119             if stream_name:
2120                 return stream_name
2121             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2122             # from corresponding rendition group
2123             stream_group_id = last_stream_inf.get('VIDEO')
2124             if not stream_group_id:
2125                 return
2126             stream_group = groups.get(stream_group_id)
2127             if not stream_group:
2128                 return stream_group_id
2129             rendition = stream_group[0]
2130             return rendition.get('NAME') or stream_group_id
2131
2132         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2133         # chance to detect video only formats when EXT-X-STREAM-INF tags
2134         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2135         for line in m3u8_doc.splitlines():
2136             if line.startswith('#EXT-X-MEDIA:'):
2137                 extract_media(line)
2138
2139         for line in m3u8_doc.splitlines():
2140             if line.startswith('#EXT-X-STREAM-INF:'):
2141                 last_stream_inf = parse_m3u8_attributes(line)
2142             elif line.startswith('#') or not line.strip():
2143                 continue
2144             else:
2145                 tbr = float_or_none(
2146                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2147                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2148                 manifest_url = format_url(line.strip())
2149
2150                 for idx in _extract_m3u8_playlist_indices(manifest_url):
2151                     format_id = [m3u8_id, None, idx]
2152                     # Bandwidth of live streams may differ over time thus making
2153                     # format_id unpredictable. So it's better to keep provided
2154                     # format_id intact.
2155                     if not live:
2156                         stream_name = build_stream_name()
2157                         format_id[1] = stream_name if stream_name else '%d' % (tbr if tbr else len(formats))
2158                     f = {
2159                         'format_id': '-'.join(map(str, filter(None, format_id))),
2160                         'format_index': idx,
2161                         'url': manifest_url,
2162                         'manifest_url': m3u8_url,
2163                         'tbr': tbr,
2164                         'ext': ext,
2165                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2166                         'protocol': entry_protocol,
2167                         'preference': preference,
2168                         'quality': quality,
2169                     }
2170                     resolution = last_stream_inf.get('RESOLUTION')
2171                     if resolution:
2172                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2173                         if mobj:
2174                             f['width'] = int(mobj.group('width'))
2175                             f['height'] = int(mobj.group('height'))
2176                     # Unified Streaming Platform
2177                     mobj = re.search(
2178                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2179                     if mobj:
2180                         abr, vbr = mobj.groups()
2181                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2182                         f.update({
2183                             'vbr': vbr,
2184                             'abr': abr,
2185                         })
2186                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2187                     f.update(codecs)
2188                     audio_group_id = last_stream_inf.get('AUDIO')
2189                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2190                     # references a rendition group MUST have a CODECS attribute.
2191                     # However, this is not always respected, for example, [2]
2192                     # contains EXT-X-STREAM-INF tag which references AUDIO
2193                     # rendition group but does not have CODECS and despite
2194                     # referencing an audio group it represents a complete
2195                     # (with audio and video) format. So, for such cases we will
2196                     # ignore references to rendition groups and treat them
2197                     # as complete formats.
2198                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2199                         audio_group = groups.get(audio_group_id)
2200                         if audio_group and audio_group[0].get('URI'):
2201                             # TODO: update acodec for audio only formats with
2202                             # the same GROUP-ID
2203                             f['acodec'] = 'none'
2204                     if not f.get('ext'):
2205                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2206                     formats.append(f)
2207
2208                     # for DailyMotion
2209                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2210                     if progressive_uri:
2211                         http_f = f.copy()
2212                         del http_f['manifest_url']
2213                         http_f.update({
2214                             'format_id': f['format_id'].replace('hls-', 'http-'),
2215                             'protocol': 'http',
2216                             'url': progressive_uri,
2217                         })
2218                         formats.append(http_f)
2219
2220                 last_stream_inf = {}
2221         return formats, subtitles
2222
2223     def _extract_m3u8_vod_duration(
2224             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2225
2226         m3u8_vod = self._download_webpage(
2227             m3u8_vod_url, video_id,
2228             note='Downloading m3u8 VOD manifest' if note is None else note,
2229             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2230             fatal=False, data=data, headers=headers, query=query)
2231
2232         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2233
2234     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2235         if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2236             return None
2237
2238         return int(sum(
2239             float(line[len('#EXTINF:'):].split(',')[0])
2240             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2241
2242     @staticmethod
2243     def _xpath_ns(path, namespace=None):
2244         if not namespace:
2245             return path
2246         out = []
2247         for c in path.split('/'):
2248             if not c or c == '.':
2249                 out.append(c)
2250             else:
2251                 out.append('{%s}%s' % (namespace, c))
2252         return '/'.join(out)
2253
2254     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2255         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2256
2257         if smil is False:
2258             assert not fatal
2259             return []
2260
2261         namespace = self._parse_smil_namespace(smil)
2262
2263         fmts = self._parse_smil_formats(
2264             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2265         subs = self._parse_smil_subtitles(
2266             smil, namespace=namespace)
2267
2268         return fmts, subs
2269
2270     def _extract_smil_formats(self, *args, **kwargs):
2271         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2272         if subs:
2273             self.report_warning(bug_reports_message(
2274                 "Ignoring subtitle tracks found in the SMIL manifest; "
2275                 "if any subtitle tracks are missing,"
2276             ), only_once=True)
2277         return fmts
2278
2279     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2280         smil = self._download_smil(smil_url, video_id, fatal=fatal)
2281         if smil is False:
2282             return {}
2283         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2284
2285     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2286         return self._download_xml(
2287             smil_url, video_id, 'Downloading SMIL file',
2288             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2289
2290     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2291         namespace = self._parse_smil_namespace(smil)
2292
2293         formats = self._parse_smil_formats(
2294             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2295         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2296
2297         video_id = os.path.splitext(url_basename(smil_url))[0]
2298         title = None
2299         description = None
2300         upload_date = None
2301         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2302             name = meta.attrib.get('name')
2303             content = meta.attrib.get('content')
2304             if not name or not content:
2305                 continue
2306             if not title and name == 'title':
2307                 title = content
2308             elif not description and name in ('description', 'abstract'):
2309                 description = content
2310             elif not upload_date and name == 'date':
2311                 upload_date = unified_strdate(content)
2312
2313         thumbnails = [{
2314             'id': image.get('type'),
2315             'url': image.get('src'),
2316             'width': int_or_none(image.get('width')),
2317             'height': int_or_none(image.get('height')),
2318         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2319
2320         return {
2321             'id': video_id,
2322             'title': title or video_id,
2323             'description': description,
2324             'upload_date': upload_date,
2325             'thumbnails': thumbnails,
2326             'formats': formats,
2327             'subtitles': subtitles,
2328         }
2329
2330     def _parse_smil_namespace(self, smil):
2331         return self._search_regex(
2332             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2333
2334     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2335         base = smil_url
2336         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2337             b = meta.get('base') or meta.get('httpBase')
2338             if b:
2339                 base = b
2340                 break
2341
2342         formats = []
2343         rtmp_count = 0
2344         http_count = 0
2345         m3u8_count = 0
2346         imgs_count = 0
2347
2348         srcs = set()
2349         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2350         for medium in media:
2351             src = medium.get('src')
2352             if not src or src in srcs:
2353                 continue
2354             srcs.add(src)
2355
2356             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2357             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2358             width = int_or_none(medium.get('width'))
2359             height = int_or_none(medium.get('height'))
2360             proto = medium.get('proto')
2361             ext = medium.get('ext')
2362             src_ext = determine_ext(src)
2363             streamer = medium.get('streamer') or base
2364
2365             if proto == 'rtmp' or streamer.startswith('rtmp'):
2366                 rtmp_count += 1
2367                 formats.append({
2368                     'url': streamer,
2369                     'play_path': src,
2370                     'ext': 'flv',
2371                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2372                     'tbr': bitrate,
2373                     'filesize': filesize,
2374                     'width': width,
2375                     'height': height,
2376                 })
2377                 if transform_rtmp_url:
2378                     streamer, src = transform_rtmp_url(streamer, src)
2379                     formats[-1].update({
2380                         'url': streamer,
2381                         'play_path': src,
2382                     })
2383                 continue
2384
2385             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2386             src_url = src_url.strip()
2387
2388             if proto == 'm3u8' or src_ext == 'm3u8':
2389                 m3u8_formats = self._extract_m3u8_formats(
2390                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2391                 if len(m3u8_formats) == 1:
2392                     m3u8_count += 1
2393                     m3u8_formats[0].update({
2394                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2395                         'tbr': bitrate,
2396                         'width': width,
2397                         'height': height,
2398                     })
2399                 formats.extend(m3u8_formats)
2400             elif src_ext == 'f4m':
2401                 f4m_url = src_url
2402                 if not f4m_params:
2403                     f4m_params = {
2404                         'hdcore': '3.2.0',
2405                         'plugin': 'flowplayer-3.2.0.1',
2406                     }
2407                 f4m_url += '&' if '?' in f4m_url else '?'
2408                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2409                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2410             elif src_ext == 'mpd':
2411                 formats.extend(self._extract_mpd_formats(
2412                     src_url, video_id, mpd_id='dash', fatal=False))
2413             elif re.search(r'\.ism/[Mm]anifest', src_url):
2414                 formats.extend(self._extract_ism_formats(
2415                     src_url, video_id, ism_id='mss', fatal=False))
2416             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2417                 http_count += 1
2418                 formats.append({
2419                     'url': src_url,
2420                     'ext': ext or src_ext or 'flv',
2421                     'format_id': 'http-%d' % (bitrate or http_count),
2422                     'tbr': bitrate,
2423                     'filesize': filesize,
2424                     'width': width,
2425                     'height': height,
2426                 })
2427
2428         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2429             src = medium.get('src')
2430             if not src or src in srcs:
2431                 continue
2432             srcs.add(src)
2433
2434             imgs_count += 1
2435             formats.append({
2436                 'format_id': 'imagestream-%d' % (imgs_count),
2437                 'url': src,
2438                 'ext': mimetype2ext(medium.get('type')),
2439                 'acodec': 'none',
2440                 'vcodec': 'none',
2441                 'width': int_or_none(medium.get('width')),
2442                 'height': int_or_none(medium.get('height')),
2443                 'format_note': 'SMIL storyboards',
2444             })
2445
2446         return formats
2447
2448     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2449         urls = []
2450         subtitles = {}
2451         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2452             src = textstream.get('src')
2453             if not src or src in urls:
2454                 continue
2455             urls.append(src)
2456             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2457             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2458             subtitles.setdefault(lang, []).append({
2459                 'url': src,
2460                 'ext': ext,
2461             })
2462         return subtitles
2463
2464     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2465         xspf = self._download_xml(
2466             xspf_url, playlist_id, 'Downloading xpsf playlist',
2467             'Unable to download xspf manifest', fatal=fatal)
2468         if xspf is False:
2469             return []
2470         return self._parse_xspf(
2471             xspf, playlist_id, xspf_url=xspf_url,
2472             xspf_base_url=base_url(xspf_url))
2473
2474     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2475         NS_MAP = {
2476             'xspf': 'http://xspf.org/ns/0/',
2477             's1': 'http://static.streamone.nl/player/ns/0',
2478         }
2479
2480         entries = []
2481         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2482             title = xpath_text(
2483                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2484             description = xpath_text(
2485                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2486             thumbnail = xpath_text(
2487                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2488             duration = float_or_none(
2489                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2490
2491             formats = []
2492             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2493                 format_url = urljoin(xspf_base_url, location.text)
2494                 if not format_url:
2495                     continue
2496                 formats.append({
2497                     'url': format_url,
2498                     'manifest_url': xspf_url,
2499                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2500                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2501                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2502                 })
2503             self._sort_formats(formats)
2504
2505             entries.append({
2506                 'id': playlist_id,
2507                 'title': title,
2508                 'description': description,
2509                 'thumbnail': thumbnail,
2510                 'duration': duration,
2511                 'formats': formats,
2512             })
2513         return entries
2514
2515     def _extract_mpd_formats(self, *args, **kwargs):
2516         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2517         if subs:
2518             self.report_warning(bug_reports_message(
2519                 "Ignoring subtitle tracks found in the DASH manifest; "
2520                 "if any subtitle tracks are missing,"
2521             ), only_once=True)
2522         return fmts
2523
2524     def _extract_mpd_formats_and_subtitles(
2525             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2526             fatal=True, data=None, headers={}, query={}):
2527         res = self._download_xml_handle(
2528             mpd_url, video_id,
2529             note='Downloading MPD manifest' if note is None else note,
2530             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2531             fatal=fatal, data=data, headers=headers, query=query)
2532         if res is False:
2533             return [], {}
2534         mpd_doc, urlh = res
2535         if mpd_doc is None:
2536             return [], {}
2537         mpd_base_url = base_url(urlh.geturl())
2538
2539         return self._parse_mpd_formats_and_subtitles(
2540             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2541
2542     def _parse_mpd_formats(self, *args, **kwargs):
2543         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2544         if subs:
2545             self.report_warning(bug_reports_message(
2546                 "Ignoring subtitle tracks found in the DASH manifest; "
2547                 "if any subtitle tracks are missing,"
2548             ), only_once=True)
2549         return fmts
2550
2551     def _parse_mpd_formats_and_subtitles(
2552             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2553         """
2554         Parse formats from MPD manifest.
2555         References:
2556          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2557             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2558          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2559         """
2560         if not self.get_param('dynamic_mpd', True):
2561             if mpd_doc.get('type') == 'dynamic':
2562                 return [], {}
2563
2564         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2565
2566         def _add_ns(path):
2567             return self._xpath_ns(path, namespace)
2568
2569         def is_drm_protected(element):
2570             return element.find(_add_ns('ContentProtection')) is not None
2571
2572         def extract_multisegment_info(element, ms_parent_info):
2573             ms_info = ms_parent_info.copy()
2574
2575             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2576             # common attributes and elements.  We will only extract relevant
2577             # for us.
2578             def extract_common(source):
2579                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2580                 if segment_timeline is not None:
2581                     s_e = segment_timeline.findall(_add_ns('S'))
2582                     if s_e:
2583                         ms_info['total_number'] = 0
2584                         ms_info['s'] = []
2585                         for s in s_e:
2586                             r = int(s.get('r', 0))
2587                             ms_info['total_number'] += 1 + r
2588                             ms_info['s'].append({
2589                                 't': int(s.get('t', 0)),
2590                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2591                                 'd': int(s.attrib['d']),
2592                                 'r': r,
2593                             })
2594                 start_number = source.get('startNumber')
2595                 if start_number:
2596                     ms_info['start_number'] = int(start_number)
2597                 timescale = source.get('timescale')
2598                 if timescale:
2599                     ms_info['timescale'] = int(timescale)
2600                 segment_duration = source.get('duration')
2601                 if segment_duration:
2602                     ms_info['segment_duration'] = float(segment_duration)
2603
2604             def extract_Initialization(source):
2605                 initialization = source.find(_add_ns('Initialization'))
2606                 if initialization is not None:
2607                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2608
2609             segment_list = element.find(_add_ns('SegmentList'))
2610             if segment_list is not None:
2611                 extract_common(segment_list)
2612                 extract_Initialization(segment_list)
2613                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2614                 if segment_urls_e:
2615                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2616             else:
2617                 segment_template = element.find(_add_ns('SegmentTemplate'))
2618                 if segment_template is not None:
2619                     extract_common(segment_template)
2620                     media = segment_template.get('media')
2621                     if media:
2622                         ms_info['media'] = media
2623                     initialization = segment_template.get('initialization')
2624                     if initialization:
2625                         ms_info['initialization'] = initialization
2626                     else:
2627                         extract_Initialization(segment_template)
2628             return ms_info
2629
2630         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2631         formats, subtitles = [], {}
2632         stream_numbers = {'audio': 0, 'video': 0}
2633         for period in mpd_doc.findall(_add_ns('Period')):
2634             period_duration = parse_duration(period.get('duration')) or mpd_duration
2635             period_ms_info = extract_multisegment_info(period, {
2636                 'start_number': 1,
2637                 'timescale': 1,
2638             })
2639             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2640                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2641                 for representation in adaptation_set.findall(_add_ns('Representation')):
2642                     representation_attrib = adaptation_set.attrib.copy()
2643                     representation_attrib.update(representation.attrib)
2644                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2645                     mime_type = representation_attrib['mimeType']
2646                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2647
2648                     codecs = representation_attrib.get('codecs', '')
2649                     if content_type not in ('video', 'audio', 'text'):
2650                         if mime_type == 'image/jpeg':
2651                             content_type = mime_type
2652                         elif codecs.split('.')[0] == 'stpp':
2653                             content_type = 'text'
2654                         else:
2655                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2656                             continue
2657
2658                     base_url = ''
2659                     for element in (representation, adaptation_set, period, mpd_doc):
2660                         base_url_e = element.find(_add_ns('BaseURL'))
2661                         if base_url_e is not None:
2662                             base_url = base_url_e.text + base_url
2663                             if re.match(r'^https?://', base_url):
2664                                 break
2665                     if mpd_base_url and base_url.startswith('/'):
2666                         base_url = compat_urlparse.urljoin(mpd_base_url, base_url)
2667                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2668                         if not mpd_base_url.endswith('/'):
2669                             mpd_base_url += '/'
2670                         base_url = mpd_base_url + base_url
2671                     representation_id = representation_attrib.get('id')
2672                     lang = representation_attrib.get('lang')
2673                     url_el = representation.find(_add_ns('BaseURL'))
2674                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2675                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2676                     if representation_id is not None:
2677                         format_id = representation_id
2678                     else:
2679                         format_id = content_type
2680                     if mpd_id:
2681                         format_id = mpd_id + '-' + format_id
2682                     if content_type in ('video', 'audio'):
2683                         f = {
2684                             'format_id': format_id,
2685                             'manifest_url': mpd_url,
2686                             'ext': mimetype2ext(mime_type),
2687                             'width': int_or_none(representation_attrib.get('width')),
2688                             'height': int_or_none(representation_attrib.get('height')),
2689                             'tbr': float_or_none(bandwidth, 1000),
2690                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2691                             'fps': int_or_none(representation_attrib.get('frameRate')),
2692                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2693                             'format_note': 'DASH %s' % content_type,
2694                             'filesize': filesize,
2695                             'container': mimetype2ext(mime_type) + '_dash',
2696                             'manifest_stream_number': stream_numbers[content_type]
2697                         }
2698                         f.update(parse_codecs(codecs))
2699                         stream_numbers[content_type] += 1
2700                     elif content_type == 'text':
2701                         f = {
2702                             'ext': mimetype2ext(mime_type),
2703                             'manifest_url': mpd_url,
2704                             'filesize': filesize,
2705                         }
2706                     elif content_type == 'image/jpeg':
2707                         # See test case in VikiIE
2708                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2709                         f = {
2710                             'format_id': format_id,
2711                             'ext': 'mhtml',
2712                             'manifest_url': mpd_url,
2713                             'format_note': 'DASH storyboards (jpeg)',
2714                             'acodec': 'none',
2715                             'vcodec': 'none',
2716                         }
2717                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2718                         f['has_drm'] = True
2719                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2720
2721                     def prepare_template(template_name, identifiers):
2722                         tmpl = representation_ms_info[template_name]
2723                         # First of, % characters outside $...$ templates
2724                         # must be escaped by doubling for proper processing
2725                         # by % operator string formatting used further (see
2726                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2727                         t = ''
2728                         in_template = False
2729                         for c in tmpl:
2730                             t += c
2731                             if c == '$':
2732                                 in_template = not in_template
2733                             elif c == '%' and not in_template:
2734                                 t += c
2735                         # Next, $...$ templates are translated to their
2736                         # %(...) counterparts to be used with % operator
2737                         if representation_id is not None:
2738                             t = t.replace('$RepresentationID$', representation_id)
2739                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2740                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2741                         t.replace('$$', '$')
2742                         return t
2743
2744                     # @initialization is a regular template like @media one
2745                     # so it should be handled just the same way (see
2746                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2747                     if 'initialization' in representation_ms_info:
2748                         initialization_template = prepare_template(
2749                             'initialization',
2750                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2751                             # $Time$ shall not be included for @initialization thus
2752                             # only $Bandwidth$ remains
2753                             ('Bandwidth', ))
2754                         representation_ms_info['initialization_url'] = initialization_template % {
2755                             'Bandwidth': bandwidth,
2756                         }
2757
2758                     def location_key(location):
2759                         return 'url' if re.match(r'^https?://', location) else 'path'
2760
2761                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2762
2763                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2764                         media_location_key = location_key(media_template)
2765
2766                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2767                         # can't be used at the same time
2768                         if '%(Number' in media_template and 's' not in representation_ms_info:
2769                             segment_duration = None
2770                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2771                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2772                                 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2773                             representation_ms_info['fragments'] = [{
2774                                 media_location_key: media_template % {
2775                                     'Number': segment_number,
2776                                     'Bandwidth': bandwidth,
2777                                 },
2778                                 'duration': segment_duration,
2779                             } for segment_number in range(
2780                                 representation_ms_info['start_number'],
2781                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2782                         else:
2783                             # $Number*$ or $Time$ in media template with S list available
2784                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2785                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2786                             representation_ms_info['fragments'] = []
2787                             segment_time = 0
2788                             segment_d = None
2789                             segment_number = representation_ms_info['start_number']
2790
2791                             def add_segment_url():
2792                                 segment_url = media_template % {
2793                                     'Time': segment_time,
2794                                     'Bandwidth': bandwidth,
2795                                     'Number': segment_number,
2796                                 }
2797                                 representation_ms_info['fragments'].append({
2798                                     media_location_key: segment_url,
2799                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2800                                 })
2801
2802                             for num, s in enumerate(representation_ms_info['s']):
2803                                 segment_time = s.get('t') or segment_time
2804                                 segment_d = s['d']
2805                                 add_segment_url()
2806                                 segment_number += 1
2807                                 for r in range(s.get('r', 0)):
2808                                     segment_time += segment_d
2809                                     add_segment_url()
2810                                     segment_number += 1
2811                                 segment_time += segment_d
2812                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2813                         # No media template
2814                         # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2815                         # or any YouTube dashsegments video
2816                         fragments = []
2817                         segment_index = 0
2818                         timescale = representation_ms_info['timescale']
2819                         for s in representation_ms_info['s']:
2820                             duration = float_or_none(s['d'], timescale)
2821                             for r in range(s.get('r', 0) + 1):
2822                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
2823                                 fragments.append({
2824                                     location_key(segment_uri): segment_uri,
2825                                     'duration': duration,
2826                                 })
2827                                 segment_index += 1
2828                         representation_ms_info['fragments'] = fragments
2829                     elif 'segment_urls' in representation_ms_info:
2830                         # Segment URLs with no SegmentTimeline
2831                         # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2832                         # https://github.com/ytdl-org/youtube-dl/pull/14844
2833                         fragments = []
2834                         segment_duration = float_or_none(
2835                             representation_ms_info['segment_duration'],
2836                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2837                         for segment_url in representation_ms_info['segment_urls']:
2838                             fragment = {
2839                                 location_key(segment_url): segment_url,
2840                             }
2841                             if segment_duration:
2842                                 fragment['duration'] = segment_duration
2843                             fragments.append(fragment)
2844                         representation_ms_info['fragments'] = fragments
2845                     # If there is a fragments key available then we correctly recognized fragmented media.
2846                     # Otherwise we will assume unfragmented media with direct access. Technically, such
2847                     # assumption is not necessarily correct since we may simply have no support for
2848                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2849                     if 'fragments' in representation_ms_info:
2850                         f.update({
2851                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
2852                             'url': mpd_url or base_url,
2853                             'fragment_base_url': base_url,
2854                             'fragments': [],
2855                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2856                         })
2857                         if 'initialization_url' in representation_ms_info:
2858                             initialization_url = representation_ms_info['initialization_url']
2859                             if not f.get('url'):
2860                                 f['url'] = initialization_url
2861                             f['fragments'].append({location_key(initialization_url): initialization_url})
2862                         f['fragments'].extend(representation_ms_info['fragments'])
2863                     else:
2864                         # Assuming direct URL to unfragmented media.
2865                         f['url'] = base_url
2866                     if content_type in ('video', 'audio') or mime_type == 'image/jpeg':
2867                         formats.append(f)
2868                     elif content_type == 'text':
2869                         subtitles.setdefault(lang or 'und', []).append(f)
2870
2871         return formats, subtitles
2872
2873     def _extract_ism_formats(self, *args, **kwargs):
2874         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2875         if subs:
2876             self.report_warning(bug_reports_message(
2877                 "Ignoring subtitle tracks found in the ISM manifest; "
2878                 "if any subtitle tracks are missing,"
2879             ))
2880         return fmts
2881
2882     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2883         res = self._download_xml_handle(
2884             ism_url, video_id,
2885             note='Downloading ISM manifest' if note is None else note,
2886             errnote='Failed to download ISM manifest' if errnote is None else errnote,
2887             fatal=fatal, data=data, headers=headers, query=query)
2888         if res is False:
2889             return [], {}
2890         ism_doc, urlh = res
2891         if ism_doc is None:
2892             return [], {}
2893
2894         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
2895
2896     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
2897         """
2898         Parse formats from ISM manifest.
2899         References:
2900          1. [MS-SSTR]: Smooth Streaming Protocol,
2901             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2902         """
2903         if ism_doc.get('IsLive') == 'TRUE':
2904             return [], {}
2905
2906         duration = int(ism_doc.attrib['Duration'])
2907         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2908
2909         formats = []
2910         subtitles = {}
2911         for stream in ism_doc.findall('StreamIndex'):
2912             stream_type = stream.get('Type')
2913             if stream_type not in ('video', 'audio', 'text'):
2914                 continue
2915             url_pattern = stream.attrib['Url']
2916             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2917             stream_name = stream.get('Name')
2918             stream_language = stream.get('Language', 'und')
2919             for track in stream.findall('QualityLevel'):
2920                 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
2921                 # TODO: add support for WVC1 and WMAP
2922                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
2923                     self.report_warning('%s is not a supported codec' % fourcc)
2924                     continue
2925                 tbr = int(track.attrib['Bitrate']) // 1000
2926                 # [1] does not mention Width and Height attributes. However,
2927                 # they're often present while MaxWidth and MaxHeight are
2928                 # missing, so should be used as fallbacks
2929                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2930                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2931                 sampling_rate = int_or_none(track.get('SamplingRate'))
2932
2933                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2934                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2935
2936                 fragments = []
2937                 fragment_ctx = {
2938                     'time': 0,
2939                 }
2940                 stream_fragments = stream.findall('c')
2941                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2942                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2943                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2944                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2945                     if not fragment_ctx['duration']:
2946                         try:
2947                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2948                         except IndexError:
2949                             next_fragment_time = duration
2950                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2951                     for _ in range(fragment_repeat):
2952                         fragments.append({
2953                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2954                             'duration': fragment_ctx['duration'] / stream_timescale,
2955                         })
2956                         fragment_ctx['time'] += fragment_ctx['duration']
2957
2958                 format_id = []
2959                 if ism_id:
2960                     format_id.append(ism_id)
2961                 if stream_name:
2962                     format_id.append(stream_name)
2963                 format_id.append(compat_str(tbr))
2964
2965                 if stream_type == 'text':
2966                     subtitles.setdefault(stream_language, []).append({
2967                         'ext': 'ismt',
2968                         'protocol': 'ism',
2969                         'url': ism_url,
2970                         'manifest_url': ism_url,
2971                         'fragments': fragments,
2972                         '_download_params': {
2973                             'stream_type': stream_type,
2974                             'duration': duration,
2975                             'timescale': stream_timescale,
2976                             'fourcc': fourcc,
2977                             'language': stream_language,
2978                             'codec_private_data': track.get('CodecPrivateData'),
2979                         }
2980                     })
2981                 elif stream_type in ('video', 'audio'):
2982                     formats.append({
2983                         'format_id': '-'.join(format_id),
2984                         'url': ism_url,
2985                         'manifest_url': ism_url,
2986                         'ext': 'ismv' if stream_type == 'video' else 'isma',
2987                         'width': width,
2988                         'height': height,
2989                         'tbr': tbr,
2990                         'asr': sampling_rate,
2991                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
2992                         'acodec': 'none' if stream_type == 'video' else fourcc,
2993                         'protocol': 'ism',
2994                         'fragments': fragments,
2995                         'has_drm': ism_doc.find('Protection') is not None,
2996                         '_download_params': {
2997                             'stream_type': stream_type,
2998                             'duration': duration,
2999                             'timescale': stream_timescale,
3000                             'width': width or 0,
3001                             'height': height or 0,
3002                             'fourcc': fourcc,
3003                             'language': stream_language,
3004                             'codec_private_data': track.get('CodecPrivateData'),
3005                             'sampling_rate': sampling_rate,
3006                             'channels': int_or_none(track.get('Channels', 2)),
3007                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3008                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3009                         },
3010                     })
3011         return formats, subtitles
3012
3013     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
3014         def absolute_url(item_url):
3015             return urljoin(base_url, item_url)
3016
3017         def parse_content_type(content_type):
3018             if not content_type:
3019                 return {}
3020             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3021             if ctr:
3022                 mimetype, codecs = ctr.groups()
3023                 f = parse_codecs(codecs)
3024                 f['ext'] = mimetype2ext(mimetype)
3025                 return f
3026             return {}
3027
3028         def _media_formats(src, cur_media_type, type_info={}):
3029             full_url = absolute_url(src)
3030             ext = type_info.get('ext') or determine_ext(full_url)
3031             if ext == 'm3u8':
3032                 is_plain_url = False
3033                 formats = self._extract_m3u8_formats(
3034                     full_url, video_id, ext='mp4',
3035                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3036                     preference=preference, quality=quality, fatal=False)
3037             elif ext == 'mpd':
3038                 is_plain_url = False
3039                 formats = self._extract_mpd_formats(
3040                     full_url, video_id, mpd_id=mpd_id, fatal=False)
3041             else:
3042                 is_plain_url = True
3043                 formats = [{
3044                     'url': full_url,
3045                     'vcodec': 'none' if cur_media_type == 'audio' else None,
3046                 }]
3047             return is_plain_url, formats
3048
3049         entries = []
3050         # amp-video and amp-audio are very similar to their HTML5 counterparts
3051         # so we wll include them right here (see
3052         # https://www.ampproject.org/docs/reference/components/amp-video)
3053         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3054         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3055         media_tags = [(media_tag, media_tag_name, media_type, '')
3056                       for media_tag, media_tag_name, media_type
3057                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3058         media_tags.extend(re.findall(
3059             # We only allow video|audio followed by a whitespace or '>'.
3060             # Allowing more characters may end up in significant slow down (see
3061             # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
3062             # http://www.porntrex.com/maps/videositemap.xml).
3063             r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3064         for media_tag, _, media_type, media_content in media_tags:
3065             media_info = {
3066                 'formats': [],
3067                 'subtitles': {},
3068             }
3069             media_attributes = extract_attributes(media_tag)
3070             src = strip_or_none(media_attributes.get('src'))
3071             if src:
3072                 _, formats = _media_formats(src, media_type)
3073                 media_info['formats'].extend(formats)
3074             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3075             if media_content:
3076                 for source_tag in re.findall(r'<source[^>]+>', media_content):
3077                     s_attr = extract_attributes(source_tag)
3078                     # data-video-src and data-src are non standard but seen
3079                     # several times in the wild
3080                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
3081                     if not src:
3082                         continue
3083                     f = parse_content_type(s_attr.get('type'))
3084                     is_plain_url, formats = _media_formats(src, media_type, f)
3085                     if is_plain_url:
3086                         # width, height, res, label and title attributes are
3087                         # all not standard but seen several times in the wild
3088                         labels = [
3089                             s_attr.get(lbl)
3090                             for lbl in ('label', 'title')
3091                             if str_or_none(s_attr.get(lbl))
3092                         ]
3093                         width = int_or_none(s_attr.get('width'))
3094                         height = (int_or_none(s_attr.get('height'))
3095                                   or int_or_none(s_attr.get('res')))
3096                         if not width or not height:
3097                             for lbl in labels:
3098                                 resolution = parse_resolution(lbl)
3099                                 if not resolution:
3100                                     continue
3101                                 width = width or resolution.get('width')
3102                                 height = height or resolution.get('height')
3103                         for lbl in labels:
3104                             tbr = parse_bitrate(lbl)
3105                             if tbr:
3106                                 break
3107                         else:
3108                             tbr = None
3109                         f.update({
3110                             'width': width,
3111                             'height': height,
3112                             'tbr': tbr,
3113                             'format_id': s_attr.get('label') or s_attr.get('title'),
3114                         })
3115                         f.update(formats[0])
3116                         media_info['formats'].append(f)
3117                     else:
3118                         media_info['formats'].extend(formats)
3119                 for track_tag in re.findall(r'<track[^>]+>', media_content):
3120                     track_attributes = extract_attributes(track_tag)
3121                     kind = track_attributes.get('kind')
3122                     if not kind or kind in ('subtitles', 'captions'):
3123                         src = strip_or_none(track_attributes.get('src'))
3124                         if not src:
3125                             continue
3126                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3127                         media_info['subtitles'].setdefault(lang, []).append({
3128                             'url': absolute_url(src),
3129                         })
3130             for f in media_info['formats']:
3131                 f.setdefault('http_headers', {})['Referer'] = base_url
3132             if media_info['formats'] or media_info['subtitles']:
3133                 entries.append(media_info)
3134         return entries
3135
3136     def _extract_akamai_formats(self, *args, **kwargs):
3137         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3138         if subs:
3139             self.report_warning(bug_reports_message(
3140                 "Ignoring subtitle tracks found in the manifests; "
3141                 "if any subtitle tracks are missing,"
3142             ))
3143         return fmts
3144
3145     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3146         signed = 'hdnea=' in manifest_url
3147         if not signed:
3148             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3149             manifest_url = re.sub(
3150                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3151                 '', manifest_url).strip('?')
3152
3153         formats = []
3154         subtitles = {}
3155
3156         hdcore_sign = 'hdcore=3.7.0'
3157         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3158         hds_host = hosts.get('hds')
3159         if hds_host:
3160             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3161         if 'hdcore=' not in f4m_url:
3162             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3163         f4m_formats = self._extract_f4m_formats(
3164             f4m_url, video_id, f4m_id='hds', fatal=False)
3165         for entry in f4m_formats:
3166             entry.update({'extra_param_to_segment_url': hdcore_sign})
3167         formats.extend(f4m_formats)
3168
3169         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3170         hls_host = hosts.get('hls')
3171         if hls_host:
3172             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3173         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3174             m3u8_url, video_id, 'mp4', 'm3u8_native',
3175             m3u8_id='hls', fatal=False)
3176         formats.extend(m3u8_formats)
3177         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3178
3179         http_host = hosts.get('http')
3180         if http_host and m3u8_formats and not signed:
3181             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3182             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3183             qualities_length = len(qualities)
3184             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3185                 i = 0
3186                 for f in m3u8_formats:
3187                     if f['vcodec'] != 'none':
3188                         for protocol in ('http', 'https'):
3189                             http_f = f.copy()
3190                             del http_f['manifest_url']
3191                             http_url = re.sub(
3192                                 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
3193                             http_f.update({
3194                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3195                                 'url': http_url,
3196                                 'protocol': protocol,
3197                             })
3198                             formats.append(http_f)
3199                         i += 1
3200
3201         return formats, subtitles
3202
3203     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3204         query = compat_urlparse.urlparse(url).query
3205         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3206         mobj = re.search(
3207             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3208         url_base = mobj.group('url')
3209         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3210         formats = []
3211
3212         def manifest_url(manifest):
3213             m_url = '%s/%s' % (http_base_url, manifest)
3214             if query:
3215                 m_url += '?%s' % query
3216             return m_url
3217
3218         if 'm3u8' not in skip_protocols:
3219             formats.extend(self._extract_m3u8_formats(
3220                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3221                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3222         if 'f4m' not in skip_protocols:
3223             formats.extend(self._extract_f4m_formats(
3224                 manifest_url('manifest.f4m'),
3225                 video_id, f4m_id='hds', fatal=False))
3226         if 'dash' not in skip_protocols:
3227             formats.extend(self._extract_mpd_formats(
3228                 manifest_url('manifest.mpd'),
3229                 video_id, mpd_id='dash', fatal=False))
3230         if re.search(r'(?:/smil:|\.smil)', url_base):
3231             if 'smil' not in skip_protocols:
3232                 rtmp_formats = self._extract_smil_formats(
3233                     manifest_url('jwplayer.smil'),
3234                     video_id, fatal=False)
3235                 for rtmp_format in rtmp_formats:
3236                     rtsp_format = rtmp_format.copy()
3237                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3238                     del rtsp_format['play_path']
3239                     del rtsp_format['ext']
3240                     rtsp_format.update({
3241                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3242                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3243                         'protocol': 'rtsp',
3244                     })
3245                     formats.extend([rtmp_format, rtsp_format])
3246         else:
3247             for protocol in ('rtmp', 'rtsp'):
3248                 if protocol not in skip_protocols:
3249                     formats.append({
3250                         'url': '%s:%s' % (protocol, url_base),
3251                         'format_id': protocol,
3252                         'protocol': protocol,
3253                     })
3254         return formats
3255
3256     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3257         mobj = re.search(
3258             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3259             webpage)
3260         if mobj:
3261             try:
3262                 jwplayer_data = self._parse_json(mobj.group('options'),
3263                                                  video_id=video_id,
3264                                                  transform_source=transform_source)
3265             except ExtractorError:
3266                 pass
3267             else:
3268                 if isinstance(jwplayer_data, dict):
3269                     return jwplayer_data
3270
3271     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3272         jwplayer_data = self._find_jwplayer_data(
3273             webpage, video_id, transform_source=js_to_json)
3274         return self._parse_jwplayer_data(
3275             jwplayer_data, video_id, *args, **kwargs)
3276
3277     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3278                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3279         # JWPlayer backward compatibility: flattened playlists
3280         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3281         if 'playlist' not in jwplayer_data:
3282             jwplayer_data = {'playlist': [jwplayer_data]}
3283
3284         entries = []
3285
3286         # JWPlayer backward compatibility: single playlist item
3287         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3288         if not isinstance(jwplayer_data['playlist'], list):
3289             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3290
3291         for video_data in jwplayer_data['playlist']:
3292             # JWPlayer backward compatibility: flattened sources
3293             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3294             if 'sources' not in video_data:
3295                 video_data['sources'] = [video_data]
3296
3297             this_video_id = video_id or video_data['mediaid']
3298
3299             formats = self._parse_jwplayer_formats(
3300                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3301                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3302
3303             subtitles = {}
3304             tracks = video_data.get('tracks')
3305             if tracks and isinstance(tracks, list):
3306                 for track in tracks:
3307                     if not isinstance(track, dict):
3308                         continue
3309                     track_kind = track.get('kind')
3310                     if not track_kind or not isinstance(track_kind, compat_str):
3311                         continue
3312                     if track_kind.lower() not in ('captions', 'subtitles'):
3313                         continue
3314                     track_url = urljoin(base_url, track.get('file'))
3315                     if not track_url:
3316                         continue
3317                     subtitles.setdefault(track.get('label') or 'en', []).append({
3318                         'url': self._proto_relative_url(track_url)
3319                     })
3320
3321             entry = {
3322                 'id': this_video_id,
3323                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3324                 'description': clean_html(video_data.get('description')),
3325                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3326                 'timestamp': int_or_none(video_data.get('pubdate')),
3327                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3328                 'subtitles': subtitles,
3329             }
3330             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3331             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3332                 entry.update({
3333                     '_type': 'url_transparent',
3334                     'url': formats[0]['url'],
3335                 })
3336             else:
3337                 self._sort_formats(formats)
3338                 entry['formats'] = formats
3339             entries.append(entry)
3340         if len(entries) == 1:
3341             return entries[0]
3342         else:
3343             return self.playlist_result(entries)
3344
3345     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3346                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3347         urls = []
3348         formats = []
3349         for source in jwplayer_sources_data:
3350             if not isinstance(source, dict):
3351                 continue
3352             source_url = urljoin(
3353                 base_url, self._proto_relative_url(source.get('file')))
3354             if not source_url or source_url in urls:
3355                 continue
3356             urls.append(source_url)
3357             source_type = source.get('type') or ''
3358             ext = mimetype2ext(source_type) or determine_ext(source_url)
3359             if source_type == 'hls' or ext == 'm3u8':
3360                 formats.extend(self._extract_m3u8_formats(
3361                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3362                     m3u8_id=m3u8_id, fatal=False))
3363             elif source_type == 'dash' or ext == 'mpd':
3364                 formats.extend(self._extract_mpd_formats(
3365                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3366             elif ext == 'smil':
3367                 formats.extend(self._extract_smil_formats(
3368                     source_url, video_id, fatal=False))
3369             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3370             elif source_type.startswith('audio') or ext in (
3371                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3372                 formats.append({
3373                     'url': source_url,
3374                     'vcodec': 'none',
3375                     'ext': ext,
3376                 })
3377             else:
3378                 height = int_or_none(source.get('height'))
3379                 if height is None:
3380                     # Often no height is provided but there is a label in
3381                     # format like "1080p", "720p SD", or 1080.
3382                     height = int_or_none(self._search_regex(
3383                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3384                         'height', default=None))
3385                 a_format = {
3386                     'url': source_url,
3387                     'width': int_or_none(source.get('width')),
3388                     'height': height,
3389                     'tbr': int_or_none(source.get('bitrate')),
3390                     'ext': ext,
3391                 }
3392                 if source_url.startswith('rtmp'):
3393                     a_format['ext'] = 'flv'
3394                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3395                     # of jwplayer.flash.swf
3396                     rtmp_url_parts = re.split(
3397                         r'((?:mp4|mp3|flv):)', source_url, 1)
3398                     if len(rtmp_url_parts) == 3:
3399                         rtmp_url, prefix, play_path = rtmp_url_parts
3400                         a_format.update({
3401                             'url': rtmp_url,
3402                             'play_path': prefix + play_path,
3403                         })
3404                     if rtmp_params:
3405                         a_format.update(rtmp_params)
3406                 formats.append(a_format)
3407         return formats
3408
3409     def _live_title(self, name):
3410         """ Generate the title for a live video """
3411         now = datetime.datetime.now()
3412         now_str = now.strftime('%Y-%m-%d %H:%M')
3413         return name + ' ' + now_str
3414
3415     def _int(self, v, name, fatal=False, **kwargs):
3416         res = int_or_none(v, **kwargs)
3417         if 'get_attr' in kwargs:
3418             print(getattr(v, kwargs['get_attr']))
3419         if res is None:
3420             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3421             if fatal:
3422                 raise ExtractorError(msg)
3423             else:
3424                 self.report_warning(msg)
3425         return res
3426
3427     def _float(self, v, name, fatal=False, **kwargs):
3428         res = float_or_none(v, **kwargs)
3429         if res is None:
3430             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3431             if fatal:
3432                 raise ExtractorError(msg)
3433             else:
3434                 self.report_warning(msg)
3435         return res
3436
3437     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3438                     path='/', secure=False, discard=False, rest={}, **kwargs):
3439         cookie = compat_cookiejar_Cookie(
3440             0, name, value, port, port is not None, domain, True,
3441             domain.startswith('.'), path, True, secure, expire_time,
3442             discard, None, None, rest)
3443         self._downloader.cookiejar.set_cookie(cookie)
3444
3445     def _get_cookies(self, url):
3446         """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3447         req = sanitized_Request(url)
3448         self._downloader.cookiejar.add_cookie_header(req)
3449         return compat_cookies_SimpleCookie(req.get_header('Cookie'))
3450
3451     def _apply_first_set_cookie_header(self, url_handle, cookie):
3452         """
3453         Apply first Set-Cookie header instead of the last. Experimental.
3454
3455         Some sites (e.g. [1-3]) may serve two cookies under the same name
3456         in Set-Cookie header and expect the first (old) one to be set rather
3457         than second (new). However, as of RFC6265 the newer one cookie
3458         should be set into cookie store what actually happens.
3459         We will workaround this issue by resetting the cookie to
3460         the first one manually.
3461         1. https://new.vk.com/
3462         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3463         3. https://learning.oreilly.com/
3464         """
3465         for header, cookies in url_handle.headers.items():
3466             if header.lower() != 'set-cookie':
3467                 continue
3468             if sys.version_info[0] >= 3:
3469                 cookies = cookies.encode('iso-8859-1')
3470             cookies = cookies.decode('utf-8')
3471             cookie_value = re.search(
3472                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3473             if cookie_value:
3474                 value, domain = cookie_value.groups()
3475                 self._set_cookie(domain, cookie, value)
3476                 break
3477
3478     def get_testcases(self, include_onlymatching=False):
3479         t = getattr(self, '_TEST', None)
3480         if t:
3481             assert not hasattr(self, '_TESTS'), \
3482                 '%s has _TEST and _TESTS' % type(self).__name__
3483             tests = [t]
3484         else:
3485             tests = getattr(self, '_TESTS', [])
3486         for t in tests:
3487             if not include_onlymatching and t.get('only_matching', False):
3488                 continue
3489             t['name'] = type(self).__name__[:-len('IE')]
3490             yield t
3491
3492     def is_suitable(self, age_limit):
3493         """ Test whether the extractor is generally suitable for the given
3494         age limit (i.e. pornographic sites are not, all others usually are) """
3495
3496         any_restricted = False
3497         for tc in self.get_testcases(include_onlymatching=False):
3498             if tc.get('playlist', []):
3499                 tc = tc['playlist'][0]
3500             is_restricted = age_restricted(
3501                 tc.get('info_dict', {}).get('age_limit'), age_limit)
3502             if not is_restricted:
3503                 return True
3504             any_restricted = any_restricted or is_restricted
3505         return not any_restricted
3506
3507     def extract_subtitles(self, *args, **kwargs):
3508         if (self.get_param('writesubtitles', False)
3509                 or self.get_param('listsubtitles')):
3510             return self._get_subtitles(*args, **kwargs)
3511         return {}
3512
3513     def _get_subtitles(self, *args, **kwargs):
3514         raise NotImplementedError('This method must be implemented by subclasses')
3515
3516     @staticmethod
3517     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3518         """ Merge subtitle items for one language. Items with duplicated URLs
3519         will be dropped. """
3520         list1_urls = set([item['url'] for item in subtitle_list1])
3521         ret = list(subtitle_list1)
3522         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
3523         return ret
3524
3525     @classmethod
3526     def _merge_subtitles(cls, *dicts, target=None):
3527         """ Merge subtitle dictionaries, language by language. """
3528         if target is None:
3529             target = {}
3530         for d in dicts:
3531             for lang, subs in d.items():
3532                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3533         return target
3534
3535     def extract_automatic_captions(self, *args, **kwargs):
3536         if (self.get_param('writeautomaticsub', False)
3537                 or self.get_param('listsubtitles')):
3538             return self._get_automatic_captions(*args, **kwargs)
3539         return {}
3540
3541     def _get_automatic_captions(self, *args, **kwargs):
3542         raise NotImplementedError('This method must be implemented by subclasses')
3543
3544     def mark_watched(self, *args, **kwargs):
3545         if not self.get_param('mark_watched', False):
3546             return
3547         if (self._get_login_info()[0] is not None
3548                 or self.get_param('cookiefile')
3549                 or self.get_param('cookiesfrombrowser')):
3550             self._mark_watched(*args, **kwargs)
3551
3552     def _mark_watched(self, *args, **kwargs):
3553         raise NotImplementedError('This method must be implemented by subclasses')
3554
3555     def geo_verification_headers(self):
3556         headers = {}
3557         geo_verification_proxy = self.get_param('geo_verification_proxy')
3558         if geo_verification_proxy:
3559             headers['Ytdl-request-proxy'] = geo_verification_proxy
3560         return headers
3561
3562     def _generic_id(self, url):
3563         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3564
3565     def _generic_title(self, url):
3566         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3567
3568     @staticmethod
3569     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3570         all_known = all(map(
3571             lambda x: x is not None,
3572             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3573         return (
3574             'private' if is_private
3575             else 'premium_only' if needs_premium
3576             else 'subscriber_only' if needs_subscription
3577             else 'needs_auth' if needs_auth
3578             else 'unlisted' if is_unlisted
3579             else 'public' if all_known
3580             else None)
3581
3582     def _configuration_arg(self, key, default=NO_DEFAULT, casesense=False):
3583         '''
3584         @returns            A list of values for the extractor argument given by "key"
3585                             or "default" if no such key is present
3586         @param default      The default value to return when the key is not present (default: [])
3587         @param casesense    When false, the values are converted to lower case
3588         '''
3589         val = traverse_obj(
3590             self._downloader.params, ('extractor_args', self.ie_key().lower(), key))
3591         if val is None:
3592             return [] if default is NO_DEFAULT else default
3593         return list(val) if casesense else [x.lower() for x in val]
3594
3595
3596 class SearchInfoExtractor(InfoExtractor):
3597     """
3598     Base class for paged search queries extractors.
3599     They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
3600     Instances should define _SEARCH_KEY and _MAX_RESULTS.
3601     """
3602
3603     @classmethod
3604     def _make_valid_url(cls):
3605         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3606
3607     @classmethod
3608     def suitable(cls, url):
3609         return re.match(cls._make_valid_url(), url) is not None
3610
3611     def _real_extract(self, query):
3612         mobj = re.match(self._make_valid_url(), query)
3613         if mobj is None:
3614             raise ExtractorError('Invalid search query "%s"' % query)
3615
3616         prefix = mobj.group('prefix')
3617         query = mobj.group('query')
3618         if prefix == '':
3619             return self._get_n_results(query, 1)
3620         elif prefix == 'all':
3621             return self._get_n_results(query, self._MAX_RESULTS)
3622         else:
3623             n = int(prefix)
3624             if n <= 0:
3625                 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
3626             elif n > self._MAX_RESULTS:
3627                 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3628                 n = self._MAX_RESULTS
3629             return self._get_n_results(query, n)
3630
3631     def _get_n_results(self, query, n):
3632         """Get a specified number of results for a query"""
3633         raise NotImplementedError('This method must be implemented by subclasses')
3634
3635     @property
3636     def SEARCH_KEY(self):
3637         return self._SEARCH_KEY