yt_dlp/extractor/common.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import base64
   5 import datetime
   6 import hashlib
   7 import json
   8 import netrc
   9 import os
  10 import random
  11 import re
  12 import sys
  13 import time
  14 import math
  15
  16 from ..compat import (
  17     compat_cookiejar_Cookie,
  18     compat_cookies_SimpleCookie,
  19     compat_etree_Element,
  20     compat_etree_fromstring,
  21     compat_expanduser,
  22     compat_getpass,
  23     compat_http_client,
  24     compat_os_name,
  25     compat_str,
  26     compat_urllib_error,
  27     compat_urllib_parse_unquote,
  28     compat_urllib_parse_urlencode,
  29     compat_urllib_request,
  30     compat_urlparse,
  31     compat_xml_parse_error,
  32 )
  33 from ..downloader import FileDownloader
  34 from ..downloader.f4m import (
  35     get_base_url,
  36     remove_encrypted_media,
  37 )
  38 from ..utils import (
  39     age_restricted,
  40     base_url,
  41     bug_reports_message,
  42     clean_html,
  43     compiled_regex_type,
  44     determine_ext,
  45     determine_protocol,
  46     dict_get,
  47     error_to_compat_str,
  48     extract_attributes,
  49     ExtractorError,
  50     fix_xml_ampersands,
  51     float_or_none,
  52     format_field,
  53     GeoRestrictedError,
  54     GeoUtils,
  55     int_or_none,
  56     js_to_json,
  57     JSON_LD_RE,
  58     mimetype2ext,
  59     network_exceptions,
  60     NO_DEFAULT,
  61     orderedSet,
  62     parse_bitrate,
  63     parse_codecs,
  64     parse_duration,
  65     parse_iso8601,
  66     parse_m3u8_attributes,
  67     parse_resolution,
  68     RegexNotFoundError,
  69     sanitize_filename,
  70     sanitized_Request,
  71     str_or_none,
  72     str_to_int,
  73     strip_or_none,
  74     traverse_obj,
  75     unescapeHTML,
  76     unified_strdate,
  77     unified_timestamp,
  78     update_Request,
  79     update_url_query,
  80     url_basename,
  81     url_or_none,
  82     urljoin,
  83     variadic,
  84     xpath_element,
  85     xpath_text,
  86     xpath_with_ns,
  87 )
  88
  89
  90 class InfoExtractor(object):
  91     """Information Extractor class.
  92
  93     Information extractors are the classes that, given a URL, extract
  94     information about the video (or videos) the URL refers to. This
  95     information includes the real video URL, the video title, author and
  96     others. The information is stored in a dictionary which is then
  97     passed to the YoutubeDL. The YoutubeDL processes this
  98     information possibly downloading the video to the file system, among
  99     other possible outcomes.
 100
 101     The type field determines the type of the result.
 102     By far the most common value (and the default if _type is missing) is
 103     "video", which indicates a single video.
 104
 105     For a video, the dictionaries must include the following fields:
 106
 107     id:             Video identifier.
 108     title:          Video title, unescaped.
 109
 110     Additionally, it must contain either a formats entry or a url one:
 111
 112     formats:        A list of dictionaries for each format available, ordered
 113                     from worst to best quality.
 114
 115                     Potential fields:
 116                     * url        The mandatory URL representing the media:
 117                                    for plain file media - HTTP URL of this file,
 118                                    for RTMP - RTMP URL,
 119                                    for HLS - URL of the M3U8 media playlist,
 120                                    for HDS - URL of the F4M manifest,
 121                                    for DASH
 122                                      - HTTP URL to plain file media (in case of
 123                                        unfragmented media)
 124                                      - URL of the MPD manifest or base URL
 125                                        representing the media if MPD manifest
 126                                        is parsed from a string (in case of
 127                                        fragmented media)
 128                                    for MSS - URL of the ISM manifest.
 129                     * manifest_url
 130                                  The URL of the manifest file in case of
 131                                  fragmented media:
 132                                    for HLS - URL of the M3U8 master playlist,
 133                                    for HDS - URL of the F4M manifest,
 134                                    for DASH - URL of the MPD manifest,
 135                                    for MSS - URL of the ISM manifest.
 136                     * ext        Will be calculated from URL if missing
 137                     * format     A human-readable description of the format
 138                                  ("mp4 container with h264/opus").
  139                                  Calculated from the format_id, width, height,
 140                                  and format_note fields if missing.
 141                     * format_id  A short description of the format
 142                                  ("mp4_h264_opus" or "19").
 143                                 Technically optional, but strongly recommended.
 144                     * format_note Additional info about the format
 145                                  ("3D" or "DASH video")
 146                     * width      Width of the video, if known
 147                     * height     Height of the video, if known
 148                     * resolution Textual description of width and height
 149                     * tbr        Average bitrate of audio and video in KBit/s
 150                     * abr        Average audio bitrate in KBit/s
 151                     * acodec     Name of the audio codec in use
 152                     * asr        Audio sampling rate in Hertz
 153                     * vbr        Average video bitrate in KBit/s
 154                     * fps        Frame rate
 155                     * vcodec     Name of the video codec in use
 156                     * container  Name of the container format
 157                     * filesize   The number of bytes, if known in advance
 158                     * filesize_approx  An estimate for the number of bytes
 159                     * player_url SWF Player URL (used for rtmpdump).
 160                     * protocol   The protocol that will be used for the actual
 161                                  download, lower-case.
 162                                  "http", "https", "rtsp", "rtmp", "rtmp_ffmpeg", "rtmpe",
 163                                  "m3u8", "m3u8_native" or "http_dash_segments".
 164                     * fragment_base_url
 165                                  Base URL for fragments. Each fragment's path
 166                                  value (if present) will be relative to
 167                                  this URL.
 168                     * fragments  A list of fragments of a fragmented media.
 169                                  Each fragment entry must contain either an url
 170                                  or a path. If an url is present it should be
 171                                  considered by a client. Otherwise both path and
 172                                  fragment_base_url must be present. Here is
 173                                  the list of all potential fields:
 174                                  * "url" - fragment's URL
 175                                  * "path" - fragment's path relative to
 176                                             fragment_base_url
 177                                  * "duration" (optional, int or float)
 178                                  * "filesize" (optional, int)
 179                     * preference Order number of this format. If this field is
 180                                  present and not None, the formats get sorted
 181                                  by this field, regardless of all other values.
 182                                  -1 for default (order by other properties),
 183                                  -2 or smaller for less than default.
 184                                  < -1000 to hide the format (if there is
 185                                     another one which is strictly better)
 186                     * language   Language code, e.g. "de" or "en-US".
 187                     * language_preference  Is this in the language mentioned in
 188                                  the URL?
 189                                  10 if it's what the URL is about,
 190                                  -1 for default (don't know),
 191                                  -10 otherwise, other values reserved for now.
 192                     * quality    Order number of the video quality of this
 193                                  format, irrespective of the file format.
 194                                  -1 for default (order by other properties),
 195                                  -2 or smaller for less than default.
 196                     * source_preference  Order number for this video source
 197                                   (quality takes higher priority)
 198                                  -1 for default (order by other properties),
 199                                  -2 or smaller for less than default.
 200                     * http_headers  A dictionary of additional HTTP headers
 201                                  to add to the request.
 202                     * stretched_ratio  If given and not 1, indicates that the
 203                                  video's pixels are not square.
 204                                  width : height ratio as float.
 205                     * no_resume  The server does not support resuming the
 206                                  (HTTP or RTMP) download. Boolean.
 207                     * has_drm    The format has DRM and cannot be downloaded. Boolean
 208                     * downloader_options  A dictionary of downloader options as
 209                                  described in FileDownloader
 210                     RTMP formats can also have the additional fields: page_url,
 211                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 212                     rtmp_protocol, rtmp_real_time
 213
 214     url:            Final video URL.
 215     ext:            Video filename extension.
 216     format:         The video format, defaults to ext (used for --get-format)
 217     player_url:     SWF Player URL (used for rtmpdump).
 218
 219     The following fields are optional:
 220
 221     alt_title:      A secondary title of the video.
 222     display_id      An alternative identifier for the video, not necessarily
 223                     unique, but available before title. Typically, id is
 224                     something like "4234987", title "Dancing naked mole rats",
 225                     and display_id "dancing-naked-mole-rats"
 226     thumbnails:     A list of dictionaries, with the following entries:
 227                         * "id" (optional, string) - Thumbnail format ID
 228                         * "url"
 229                         * "preference" (optional, int) - quality of the image
 230                         * "width" (optional, int)
 231                         * "height" (optional, int)
 232                         * "resolution" (optional, string "{width}x{height}",
 233                                         deprecated)
 234                         * "filesize" (optional, int)
 235                         * "_test_url" (optional, bool) - If true, test the URL
 236     thumbnail:      Full URL to a video thumbnail image.
 237     description:    Full video description.
 238     uploader:       Full name of the video uploader.
 239     license:        License name the video is licensed under.
 240     creator:        The creator of the video.
 241     release_timestamp: UNIX timestamp of the moment the video was released.
 242     release_date:   The date (YYYYMMDD) when the video was released.
 243     timestamp:      UNIX timestamp of the moment the video was uploaded
 244     upload_date:    Video upload date (YYYYMMDD).
 245                     If not explicitly set, calculated from timestamp.
 246     uploader_id:    Nickname or id of the video uploader.
 247     uploader_url:   Full URL to a personal webpage of the video uploader.
 248     channel:        Full name of the channel the video is uploaded on.
 249                     Note that channel fields may or may not repeat uploader
 250                     fields. This depends on a particular extractor.
 251     channel_id:     Id of the channel.
 252     channel_url:    Full URL to a channel webpage.
 253     location:       Physical location where the video was filmed.
 254     subtitles:      The available subtitles as a dictionary in the format
 255                     {tag: subformats}. "tag" is usually a language code, and
 256                     "subformats" is a list sorted from lower to higher
 257                     preference, each element is a dictionary with the "ext"
 258                     entry and one of:
 259                         * "data": The subtitles file contents
 260                         * "url": A URL pointing to the subtitles file
 261                     It can optionally also have:
 262                         * "name": Name or description of the subtitles
 263                     "ext" will be calculated from URL if missing
 264     automatic_captions: Like 'subtitles'; contains automatically generated
 265                     captions instead of normal subtitles
 266     duration:       Length of the video in seconds, as an integer or float.
 267     view_count:     How many users have watched the video on the platform.
 268     like_count:     Number of positive ratings of the video
 269     dislike_count:  Number of negative ratings of the video
 270     repost_count:   Number of reposts of the video
  271     average_rating: Average rating given by users, the scale used depends on the webpage
 272     comment_count:  Number of comments on the video
 273     comments:       A list of comments, each with one or more of the following
 274                     properties (all but one of text or html optional):
 275                         * "author" - human-readable name of the comment author
 276                         * "author_id" - user ID of the comment author
 277                         * "author_thumbnail" - The thumbnail of the comment author
 278                         * "id" - Comment ID
 279                         * "html" - Comment as HTML
 280                         * "text" - Plain text of the comment
 281                         * "timestamp" - UNIX timestamp of comment
 282                         * "parent" - ID of the comment this one is replying to.
 283                                      Set to "root" to indicate that this is a
 284                                      comment to the original video.
 285                         * "like_count" - Number of positive ratings of the comment
 286                         * "dislike_count" - Number of negative ratings of the comment
 287                         * "is_favorited" - Whether the comment is marked as
 288                                            favorite by the video uploader
 289                         * "author_is_uploader" - Whether the comment is made by
 290                                                  the video uploader
 291     age_limit:      Age restriction for the video, as an integer (years)
 292     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 293                     should allow to get the same result again. (It will be set
 294                     by YoutubeDL if it's missing)
 295     categories:     A list of categories that the video falls in, for example
 296                     ["Sports", "Berlin"]
 297     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 298     cast:           A list of the video cast
 299     is_live:        True, False, or None (=unknown). Whether this video is a
 300                     live stream that goes on instead of a fixed-length video.
 301     was_live:       True, False, or None (=unknown). Whether this video was
 302                     originally a live stream.
 303     live_status:    'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
 304                     If absent, automatically set from is_live, was_live
 305     start_time:     Time in seconds where the reproduction should start, as
 306                     specified in the URL.
 307     end_time:       Time in seconds where the reproduction should end, as
 308                     specified in the URL.
 309     chapters:       A list of dictionaries, with the following entries:
 310                         * "start_time" - The start time of the chapter in seconds
 311                         * "end_time" - The end time of the chapter in seconds
 312                         * "title" (optional, string)
 313     playable_in_embed: Whether this video is allowed to play in embedded
 314                     players on other sites. Can be True (=always allowed),
 315                     False (=never allowed), None (=unknown), or a string
  316                     specifying the criteria for embeddability (e.g. 'whitelist')
 317     availability:   Under what condition the video is available. One of
 318                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 319                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 320                     to set it
 321     __post_extractor: A function to be called just before the metadata is
 322                     written to either disk, logger or console. The function
 323                     must return a dict which will be added to the info_dict.
  324                     This is useful for additional information that is
 325                     time-consuming to extract. Note that the fields thus
 326                     extracted will not be available to output template and
 327                     match_filter. So, only "comments" and "comment_count" are
 328                     currently allowed to be extracted via this method.
 329
 330     The following fields should only be used when the video belongs to some logical
 331     chapter or section:
 332
 333     chapter:        Name or title of the chapter the video belongs to.
 334     chapter_number: Number of the chapter the video belongs to, as an integer.
 335     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 336
 337     The following fields should only be used when the video is an episode of some
 338     series, programme or podcast:
 339
 340     series:         Title of the series or programme the video episode belongs to.
 341     season:         Title of the season the video episode belongs to.
 342     season_number:  Number of the season the video episode belongs to, as an integer.
 343     season_id:      Id of the season the video episode belongs to, as a unicode string.
 344     episode:        Title of the video episode. Unlike mandatory video title field,
 345                     this field should denote the exact title of the video episode
 346                     without any kind of decoration.
 347     episode_number: Number of the video episode within a season, as an integer.
 348     episode_id:     Id of the video episode, as a unicode string.
 349
 350     The following fields should only be used when the media is a track or a part of
 351     a music album:
 352
 353     track:          Title of the track.
 354     track_number:   Number of the track within an album or a disc, as an integer.
 355     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 356                     as a unicode string.
 357     artist:         Artist(s) of the track.
 358     genre:          Genre(s) of the track.
 359     album:          Title of the album the track belongs to.
 360     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 361     album_artist:   List of all artists appeared on the album (e.g.
 362                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 363                     and compilations).
 364     disc_number:    Number of the disc or other physical medium the track belongs to,
 365                     as an integer.
 366     release_year:   Year (YYYY) when the album was released.
 367
 368     Unless mentioned otherwise, the fields should be Unicode strings.
 369
 370     Unless mentioned otherwise, None is equivalent to absence of information.
 371
 372
 373     _type "playlist" indicates multiple videos.
 374     There must be a key "entries", which is a list, an iterable, or a PagedList
 375     object, each element of which is a valid dictionary by this specification.
 376
  377     Additionally, playlists can have "id", "title", and any other relevant
 378     attributes with the same semantics as videos (see above).
 379
 380
 381     _type "multi_video" indicates that there are multiple videos that
  382     form a single show, for example multiple acts of an opera or TV episode.
 383     It must have an entries key like a playlist and contain all the keys
 384     required for a video at the same time.
 385
 386
 387     _type "url" indicates that the video must be extracted from another
 388     location, possibly by a different extractor. Its only required key is:
 389     "url" - the next URL to extract.
 390     The key "ie_key" can be set to the class name (minus the trailing "IE",
 391     e.g. "Youtube") if the extractor class is known in advance.
 392     Additionally, the dictionary may have any properties of the resolved entity
 393     known in advance, for example "title" if the title of the referred video is
 394     known ahead of time.
 395
 396
 397     _type "url_transparent" entities have the same specification as "url", but
 398     indicate that the given additional information is more precise than the one
 399     associated with the resolved URL.
 400     This is useful when a site employs a video service that hosts the video and
 401     its technical metadata, but that video service does not embed a useful
 402     title, description etc.
 403
 404
 405     Subclasses of this one should re-define the _real_initialize() and
 406     _real_extract() methods and define a _VALID_URL regexp.
 407     Probably, they should also be added to the list of extractors.
 408
 409     _GEO_BYPASS attribute may be set to False in order to disable
 410     geo restriction bypass mechanisms for a particular extractor.
 411     Though it won't disable explicit geo restriction bypass based on
 412     country code provided with geo_bypass_country.
 413
 414     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 415     countries for this extractor. One of these countries will be used by
 416     geo restriction bypass mechanism right away in order to bypass
 417     geo restriction, of course, if the mechanism is not disabled.
 418
 419     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 420     IP blocks in CIDR notation for this extractor. One of these IP blocks
 421     will be used by geo restriction bypass mechanism similarly
 422     to _GEO_COUNTRIES.
 423
 424     Finally, the _WORKING attribute should be set to False for broken IEs
 425     in order to warn the users and skip the tests.
 426     """
 427
    # Becomes True in initialize() once _real_initialize() has been run
    _ready = False
    # Downloader associated with this IE; assigned through set_downloader()
    _downloader = None
    # Fake IP used as X-Forwarded-For by the geo bypass mechanism (None = not set)
    _x_forwarded_for_ip = None
    # Geo restriction bypass configuration, see the class docstring
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    # Set to False for broken IEs, see the class docstring
    _WORKING = True

    # Messages telling the user how to provide credentials, keyed by the kind
    # of authentication that is accepted.
    # NOTE(review): the code consuming these hints is not visible here - confirm
    # the expected keys against the callers before adding or renaming entries
    _LOGIN_HINTS = {
        'any': 'Use --cookies, --username and --password or --netrc to provide account credentials',
        'cookies': (
            'Use --cookies for the authentication. '
            'See  https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl  for how to pass cookies'),
        'password': 'Use --username and --password or --netrc to provide account credentials',
    }
 443
 444     def __init__(self, downloader=None):
 445         """Constructor. Receives an optional downloader."""
 446         self._ready = False
 447         self._x_forwarded_for_ip = None
 448         self._printed_messages = set()
 449         self.set_downloader(downloader)
 450
 451     @classmethod
 452     def _match_valid_url(cls, url):
 453         # This does not use has/getattr intentionally - we want to know whether
 454         # we have cached the regexp for *this* class, whereas getattr would also
 455         # match the superclass
 456         if '_VALID_URL_RE' not in cls.__dict__:
 457             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 458         return cls._VALID_URL_RE.match(url)
 459
 460     @classmethod
 461     def suitable(cls, url):
 462         """Receives a URL and returns True if suitable for this IE."""
 463         # This function must import everything it needs (except other extractors),
 464         # so that lazy_extractors works correctly
 465         return cls._match_valid_url(url) is not None
 466
 467     @classmethod
 468     def _match_id(cls, url):
 469         return cls._match_valid_url(url).group('id')
 470
 471     @classmethod
 472     def get_temp_id(cls, url):
 473         try:
 474             return cls._match_id(url)
 475         except (IndexError, AttributeError):
 476             return None
 477
 478     @classmethod
 479     def working(cls):
 480         """Getter method for _WORKING."""
 481         return cls._WORKING
 482
 483     def initialize(self):
 484         """Initializes an instance (authentication, etc)."""
 485         self._printed_messages = set()
 486         self._initialize_geo_bypass({
 487             'countries': self._GEO_COUNTRIES,
 488             'ip_blocks': self._GEO_IP_BLOCKS,
 489         })
 490         if not self._ready:
 491             self._real_initialize()
 492             self._ready = True
 493
 494     def _initialize_geo_bypass(self, geo_bypass_context):
 495         """
 496         Initialize geo restriction bypass mechanism.
 497
 498         This method is used to initialize geo bypass mechanism based on faking
 499         X-Forwarded-For HTTP header. A random country from provided country list
 500         is selected and a random IP belonging to this country is generated. This
 501         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 502         HTTP requests.
 503
 504         This method will be used for initial geo bypass mechanism initialization
 505         during the instance initialization with _GEO_COUNTRIES and
 506         _GEO_IP_BLOCKS.
 507
 508         You may also manually call it from extractor's code if geo bypass
 509         information is not available beforehand (e.g. obtained during
 510         extraction) or due to some other reason. In this case you should pass
 511         this information in geo bypass context passed as first argument. It may
 512         contain following fields:
 513
 514         countries:  List of geo unrestricted countries (similar
 515                     to _GEO_COUNTRIES)
 516         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 517                     (similar to _GEO_IP_BLOCKS)
 518
 519         """
 520         if not self._x_forwarded_for_ip:
 521
 522             # Geo bypass mechanism is explicitly disabled by user
 523             if not self.get_param('geo_bypass', True):
 524                 return
 525
 526             if not geo_bypass_context:
 527                 geo_bypass_context = {}
 528
 529             # Backward compatibility: previously _initialize_geo_bypass
 530             # expected a list of countries, some 3rd party code may still use
 531             # it this way
 532             if isinstance(geo_bypass_context, (list, tuple)):
 533                 geo_bypass_context = {
 534                     'countries': geo_bypass_context,
 535                 }
 536
 537             # The whole point of geo bypass mechanism is to fake IP
 538             # as X-Forwarded-For HTTP header based on some IP block or
 539             # country code.
 540
 541             # Path 1: bypassing based on IP block in CIDR notation
 542
 543             # Explicit IP block specified by user, use it right away
 544             # regardless of whether extractor is geo bypassable or not
 545             ip_block = self.get_param('geo_bypass_ip_block', None)
 546
 547             # Otherwise use random IP block from geo bypass context but only
 548             # if extractor is known as geo bypassable
 549             if not ip_block:
 550                 ip_blocks = geo_bypass_context.get('ip_blocks')
 551                 if self._GEO_BYPASS and ip_blocks:
 552                     ip_block = random.choice(ip_blocks)
 553
 554             if ip_block:
 555                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 556                 self._downloader.write_debug(
 557                     '[debug] Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
 558                 return
 559
 560             # Path 2: bypassing based on country code
 561
 562             # Explicit country code specified by user, use it right away
 563             # regardless of whether extractor is geo bypassable or not
 564             country = self.get_param('geo_bypass_country', None)
 565
 566             # Otherwise use random country code from geo bypass context but
 567             # only if extractor is known as geo bypassable
 568             if not country:
 569                 countries = geo_bypass_context.get('countries')
 570                 if self._GEO_BYPASS and countries:
 571                     country = random.choice(countries)
 572
 573             if country:
 574                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 575                 self._downloader.write_debug(
 576                     'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))
 577
 578     def extract(self, url):
 579         """Extracts URL information and returns it in list of dicts."""
 580         try:
 581             for _ in range(2):
 582                 try:
 583                     self.initialize()
 584                     self.write_debug('Extracting URL: %s' % url)
 585                     ie_result = self._real_extract(url)
 586                     if ie_result is None:
 587                         return None
 588                     if self._x_forwarded_for_ip:
 589                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 590                     subtitles = ie_result.get('subtitles')
 591                     if (subtitles and 'live_chat' in subtitles
 592                             and 'no-live-chat' in self.get_param('compat_opts', [])):
 593                         del subtitles['live_chat']
 594                     return ie_result
 595                 except GeoRestrictedError as e:
 596                     if self.__maybe_fake_ip_and_retry(e.countries):
 597                         continue
 598                     raise
 599         except ExtractorError as e:
 600             video_id = e.video_id or self.get_temp_id(url)
 601             raise ExtractorError(
 602                 e.msg, video_id=video_id, ie=self.IE_NAME, tb=e.traceback, expected=e.expected, cause=e.cause)
 603         except compat_http_client.IncompleteRead as e:
 604             raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
 605         except (KeyError, StopIteration) as e:
 606             raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 607
 608     def __maybe_fake_ip_and_retry(self, countries):
 609         if (not self.get_param('geo_bypass_country', None)
 610                 and self._GEO_BYPASS
 611                 and self.get_param('geo_bypass', True)
 612                 and not self._x_forwarded_for_ip
 613                 and countries):
 614             country_code = random.choice(countries)
 615             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 616             if self._x_forwarded_for_ip:
 617                 self.report_warning(
 618                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 619                     % (self._x_forwarded_for_ip, country_code.upper()))
 620                 return True
 621         return False
 622
 623     def set_downloader(self, downloader):
 624         """Sets the downloader for this IE."""
 625         self._downloader = downloader
 626
 627     def _real_initialize(self):
 628         """Real initialization process. Redefine in subclasses."""
 629         pass
 630
 631     def _real_extract(self, url):
 632         """Real extraction process. Redefine in subclasses."""
 633         pass
 634
 635     @classmethod
 636     def ie_key(cls):
 637         """A string for getting the InfoExtractor with get_info_extractor"""
 638         return cls.__name__[:-2]
 639
 640     @property
 641     def IE_NAME(self):
 642         return compat_str(type(self).__name__[:-2])
 643
 644     @staticmethod
 645     def __can_accept_status_code(err, expected_status):
 646         assert isinstance(err, compat_urllib_error.HTTPError)
 647         if expected_status is None:
 648             return False
 649         elif callable(expected_status):
 650             return expected_status(err.code) is True
 651         else:
 652             return err.code in variadic(expected_status)
 653
 654     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
 655         """
 656         Return the response handle.
 657
 658         See _download_webpage docstring for arguments specification.
 659         """
 660         if not self._downloader._first_webpage_request:
 661             sleep_interval = float_or_none(self.get_param('sleep_interval_requests')) or 0
 662             if sleep_interval > 0:
 663                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 664                 time.sleep(sleep_interval)
 665         else:
 666             self._downloader._first_webpage_request = False
 667
 668         if note is None:
 669             self.report_download_webpage(video_id)
 670         elif note is not False:
 671             if video_id is None:
 672                 self.to_screen('%s' % (note,))
 673             else:
 674                 self.to_screen('%s: %s' % (video_id, note))
 675
 676         # Some sites check X-Forwarded-For HTTP header in order to figure out
 677         # the origin of the client behind proxy. This allows bypassing geo
 678         # restriction by faking this header's value to IP that belongs to some
 679         # geo unrestricted country. We will do so once we encounter any
 680         # geo restriction error.
 681         if self._x_forwarded_for_ip:
 682             if 'X-Forwarded-For' not in headers:
 683                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 684
 685         if isinstance(url_or_request, compat_urllib_request.Request):
 686             url_or_request = update_Request(
 687                 url_or_request, data=data, headers=headers, query=query)
 688         else:
 689             if query:
 690                 url_or_request = update_url_query(url_or_request, query)
 691             if data is not None or headers:
 692                 url_or_request = sanitized_Request(url_or_request, data, headers)
 693         try:
 694             return self._downloader.urlopen(url_or_request)
 695         except network_exceptions as err:
 696             if isinstance(err, compat_urllib_error.HTTPError):
 697                 if self.__can_accept_status_code(err, expected_status):
 698                     # Retain reference to error to prevent file object from
 699                     # being closed before it can be read. Works around the
 700                     # effects of <https://bugs.python.org/issue15002>
 701                     # introduced in Python 3.4.1.
 702                     err.fp._error = err
 703                     return err.fp
 704
 705             if errnote is False:
 706                 return False
 707             if errnote is None:
 708                 errnote = 'Unable to download webpage'
 709
 710             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 711             if fatal:
 712                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 713             else:
 714                 self.report_warning(errmsg)
 715                 return False
 716
 717     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
 718         """
 719         Return a tuple (page content as string, URL handle).
 720
 721         See _download_webpage docstring for arguments specification.
 722         """
 723         # Strip hashes from the URL (#1038)
 724         if isinstance(url_or_request, (compat_str, str)):
 725             url_or_request = url_or_request.partition('#')[0]
 726
 727         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 728         if urlh is False:
 729             assert not fatal
 730             return False
 731         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 732         return (content, urlh)
 733
 734     @staticmethod
 735     def _guess_encoding_from_content(content_type, webpage_bytes):
 736         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 737         if m:
 738             encoding = m.group(1)
 739         else:
 740             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 741                           webpage_bytes[:1024])
 742             if m:
 743                 encoding = m.group(1).decode('ascii')
 744             elif webpage_bytes.startswith(b'\xff\xfe'):
 745                 encoding = 'utf-16'
 746             else:
 747                 encoding = 'utf-8'
 748
 749         return encoding
 750
 751     def __check_blocked(self, content):
 752         first_block = content[:512]
 753         if ('<title>Access to this site is blocked</title>' in content
 754                 and 'Websense' in first_block):
 755             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 756             blocked_iframe = self._html_search_regex(
 757                 r'<iframe src="([^"]+)"', content,
 758                 'Websense information URL', default=None)
 759             if blocked_iframe:
 760                 msg += ' Visit %s for more details' % blocked_iframe
 761             raise ExtractorError(msg, expected=True)
 762         if '<title>The URL you requested has been blocked</title>' in first_block:
 763             msg = (
 764                 'Access to this webpage has been blocked by Indian censorship. '
 765                 'Use a VPN or proxy server (with --proxy) to route around it.')
 766             block_msg = self._html_search_regex(
 767                 r'</h1><p>(.*?)</p>',
 768                 content, 'block message', default=None)
 769             if block_msg:
 770                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 771             raise ExtractorError(msg, expected=True)
 772         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 773                 and 'blocklist.rkn.gov.ru' in content):
 774             raise ExtractorError(
 775                 'Access to this webpage has been blocked by decision of the Russian government. '
 776                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 777                 expected=True)
 778
 779     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 780         content_type = urlh.headers.get('Content-Type', '')
 781         webpage_bytes = urlh.read()
 782         if prefix is not None:
 783             webpage_bytes = prefix + webpage_bytes
 784         if not encoding:
 785             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 786         if self.get_param('dump_intermediate_pages', False):
 787             self.to_screen('Dumping request to ' + urlh.geturl())
 788             dump = base64.b64encode(webpage_bytes).decode('ascii')
 789             self._downloader.to_screen(dump)
 790         if self.get_param('write_pages', False):
 791             basen = '%s_%s' % (video_id, urlh.geturl())
 792             if len(basen) > 240:
 793                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 794                 basen = basen[:240 - len(h)] + h
 795             raw_filename = basen + '.dump'
 796             filename = sanitize_filename(raw_filename, restricted=True)
 797             self.to_screen('Saving request to ' + filename)
 798             # Working around MAX_PATH limitation on Windows (see
 799             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 800             if compat_os_name == 'nt':
 801                 absfilepath = os.path.abspath(filename)
 802                 if len(absfilepath) > 259:
 803                     filename = '\\\\?\\' + absfilepath
 804             with open(filename, 'wb') as outf:
 805                 outf.write(webpage_bytes)
 806
 807         try:
 808             content = webpage_bytes.decode(encoding, 'replace')
 809         except LookupError:
 810             content = webpage_bytes.decode('utf-8', 'replace')
 811
 812         self.__check_blocked(content)
 813
 814         return content
 815
 816     def _download_webpage(
 817             self, url_or_request, video_id, note=None, errnote=None,
 818             fatal=True, tries=1, timeout=5, encoding=None, data=None,
 819             headers={}, query={}, expected_status=None):
 820         """
 821         Return the data of the page as a string.
 822
 823         Arguments:
 824         url_or_request -- plain text URL as a string or
 825             a compat_urllib_request.Requestobject
 826         video_id -- Video/playlist/item identifier (string)
 827
 828         Keyword arguments:
 829         note -- note printed before downloading (string)
 830         errnote -- note printed in case of an error (string)
 831         fatal -- flag denoting whether error should be considered fatal,
 832             i.e. whether it should cause ExtractionError to be raised,
 833             otherwise a warning will be reported and extraction continued
 834         tries -- number of tries
 835         timeout -- sleep interval between tries
 836         encoding -- encoding for a page content decoding, guessed automatically
 837             when not explicitly specified
 838         data -- POST data (bytes)
 839         headers -- HTTP headers (dict)
 840         query -- URL query (dict)
 841         expected_status -- allows to accept failed HTTP requests (non 2xx
 842             status code) by explicitly specifying a set of accepted status
 843             codes. Can be any of the following entities:
 844                 - an integer type specifying an exact failed status code to
 845                   accept
 846                 - a list or a tuple of integer types specifying a list of
 847                   failed status codes to accept
 848                 - a callable accepting an actual failed status code and
 849                   returning True if it should be accepted
 850             Note that this argument does not affect success status codes (2xx)
 851             which are always accepted.
 852         """
 853
 854         success = False
 855         try_count = 0
 856         while success is False:
 857             try:
 858                 res = self._download_webpage_handle(
 859                     url_or_request, video_id, note, errnote, fatal,
 860                     encoding=encoding, data=data, headers=headers, query=query,
 861                     expected_status=expected_status)
 862                 success = True
 863             except compat_http_client.IncompleteRead as e:
 864                 try_count += 1
 865                 if try_count >= tries:
 866                     raise e
 867                 self._sleep(timeout, video_id)
 868         if res is False:
 869             return res
 870         else:
 871             content, _ = res
 872             return content
 873
 874     def _download_xml_handle(
 875             self, url_or_request, video_id, note='Downloading XML',
 876             errnote='Unable to download XML', transform_source=None,
 877             fatal=True, encoding=None, data=None, headers={}, query={},
 878             expected_status=None):
 879         """
 880         Return a tuple (xml as an compat_etree_Element, URL handle).
 881
 882         See _download_webpage docstring for arguments specification.
 883         """
 884         res = self._download_webpage_handle(
 885             url_or_request, video_id, note, errnote, fatal=fatal,
 886             encoding=encoding, data=data, headers=headers, query=query,
 887             expected_status=expected_status)
 888         if res is False:
 889             return res
 890         xml_string, urlh = res
 891         return self._parse_xml(
 892             xml_string, video_id, transform_source=transform_source,
 893             fatal=fatal), urlh
 894
 895     def _download_xml(
 896             self, url_or_request, video_id,
 897             note='Downloading XML', errnote='Unable to download XML',
 898             transform_source=None, fatal=True, encoding=None,
 899             data=None, headers={}, query={}, expected_status=None):
 900         """
 901         Return the xml as an compat_etree_Element.
 902
 903         See _download_webpage docstring for arguments specification.
 904         """
 905         res = self._download_xml_handle(
 906             url_or_request, video_id, note=note, errnote=errnote,
 907             transform_source=transform_source, fatal=fatal, encoding=encoding,
 908             data=data, headers=headers, query=query,
 909             expected_status=expected_status)
 910         return res if res is False else res[0]
 911
 912     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 913         if transform_source:
 914             xml_string = transform_source(xml_string)
 915         try:
 916             return compat_etree_fromstring(xml_string.encode('utf-8'))
 917         except compat_xml_parse_error as ve:
 918             errmsg = '%s: Failed to parse XML ' % video_id
 919             if fatal:
 920                 raise ExtractorError(errmsg, cause=ve)
 921             else:
 922                 self.report_warning(errmsg + str(ve))
 923
 924     def _download_json_handle(
 925             self, url_or_request, video_id, note='Downloading JSON metadata',
 926             errnote='Unable to download JSON metadata', transform_source=None,
 927             fatal=True, encoding=None, data=None, headers={}, query={},
 928             expected_status=None):
 929         """
 930         Return a tuple (JSON object, URL handle).
 931
 932         See _download_webpage docstring for arguments specification.
 933         """
 934         res = self._download_webpage_handle(
 935             url_or_request, video_id, note, errnote, fatal=fatal,
 936             encoding=encoding, data=data, headers=headers, query=query,
 937             expected_status=expected_status)
 938         if res is False:
 939             return res
 940         json_string, urlh = res
 941         return self._parse_json(
 942             json_string, video_id, transform_source=transform_source,
 943             fatal=fatal), urlh
 944
 945     def _download_json(
 946             self, url_or_request, video_id, note='Downloading JSON metadata',
 947             errnote='Unable to download JSON metadata', transform_source=None,
 948             fatal=True, encoding=None, data=None, headers={}, query={},
 949             expected_status=None):
 950         """
 951         Return the JSON object as a dict.
 952
 953         See _download_webpage docstring for arguments specification.
 954         """
 955         res = self._download_json_handle(
 956             url_or_request, video_id, note=note, errnote=errnote,
 957             transform_source=transform_source, fatal=fatal, encoding=encoding,
 958             data=data, headers=headers, query=query,
 959             expected_status=expected_status)
 960         return res if res is False else res[0]
 961
 962     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 963         if transform_source:
 964             json_string = transform_source(json_string)
 965         try:
 966             return json.loads(json_string)
 967         except ValueError as ve:
 968             errmsg = '%s: Failed to parse JSON ' % video_id
 969             if fatal:
 970                 raise ExtractorError(errmsg, cause=ve)
 971             else:
 972                 self.report_warning(errmsg + str(ve))
 973
 974     def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
 975         return self._parse_json(
 976             data[data.find('{'):data.rfind('}') + 1],
 977             video_id, transform_source, fatal)
 978
 979     def _download_socket_json_handle(
 980             self, url_or_request, video_id, note='Polling socket',
 981             errnote='Unable to poll socket', transform_source=None,
 982             fatal=True, encoding=None, data=None, headers={}, query={},
 983             expected_status=None):
 984         """
 985         Return a tuple (JSON object, URL handle).
 986
 987         See _download_webpage docstring for arguments specification.
 988         """
 989         res = self._download_webpage_handle(
 990             url_or_request, video_id, note, errnote, fatal=fatal,
 991             encoding=encoding, data=data, headers=headers, query=query,
 992             expected_status=expected_status)
 993         if res is False:
 994             return res
 995         webpage, urlh = res
 996         return self._parse_socket_response_as_json(
 997             webpage, video_id, transform_source=transform_source,
 998             fatal=fatal), urlh
 999
1000     def _download_socket_json(
1001             self, url_or_request, video_id, note='Polling socket',
1002             errnote='Unable to poll socket', transform_source=None,
1003             fatal=True, encoding=None, data=None, headers={}, query={},
1004             expected_status=None):
1005         """
1006         Return the JSON object as a dict.
1007
1008         See _download_webpage docstring for arguments specification.
1009         """
1010         res = self._download_socket_json_handle(
1011             url_or_request, video_id, note=note, errnote=errnote,
1012             transform_source=transform_source, fatal=fatal, encoding=encoding,
1013             data=data, headers=headers, query=query,
1014             expected_status=expected_status)
1015         return res if res is False else res[0]
1016
1017     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1018         idstr = format_field(video_id, template='%s: ')
1019         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1020         if only_once:
1021             if f'WARNING: {msg}' in self._printed_messages:
1022                 return
1023             self._printed_messages.add(f'WARNING: {msg}')
1024         self._downloader.report_warning(msg, *args, **kwargs)
1025
1026     def to_screen(self, msg, *args, **kwargs):
1027         """Print msg to screen, prefixing it with '[ie_name]'"""
1028         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1029
1030     def write_debug(self, msg, *args, **kwargs):
1031         self._downloader.write_debug('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1032
1033     def get_param(self, name, default=None, *args, **kwargs):
1034         if self._downloader:
1035             return self._downloader.params.get(name, default, *args, **kwargs)
1036         return default
1037
1038     def report_drm(self, video_id, partial=False):
1039         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1040
1041     def report_extraction(self, id_or_name):
1042         """Report information extraction."""
1043         self.to_screen('%s: Extracting information' % id_or_name)
1044
1045     def report_download_webpage(self, video_id):
1046         """Report webpage download."""
1047         self.to_screen('%s: Downloading webpage' % video_id)
1048
1049     def report_age_confirmation(self):
1050         """Report attempt to confirm age."""
1051         self.to_screen('Confirming age')
1052
1053     def report_login(self):
1054         """Report attempt to log in."""
1055         self.to_screen('Logging in')
1056
1057     def raise_login_required(
1058             self, msg='This video is only available for registered users',
1059             metadata_available=False, method='any'):
1060         if metadata_available and self.get_param('ignore_no_formats_error'):
1061             self.report_warning(msg)
1062         if method is not None:
1063             msg = '%s. %s' % (msg, self._LOGIN_HINTS[method])
1064         raise ExtractorError(msg, expected=True)
1065
1066     def raise_geo_restricted(
1067             self, msg='This video is not available from your location due to geo restriction',
1068             countries=None, metadata_available=False):
1069         if metadata_available and self.get_param('ignore_no_formats_error'):
1070             self.report_warning(msg)
1071         else:
1072             raise GeoRestrictedError(msg, countries=countries)
1073
1074     def raise_no_formats(self, msg, expected=False, video_id=None):
1075         if expected and self.get_param('ignore_no_formats_error'):
1076             self.report_warning(msg, video_id)
1077         elif isinstance(msg, ExtractorError):
1078             raise msg
1079         else:
1080             raise ExtractorError(msg, expected=expected, video_id=video_id)
1081
1082     # Methods for following #608
1083     @staticmethod
1084     def url_result(url, ie=None, video_id=None, video_title=None):
1085         """Returns a URL that points to a page that should be processed"""
1086         # TODO: ie should be the class used for getting the info
1087         video_info = {'_type': 'url',
1088                       'url': url,
1089                       'ie_key': ie}
1090         if video_id is not None:
1091             video_info['id'] = video_id
1092         if video_title is not None:
1093             video_info['title'] = video_title
1094         return video_info
1095
1096     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
1097         urls = orderedSet(
1098             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
1099             for m in matches)
1100         return self.playlist_result(
1101             urls, playlist_id=playlist_id, playlist_title=playlist_title)
1102
1103     @staticmethod
1104     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
1105         """Returns a playlist"""
1106         video_info = {'_type': 'playlist',
1107                       'entries': entries}
1108         video_info.update(kwargs)
1109         if playlist_id:
1110             video_info['id'] = playlist_id
1111         if playlist_title:
1112             video_info['title'] = playlist_title
1113         if playlist_description is not None:
1114             video_info['description'] = playlist_description
1115         return video_info
1116
1117     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1118         """
1119         Perform a regex search on the given string, using a single or a list of
1120         patterns returning the first matching group.
1121         In case of failure return a default value or raise a WARNING or a
1122         RegexNotFoundError, depending on fatal, specifying the field name.
1123         """
1124         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
1125             mobj = re.search(pattern, string, flags)
1126         else:
1127             for p in pattern:
1128                 mobj = re.search(p, string, flags)
1129                 if mobj:
1130                     break
1131
1132         if not self.get_param('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
1133             _name = '\033[0;34m%s\033[0m' % name
1134         else:
1135             _name = name
1136
1137         if mobj:
1138             if group is None:
1139                 # return the first matching group
1140                 return next(g for g in mobj.groups() if g is not None)
1141             elif isinstance(group, (list, tuple)):
1142                 return tuple(mobj.group(g) for g in group)
1143             else:
1144                 return mobj.group(group)
1145         elif default is not NO_DEFAULT:
1146             return default
1147         elif fatal:
1148             raise RegexNotFoundError('Unable to extract %s' % _name)
1149         else:
1150             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1151             return None
1152
1153     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1154         """
1155         Like _search_regex, but strips HTML tags and unescapes entities.
1156         """
1157         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1158         if res:
1159             return clean_html(res).strip()
1160         else:
1161             return res
1162
1163     def _get_netrc_login_info(self, netrc_machine=None):
1164         username = None
1165         password = None
1166         netrc_machine = netrc_machine or self._NETRC_MACHINE
1167
1168         if self.get_param('usenetrc', False):
1169             try:
1170                 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1171                 if os.path.isdir(netrc_file):
1172                     netrc_file = os.path.join(netrc_file, '.netrc')
1173                 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1174                 if info is not None:
1175                     username = info[0]
1176                     password = info[2]
1177                 else:
1178                     raise netrc.NetrcParseError(
1179                         'No authenticators for %s' % netrc_machine)
1180             except (IOError, netrc.NetrcParseError) as err:
1181                 self.report_warning(
1182                     'parsing .netrc: %s' % error_to_compat_str(err))
1183
1184         return username, password
1185
1186     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1187         """
1188         Get the login info as (username, password)
1189         First look for the manually specified credentials using username_option
1190         and password_option as keys in params dictionary. If no such credentials
1191         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1192         value.
1193         If there's no info available, return (None, None)
1194         """
1195
1196         # Attempt to use provided username and password or .netrc data
1197         username = self.get_param(username_option)
1198         if username is not None:
1199             password = self.get_param(password_option)
1200         else:
1201             username, password = self._get_netrc_login_info(netrc_machine)
1202
1203         return username, password
1204
1205     def _get_tfa_info(self, note='two-factor verification code'):
1206         """
1207         Get the two-factor authentication info
1208         TODO - asking the user will be required for sms/phone verify
1209         currently just uses the command line option
1210         If there's no info available, return None
1211         """
1212
1213         tfa = self.get_param('twofactor')
1214         if tfa is not None:
1215             return tfa
1216
1217         return compat_getpass('Type %s and press [Return]: ' % note)
1218
1219     # Helper functions for extracting OpenGraph info
1220     @staticmethod
1221     def _og_regexes(prop):
1222         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1223         property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
1224                        % {'prop': re.escape(prop)})
1225         template = r'<meta[^>]+?%s[^>]+?%s'
1226         return [
1227             template % (property_re, content_re),
1228             template % (content_re, property_re),
1229         ]
1230
1231     @staticmethod
1232     def _meta_regex(prop):
1233         return r'''(?isx)<meta
1234                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1235                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1236
1237     def _og_search_property(self, prop, html, name=None, **kargs):
1238         prop = variadic(prop)
1239         if name is None:
1240             name = 'OpenGraph %s' % prop[0]
1241         og_regexes = []
1242         for p in prop:
1243             og_regexes.extend(self._og_regexes(p))
1244         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1245         if escaped is None:
1246             return None
1247         return unescapeHTML(escaped)
1248
1249     def _og_search_thumbnail(self, html, **kargs):
1250         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1251
1252     def _og_search_description(self, html, **kargs):
1253         return self._og_search_property('description', html, fatal=False, **kargs)
1254
1255     def _og_search_title(self, html, **kargs):
1256         return self._og_search_property('title', html, **kargs)
1257
1258     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1259         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1260         if secure:
1261             regexes = self._og_regexes('video:secure_url') + regexes
1262         return self._html_search_regex(regexes, html, name, **kargs)
1263
1264     def _og_search_url(self, html, **kargs):
1265         return self._og_search_property('url', html, **kargs)
1266
1267     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1268         name = variadic(name)
1269         if display_name is None:
1270             display_name = name[0]
1271         return self._html_search_regex(
1272             [self._meta_regex(n) for n in name],
1273             html, display_name, fatal=fatal, group='content', **kwargs)
1274
1275     def _dc_search_uploader(self, html):
1276         return self._html_search_meta('dc.creator', html, 'uploader')
1277
1278     def _rta_search(self, html):
1279         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1280         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1281                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1282                      html):
1283             return 18
1284         return 0
1285
1286     def _media_rating_search(self, html):
1287         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1288         rating = self._html_search_meta('rating', html)
1289
1290         if not rating:
1291             return None
1292
1293         RATING_TABLE = {
1294             'safe for kids': 0,
1295             'general': 8,
1296             '14 years': 14,
1297             'mature': 17,
1298             'restricted': 19,
1299         }
1300         return RATING_TABLE.get(rating.lower())
1301
1302     def _family_friendly_search(self, html):
1303         # See http://schema.org/VideoObject
1304         family_friendly = self._html_search_meta(
1305             'isFamilyFriendly', html, default=None)
1306
1307         if not family_friendly:
1308             return None
1309
1310         RATING_TABLE = {
1311             '1': 0,
1312             'true': 0,
1313             '0': 18,
1314             'false': 18,
1315         }
1316         return RATING_TABLE.get(family_friendly.lower())
1317
1318     def _twitter_search_player(self, html):
1319         return self._html_search_meta('twitter:player', html,
1320                                       'twitter card player')
1321
1322     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1323         json_ld_list = list(re.finditer(JSON_LD_RE, html))
1324         default = kwargs.get('default', NO_DEFAULT)
1325         # JSON-LD may be malformed and thus `fatal` should be respected.
1326         # At the same time `default` may be passed that assumes `fatal=False`
1327         # for _search_regex. Let's simulate the same behavior here as well.
1328         fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
1329         json_ld = []
1330         for mobj in json_ld_list:
1331             json_ld_item = self._parse_json(
1332                 mobj.group('json_ld'), video_id, fatal=fatal)
1333             if not json_ld_item:
1334                 continue
1335             if isinstance(json_ld_item, dict):
1336                 json_ld.append(json_ld_item)
1337             elif isinstance(json_ld_item, (list, tuple)):
1338                 json_ld.extend(json_ld_item)
1339         if json_ld:
1340             json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1341         if json_ld:
1342             return json_ld
1343         if default is not NO_DEFAULT:
1344             return default
1345         elif fatal:
1346             raise RegexNotFoundError('Unable to extract JSON-LD')
1347         else:
1348             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1349             return {}
1350
1351     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1352         if isinstance(json_ld, compat_str):
1353             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1354         if not json_ld:
1355             return {}
1356         info = {}
1357         if not isinstance(json_ld, (list, tuple, dict)):
1358             return info
1359         if isinstance(json_ld, dict):
1360             json_ld = [json_ld]
1361
1362         INTERACTION_TYPE_MAP = {
1363             'CommentAction': 'comment',
1364             'AgreeAction': 'like',
1365             'DisagreeAction': 'dislike',
1366             'LikeAction': 'like',
1367             'DislikeAction': 'dislike',
1368             'ListenAction': 'view',
1369             'WatchAction': 'view',
1370             'ViewAction': 'view',
1371         }
1372
1373         def extract_interaction_type(e):
1374             interaction_type = e.get('interactionType')
1375             if isinstance(interaction_type, dict):
1376                 interaction_type = interaction_type.get('@type')
1377             return str_or_none(interaction_type)
1378
1379         def extract_interaction_statistic(e):
1380             interaction_statistic = e.get('interactionStatistic')
1381             if isinstance(interaction_statistic, dict):
1382                 interaction_statistic = [interaction_statistic]
1383             if not isinstance(interaction_statistic, list):
1384                 return
1385             for is_e in interaction_statistic:
1386                 if not isinstance(is_e, dict):
1387                     continue
1388                 if is_e.get('@type') != 'InteractionCounter':
1389                     continue
1390                 interaction_type = extract_interaction_type(is_e)
1391                 if not interaction_type:
1392                     continue
1393                 # For interaction count some sites provide string instead of
1394                 # an integer (as per spec) with non digit characters (e.g. ",")
1395                 # so extracting count with more relaxed str_to_int
1396                 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1397                 if interaction_count is None:
1398                     continue
1399                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1400                 if not count_kind:
1401                     continue
1402                 count_key = '%s_count' % count_kind
1403                 if info.get(count_key) is not None:
1404                     continue
1405                 info[count_key] = interaction_count
1406
1407         def extract_video_object(e):
1408             assert e['@type'] == 'VideoObject'
1409             author = e.get('author')
1410             info.update({
1411                 'url': url_or_none(e.get('contentUrl')),
1412                 'title': unescapeHTML(e.get('name')),
1413                 'description': unescapeHTML(e.get('description')),
1414                 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
1415                 'duration': parse_duration(e.get('duration')),
1416                 'timestamp': unified_timestamp(e.get('uploadDate')),
1417                 # author can be an instance of 'Organization' or 'Person' types.
1418                 # both types can have 'name' property(inherited from 'Thing' type). [1]
1419                 # however some websites are using 'Text' type instead.
1420                 # 1. https://schema.org/VideoObject
1421                 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
1422                 'filesize': float_or_none(e.get('contentSize')),
1423                 'tbr': int_or_none(e.get('bitrate')),
1424                 'width': int_or_none(e.get('width')),
1425                 'height': int_or_none(e.get('height')),
1426                 'view_count': int_or_none(e.get('interactionCount')),
1427             })
1428             extract_interaction_statistic(e)
1429
1430         for e in json_ld:
1431             if '@context' in e:
1432                 item_type = e.get('@type')
1433                 if expected_type is not None and expected_type != item_type:
1434                     continue
1435                 if item_type in ('TVEpisode', 'Episode'):
1436                     episode_name = unescapeHTML(e.get('name'))
1437                     info.update({
1438                         'episode': episode_name,
1439                         'episode_number': int_or_none(e.get('episodeNumber')),
1440                         'description': unescapeHTML(e.get('description')),
1441                     })
1442                     if not info.get('title') and episode_name:
1443                         info['title'] = episode_name
1444                     part_of_season = e.get('partOfSeason')
1445                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1446                         info.update({
1447                             'season': unescapeHTML(part_of_season.get('name')),
1448                             'season_number': int_or_none(part_of_season.get('seasonNumber')),
1449                         })
1450                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1451                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1452                         info['series'] = unescapeHTML(part_of_series.get('name'))
1453                 elif item_type == 'Movie':
1454                     info.update({
1455                         'title': unescapeHTML(e.get('name')),
1456                         'description': unescapeHTML(e.get('description')),
1457                         'duration': parse_duration(e.get('duration')),
1458                         'timestamp': unified_timestamp(e.get('dateCreated')),
1459                     })
1460                 elif item_type in ('Article', 'NewsArticle'):
1461                     info.update({
1462                         'timestamp': parse_iso8601(e.get('datePublished')),
1463                         'title': unescapeHTML(e.get('headline')),
1464                         'description': unescapeHTML(e.get('articleBody')),
1465                     })
1466                 elif item_type == 'VideoObject':
1467                     extract_video_object(e)
1468                     if expected_type is None:
1469                         continue
1470                     else:
1471                         break
1472                 video = e.get('video')
1473                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1474                     extract_video_object(video)
1475                 if expected_type is None:
1476                     continue
1477                 else:
1478                     break
1479         return dict((k, v) for k, v in info.items() if v is not None)
1480
1481     @staticmethod
1482     def _hidden_inputs(html):
1483         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1484         hidden_inputs = {}
1485         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1486             attrs = extract_attributes(input)
1487             if not input:
1488                 continue
1489             if attrs.get('type') not in ('hidden', 'submit'):
1490                 continue
1491             name = attrs.get('name') or attrs.get('id')
1492             value = attrs.get('value')
1493             if name and value is not None:
1494                 hidden_inputs[name] = value
1495         return hidden_inputs
1496
1497     def _form_hidden_inputs(self, form_id, html):
1498         form = self._search_regex(
1499             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1500             html, '%s form' % form_id, group='form')
1501         return self._hidden_inputs(form)
1502
1503     class FormatSort:
1504         regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
1505
1506         default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
1507                    'res', 'fps', 'codec:vp9.2', 'size', 'br', 'asr',
1508                    'proto', 'ext', 'hasaud', 'source', 'format_id')  # These must not be aliases
1509         ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
1510                         'height', 'width', 'proto', 'vext', 'abr', 'aext',
1511                         'fps', 'fs_approx', 'source', 'format_id')
1512
1513         settings = {
1514             'vcodec': {'type': 'ordered', 'regex': True,
1515                        'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1516             'acodec': {'type': 'ordered', 'regex': True,
1517                        'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']},
1518             'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1519                       'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', '.*dash', 'ws|websocket', '', 'mms|rtsp', 'none', 'f4']},
1520             'vext': {'type': 'ordered', 'field': 'video_ext',
1521                      'order': ('mp4', 'webm', 'flv', '', 'none'),
1522                      'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1523             'aext': {'type': 'ordered', 'field': 'audio_ext',
1524                      'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1525                      'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1526             'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1527             'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
1528                            'field': ('vcodec', 'acodec'),
1529                            'function': lambda it: int(any(v != 'none' for v in it))},
1530             'ie_pref': {'priority': True, 'type': 'extractor'},
1531             'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1532             'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1533             'lang': {'convert': 'ignore', 'field': 'language_preference'},
1534             'quality': {'convert': 'float_none', 'default': -1},
1535             'filesize': {'convert': 'bytes'},
1536             'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1537             'id': {'convert': 'string', 'field': 'format_id'},
1538             'height': {'convert': 'float_none'},
1539             'width': {'convert': 'float_none'},
1540             'fps': {'convert': 'float_none'},
1541             'tbr': {'convert': 'float_none'},
1542             'vbr': {'convert': 'float_none'},
1543             'abr': {'convert': 'float_none'},
1544             'asr': {'convert': 'float_none'},
1545             'source': {'convert': 'ignore', 'field': 'source_preference'},
1546
1547             'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1548             'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1549             'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1550             'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1551             'res': {'type': 'multiple', 'field': ('height', 'width'),
1552                     'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
1553
1554             # Most of these exist only for compatibility reasons
1555             'dimension': {'type': 'alias', 'field': 'res'},
1556             'resolution': {'type': 'alias', 'field': 'res'},
1557             'extension': {'type': 'alias', 'field': 'ext'},
1558             'bitrate': {'type': 'alias', 'field': 'br'},
1559             'total_bitrate': {'type': 'alias', 'field': 'tbr'},
1560             'video_bitrate': {'type': 'alias', 'field': 'vbr'},
1561             'audio_bitrate': {'type': 'alias', 'field': 'abr'},
1562             'framerate': {'type': 'alias', 'field': 'fps'},
1563             'language_preference': {'type': 'alias', 'field': 'lang'},  # not named as 'language' because such a field exists
1564             'protocol': {'type': 'alias', 'field': 'proto'},
1565             'source_preference': {'type': 'alias', 'field': 'source'},
1566             'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1567             'filesize_estimate': {'type': 'alias', 'field': 'size'},
1568             'samplerate': {'type': 'alias', 'field': 'asr'},
1569             'video_ext': {'type': 'alias', 'field': 'vext'},
1570             'audio_ext': {'type': 'alias', 'field': 'aext'},
1571             'video_codec': {'type': 'alias', 'field': 'vcodec'},
1572             'audio_codec': {'type': 'alias', 'field': 'acodec'},
1573             'video': {'type': 'alias', 'field': 'hasvid'},
1574             'has_video': {'type': 'alias', 'field': 'hasvid'},
1575             'audio': {'type': 'alias', 'field': 'hasaud'},
1576             'has_audio': {'type': 'alias', 'field': 'hasaud'},
1577             'extractor': {'type': 'alias', 'field': 'ie_pref'},
1578             'preference': {'type': 'alias', 'field': 'ie_pref'},
1579             'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
1580             'format_id': {'type': 'alias', 'field': 'id'},
1581         }
1582
1583         _order = []
1584
1585         def _get_field_setting(self, field, key):
1586             if field not in self.settings:
1587                 self.settings[field] = {}
1588             propObj = self.settings[field]
1589             if key not in propObj:
1590                 type = propObj.get('type')
1591                 if key == 'field':
1592                     default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1593                 elif key == 'convert':
1594                     default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1595                 else:
1596                     default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1597                 propObj[key] = default
1598             return propObj[key]
1599
1600         def _resolve_field_value(self, field, value, convertNone=False):
1601             if value is None:
1602                 if not convertNone:
1603                     return None
1604             else:
1605                 value = value.lower()
1606             conversion = self._get_field_setting(field, 'convert')
1607             if conversion == 'ignore':
1608                 return None
1609             if conversion == 'string':
1610                 return value
1611             elif conversion == 'float_none':
1612                 return float_or_none(value)
1613             elif conversion == 'bytes':
1614                 return FileDownloader.parse_bytes(value)
1615             elif conversion == 'order':
1616                 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1617                 use_regex = self._get_field_setting(field, 'regex')
1618                 list_length = len(order_list)
1619                 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1620                 if use_regex and value is not None:
1621                     for i, regex in enumerate(order_list):
1622                         if regex and re.match(regex, value):
1623                             return list_length - i
1624                     return list_length - empty_pos  # not in list
1625                 else:  # not regex or  value = None
1626                     return list_length - (order_list.index(value) if value in order_list else empty_pos)
1627             else:
1628                 if value.isnumeric():
1629                     return float(value)
1630                 else:
1631                     self.settings[field]['convert'] = 'string'
1632                     return value
1633
1634         def evaluate_params(self, params, sort_extractor):
1635             self._use_free_order = params.get('prefer_free_formats', False)
1636             self._sort_user = params.get('format_sort', [])
1637             self._sort_extractor = sort_extractor
1638
1639             def add_item(field, reverse, closest, limit_text):
1640                 field = field.lower()
1641                 if field in self._order:
1642                     return
1643                 self._order.append(field)
1644                 limit = self._resolve_field_value(field, limit_text)
1645                 data = {
1646                     'reverse': reverse,
1647                     'closest': False if limit is None else closest,
1648                     'limit_text': limit_text,
1649                     'limit': limit}
1650                 if field in self.settings:
1651                     self.settings[field].update(data)
1652                 else:
1653                     self.settings[field] = data
1654
1655             sort_list = (
1656                 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1657                 + (tuple() if params.get('format_sort_force', False)
1658                    else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1659                 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1660
1661             for item in sort_list:
1662                 match = re.match(self.regex, item)
1663                 if match is None:
1664                     raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1665                 field = match.group('field')
1666                 if field is None:
1667                     continue
1668                 if self._get_field_setting(field, 'type') == 'alias':
1669                     field = self._get_field_setting(field, 'field')
1670                 reverse = match.group('reverse') is not None
1671                 closest = match.group('separator') == '~'
1672                 limit_text = match.group('limit')
1673
1674                 has_limit = limit_text is not None
1675                 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1676                 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1677
1678                 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1679                 limits = limit_text.split(":") if has_multiple_limits else (limit_text,) if has_limit else tuple()
1680                 limit_count = len(limits)
1681                 for (i, f) in enumerate(fields):
1682                     add_item(f, reverse, closest,
1683                              limits[i] if i < limit_count
1684                              else limits[0] if has_limit and not has_multiple_limits
1685                              else None)
1686
1687         def print_verbose_info(self, write_debug):
1688             if self._sort_user:
1689                 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1690             if self._sort_extractor:
1691                 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1692             write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1693                 '+' if self._get_field_setting(field, 'reverse') else '', field,
1694                 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1695                               self._get_field_setting(field, 'limit_text'),
1696                               self._get_field_setting(field, 'limit'))
1697                 if self._get_field_setting(field, 'limit_text') is not None else '')
1698                 for field in self._order if self._get_field_setting(field, 'visible')]))
1699
1700         def _calculate_field_preference_from_value(self, format, field, type, value):
1701             reverse = self._get_field_setting(field, 'reverse')
1702             closest = self._get_field_setting(field, 'closest')
1703             limit = self._get_field_setting(field, 'limit')
1704
1705             if type == 'extractor':
1706                 maximum = self._get_field_setting(field, 'max')
1707                 if value is None or (maximum is not None and value >= maximum):
1708                     value = -1
1709             elif type == 'boolean':
1710                 in_list = self._get_field_setting(field, 'in_list')
1711                 not_in_list = self._get_field_setting(field, 'not_in_list')
1712                 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1713             elif type == 'ordered':
1714                 value = self._resolve_field_value(field, value, True)
1715
1716             # try to convert to number
1717             val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1718             is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1719             if is_num:
1720                 value = val_num
1721
1722             return ((-10, 0) if value is None
1723                     else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
1724                     else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1725                     else (0, value, 0) if not reverse and (limit is None or value <= limit)
1726                     else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1727                     else (-1, value, 0))
1728
1729         def _calculate_field_preference(self, format, field):
1730             type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
1731             get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1732             if type == 'multiple':
1733                 type = 'field'  # Only 'field' is allowed in multiple for now
1734                 actual_fields = self._get_field_setting(field, 'field')
1735
1736                 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1737             else:
1738                 value = get_value(field)
1739             return self._calculate_field_preference_from_value(format, field, type, value)
1740
1741         def calculate_preference(self, format):
1742             # Determine missing protocol
1743             if not format.get('protocol'):
1744                 format['protocol'] = determine_protocol(format)
1745
1746             # Determine missing ext
1747             if not format.get('ext') and 'url' in format:
1748                 format['ext'] = determine_ext(format['url'])
1749             if format.get('vcodec') == 'none':
1750                 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1751                 format['video_ext'] = 'none'
1752             else:
1753                 format['video_ext'] = format['ext']
1754                 format['audio_ext'] = 'none'
1755             # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
1756             #    format['preference'] = -1000
1757
1758             # Determine missing bitrates
1759             if format.get('tbr') is None:
1760                 if format.get('vbr') is not None and format.get('abr') is not None:
1761                     format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1762             else:
1763                 if format.get('vcodec') != "none" and format.get('vbr') is None:
1764                     format['vbr'] = format.get('tbr') - format.get('abr', 0)
1765                 if format.get('acodec') != "none" and format.get('abr') is None:
1766                     format['abr'] = format.get('tbr') - format.get('vbr', 0)
1767
1768             return tuple(self._calculate_field_preference(format, field) for field in self._order)
1769
1770     def _sort_formats(self, formats, field_preference=[]):
1771         if not formats:
1772             return
1773         format_sort = self.FormatSort()  # params and to_screen are taken from the downloader
1774         format_sort.evaluate_params(self._downloader.params, field_preference)
1775         if self.get_param('verbose', False):
1776             format_sort.print_verbose_info(self._downloader.write_debug)
1777         formats.sort(key=lambda f: format_sort.calculate_preference(f))
1778
1779     def _check_formats(self, formats, video_id):
1780         if formats:
1781             formats[:] = filter(
1782                 lambda f: self._is_valid_url(
1783                     f['url'], video_id,
1784                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1785                 formats)
1786
1787     @staticmethod
1788     def _remove_duplicate_formats(formats):
1789         format_urls = set()
1790         unique_formats = []
1791         for f in formats:
1792             if f['url'] not in format_urls:
1793                 format_urls.add(f['url'])
1794                 unique_formats.append(f)
1795         formats[:] = unique_formats
1796
1797     def _is_valid_url(self, url, video_id, item='video', headers={}):
1798         url = self._proto_relative_url(url, scheme='http:')
1799         # For now assume non HTTP(S) URLs always valid
1800         if not (url.startswith('http://') or url.startswith('https://')):
1801             return True
1802         try:
1803             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1804             return True
1805         except ExtractorError as e:
1806             self.to_screen(
1807                 '%s: %s URL is invalid, skipping: %s'
1808                 % (video_id, item, error_to_compat_str(e.cause)))
1809             return False
1810
1811     def http_scheme(self):
1812         """ Either "http:" or "https:", depending on the user's preferences """
1813         return (
1814             'http:'
1815             if self.get_param('prefer_insecure', False)
1816             else 'https:')
1817
1818     def _proto_relative_url(self, url, scheme=None):
1819         if url is None:
1820             return url
1821         if url.startswith('//'):
1822             if scheme is None:
1823                 scheme = self.http_scheme()
1824             return scheme + url
1825         else:
1826             return url
1827
1828     def _sleep(self, timeout, video_id, msg_template=None):
1829         if msg_template is None:
1830             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1831         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1832         self.to_screen(msg)
1833         time.sleep(timeout)
1834
1835     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1836                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1837                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1838         manifest = self._download_xml(
1839             manifest_url, video_id, 'Downloading f4m manifest',
1840             'Unable to download f4m manifest',
1841             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1842             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1843             transform_source=transform_source,
1844             fatal=fatal, data=data, headers=headers, query=query)
1845
1846         if manifest is False:
1847             return []
1848
1849         return self._parse_f4m_formats(
1850             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1851             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1852
1853     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1854                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1855                            fatal=True, m3u8_id=None):
1856         if not isinstance(manifest, compat_etree_Element) and not fatal:
1857             return []
1858
1859         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1860         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1861         if akamai_pv is not None and ';' in akamai_pv.text:
1862             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1863             if playerVerificationChallenge.strip() != '':
1864                 return []
1865
1866         formats = []
1867         manifest_version = '1.0'
1868         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1869         if not media_nodes:
1870             manifest_version = '2.0'
1871             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1872         # Remove unsupported DRM protected media from final formats
1873         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1874         media_nodes = remove_encrypted_media(media_nodes)
1875         if not media_nodes:
1876             return formats
1877
1878         manifest_base_url = get_base_url(manifest)
1879
1880         bootstrap_info = xpath_element(
1881             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1882             'bootstrap info', default=None)
1883
1884         vcodec = None
1885         mime_type = xpath_text(
1886             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1887             'base URL', default=None)
1888         if mime_type and mime_type.startswith('audio/'):
1889             vcodec = 'none'
1890
1891         for i, media_el in enumerate(media_nodes):
1892             tbr = int_or_none(media_el.attrib.get('bitrate'))
1893             width = int_or_none(media_el.attrib.get('width'))
1894             height = int_or_none(media_el.attrib.get('height'))
1895             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1896             # If <bootstrapInfo> is present, the specified f4m is a
1897             # stream-level manifest, and only set-level manifests may refer to
1898             # external resources.  See section 11.4 and section 4 of F4M spec
1899             if bootstrap_info is None:
1900                 media_url = None
1901                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1902                 if manifest_version == '2.0':
1903                     media_url = media_el.attrib.get('href')
1904                 if media_url is None:
1905                     media_url = media_el.attrib.get('url')
1906                 if not media_url:
1907                     continue
1908                 manifest_url = (
1909                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1910                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1911                 # If media_url is itself a f4m manifest do the recursive extraction
1912                 # since bitrates in parent manifest (this one) and media_url manifest
1913                 # may differ leading to inability to resolve the format by requested
1914                 # bitrate in f4m downloader
1915                 ext = determine_ext(manifest_url)
1916                 if ext == 'f4m':
1917                     f4m_formats = self._extract_f4m_formats(
1918                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1919                         transform_source=transform_source, fatal=fatal)
1920                     # Sometimes stream-level manifest contains single media entry that
1921                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1922                     # At the same time parent's media entry in set-level manifest may
1923                     # contain it. We will copy it from parent in such cases.
1924                     if len(f4m_formats) == 1:
1925                         f = f4m_formats[0]
1926                         f.update({
1927                             'tbr': f.get('tbr') or tbr,
1928                             'width': f.get('width') or width,
1929                             'height': f.get('height') or height,
1930                             'format_id': f.get('format_id') if not tbr else format_id,
1931                             'vcodec': vcodec,
1932                         })
1933                     formats.extend(f4m_formats)
1934                     continue
1935                 elif ext == 'm3u8':
1936                     formats.extend(self._extract_m3u8_formats(
1937                         manifest_url, video_id, 'mp4', preference=preference,
1938                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1939                     continue
1940             formats.append({
1941                 'format_id': format_id,
1942                 'url': manifest_url,
1943                 'manifest_url': manifest_url,
1944                 'ext': 'flv' if bootstrap_info is not None else None,
1945                 'protocol': 'f4m',
1946                 'tbr': tbr,
1947                 'width': width,
1948                 'height': height,
1949                 'vcodec': vcodec,
1950                 'preference': preference,
1951                 'quality': quality,
1952             })
1953         return formats
1954
1955     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1956         return {
1957             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1958             'url': m3u8_url,
1959             'ext': ext,
1960             'protocol': 'm3u8',
1961             'preference': preference - 100 if preference else -100,
1962             'quality': quality,
1963             'resolution': 'multiple',
1964             'format_note': 'Quality selection URL',
1965         }
1966
1967     def _extract_m3u8_formats(self, *args, **kwargs):
1968         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
1969         if subs:
1970             self.report_warning(bug_reports_message(
1971                 "Ignoring subtitle tracks found in the HLS manifest; "
1972                 "if any subtitle tracks are missing,"
1973             ), only_once=True)
1974         return fmts
1975
1976     def _extract_m3u8_formats_and_subtitles(
1977             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
1978             preference=None, quality=None, m3u8_id=None, note=None,
1979             errnote=None, fatal=True, live=False, data=None, headers={},
1980             query={}):
1981
1982         res = self._download_webpage_handle(
1983             m3u8_url, video_id,
1984             note='Downloading m3u8 information' if note is None else note,
1985             errnote='Failed to download m3u8 information' if errnote is None else errnote,
1986             fatal=fatal, data=data, headers=headers, query=query)
1987
1988         if res is False:
1989             return [], {}
1990
1991         m3u8_doc, urlh = res
1992         m3u8_url = urlh.geturl()
1993
1994         return self._parse_m3u8_formats_and_subtitles(
1995             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1996             preference=preference, quality=quality, m3u8_id=m3u8_id,
1997             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
1998             headers=headers, query=query, video_id=video_id)
1999
2000     def _parse_m3u8_formats_and_subtitles(
2001             self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8_native',
2002             preference=None, quality=None, m3u8_id=None, live=False, note=None,
2003             errnote=None, fatal=True, data=None, headers={}, query={},
2004             video_id=None):
2005         formats, subtitles = [], {}
2006
2007         if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
2008             return formats, subtitles
2009
2010         has_drm = re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc)
2011
2012         def format_url(url):
2013             return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
2014
2015         if self.get_param('hls_split_discontinuity', False):
2016             def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2017                 if not m3u8_doc:
2018                     if not manifest_url:
2019                         return []
2020                     m3u8_doc = self._download_webpage(
2021                         manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2022                         note=False, errnote='Failed to download m3u8 playlist information')
2023                     if m3u8_doc is False:
2024                         return []
2025                 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2026
2027         else:
2028             def _extract_m3u8_playlist_indices(*args, **kwargs):
2029                 return [None]
2030
2031         # References:
2032         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2033         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2034         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2035
2036         # We should try extracting formats only from master playlists [1, 4.3.4],
2037         # i.e. playlists that describe available qualities. On the other hand
2038         # media playlists [1, 4.3.3] should be returned as is since they contain
2039         # just the media without qualities renditions.
2040         # Fortunately, master playlist can be easily distinguished from media
2041         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
2042         # master playlist tags MUST NOT appear in a media playlist and vice versa.
2043         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2044         # media playlist and MUST NOT appear in master playlist thus we can
2045         # clearly detect media playlist with this criterion.
2046
2047         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
2048             formats = [{
2049                 'format_id': '-'.join(map(str, filter(None, [m3u8_id, idx]))),
2050                 'format_index': idx,
2051                 'url': m3u8_url,
2052                 'ext': ext,
2053                 'protocol': entry_protocol,
2054                 'preference': preference,
2055                 'quality': quality,
2056                 'has_drm': has_drm,
2057             } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2058
2059             return formats, subtitles
2060
2061         groups = {}
2062         last_stream_inf = {}
2063
2064         def extract_media(x_media_line):
2065             media = parse_m3u8_attributes(x_media_line)
2066             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2067             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2068             if not (media_type and group_id and name):
2069                 return
2070             groups.setdefault(group_id, []).append(media)
2071             # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2072             if media_type == 'SUBTITLES':
2073                 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2074                 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2075                 # However, lack of URI has been spotted in the wild.
2076                 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2077                 if not media.get('URI'):
2078                     return
2079                 url = format_url(media['URI'])
2080                 sub_info = {
2081                     'url': url,
2082                     'ext': determine_ext(url),
2083                 }
2084                 if sub_info['ext'] == 'm3u8':
2085                     # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2086                     # files may contain is WebVTT:
2087                     # <https://tools.ietf.org/html/rfc8216#section-3.1>
2088                     sub_info['ext'] = 'vtt'
2089                     sub_info['protocol'] = 'm3u8_native'
2090                 lang = media.get('LANGUAGE') or 'und'
2091                 subtitles.setdefault(lang, []).append(sub_info)
2092             if media_type not in ('VIDEO', 'AUDIO'):
2093                 return
2094             media_url = media.get('URI')
2095             if media_url:
2096                 manifest_url = format_url(media_url)
2097                 formats.extend({
2098                     'format_id': '-'.join(map(str, filter(None, (m3u8_id, group_id, name, idx)))),
2099                     'format_note': name,
2100                     'format_index': idx,
2101                     'url': manifest_url,
2102                     'manifest_url': m3u8_url,
2103                     'language': media.get('LANGUAGE'),
2104                     'ext': ext,
2105                     'protocol': entry_protocol,
2106                     'preference': preference,
2107                     'quality': quality,
2108                     'vcodec': 'none' if media_type == 'AUDIO' else None,
2109                 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2110
2111         def build_stream_name():
2112             # Despite specification does not mention NAME attribute for
2113             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2114             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2115             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2116             stream_name = last_stream_inf.get('NAME')
2117             if stream_name:
2118                 return stream_name
2119             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2120             # from corresponding rendition group
2121             stream_group_id = last_stream_inf.get('VIDEO')
2122             if not stream_group_id:
2123                 return
2124             stream_group = groups.get(stream_group_id)
2125             if not stream_group:
2126                 return stream_group_id
2127             rendition = stream_group[0]
2128             return rendition.get('NAME') or stream_group_id
2129
2130         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2131         # chance to detect video only formats when EXT-X-STREAM-INF tags
2132         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2133         for line in m3u8_doc.splitlines():
2134             if line.startswith('#EXT-X-MEDIA:'):
2135                 extract_media(line)
2136
2137         for line in m3u8_doc.splitlines():
2138             if line.startswith('#EXT-X-STREAM-INF:'):
2139                 last_stream_inf = parse_m3u8_attributes(line)
2140             elif line.startswith('#') or not line.strip():
2141                 continue
2142             else:
2143                 tbr = float_or_none(
2144                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2145                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2146                 manifest_url = format_url(line.strip())
2147
2148                 for idx in _extract_m3u8_playlist_indices(manifest_url):
2149                     format_id = [m3u8_id, None, idx]
2150                     # Bandwidth of live streams may differ over time thus making
2151                     # format_id unpredictable. So it's better to keep provided
2152                     # format_id intact.
2153                     if not live:
2154                         stream_name = build_stream_name()
2155                         format_id[1] = stream_name if stream_name else '%d' % (tbr if tbr else len(formats))
2156                     f = {
2157                         'format_id': '-'.join(map(str, filter(None, format_id))),
2158                         'format_index': idx,
2159                         'url': manifest_url,
2160                         'manifest_url': m3u8_url,
2161                         'tbr': tbr,
2162                         'ext': ext,
2163                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2164                         'protocol': entry_protocol,
2165                         'preference': preference,
2166                         'quality': quality,
2167                     }
2168                     resolution = last_stream_inf.get('RESOLUTION')
2169                     if resolution:
2170                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2171                         if mobj:
2172                             f['width'] = int(mobj.group('width'))
2173                             f['height'] = int(mobj.group('height'))
2174                     # Unified Streaming Platform
2175                     mobj = re.search(
2176                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2177                     if mobj:
2178                         abr, vbr = mobj.groups()
2179                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2180                         f.update({
2181                             'vbr': vbr,
2182                             'abr': abr,
2183                         })
2184                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2185                     f.update(codecs)
2186                     audio_group_id = last_stream_inf.get('AUDIO')
2187                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2188                     # references a rendition group MUST have a CODECS attribute.
2189                     # However, this is not always respected, for example, [2]
2190                     # contains EXT-X-STREAM-INF tag which references AUDIO
2191                     # rendition group but does not have CODECS and despite
2192                     # referencing an audio group it represents a complete
2193                     # (with audio and video) format. So, for such cases we will
2194                     # ignore references to rendition groups and treat them
2195                     # as complete formats.
2196                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2197                         audio_group = groups.get(audio_group_id)
2198                         if audio_group and audio_group[0].get('URI'):
2199                             # TODO: update acodec for audio only formats with
2200                             # the same GROUP-ID
2201                             f['acodec'] = 'none'
2202                     if not f.get('ext'):
2203                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2204                     formats.append(f)
2205
2206                     # for DailyMotion
2207                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2208                     if progressive_uri:
2209                         http_f = f.copy()
2210                         del http_f['manifest_url']
2211                         http_f.update({
2212                             'format_id': f['format_id'].replace('hls-', 'http-'),
2213                             'protocol': 'http',
2214                             'url': progressive_uri,
2215                         })
2216                         formats.append(http_f)
2217
2218                 last_stream_inf = {}
2219         return formats, subtitles
2220
2221     @staticmethod
2222     def _xpath_ns(path, namespace=None):
2223         if not namespace:
2224             return path
2225         out = []
2226         for c in path.split('/'):
2227             if not c or c == '.':
2228                 out.append(c)
2229             else:
2230                 out.append('{%s}%s' % (namespace, c))
2231         return '/'.join(out)
2232
2233     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2234         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2235
2236         if smil is False:
2237             assert not fatal
2238             return []
2239
2240         namespace = self._parse_smil_namespace(smil)
2241
2242         fmts = self._parse_smil_formats(
2243             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2244         subs = self._parse_smil_subtitles(
2245             smil, namespace=namespace)
2246
2247         return fmts, subs
2248
2249     def _extract_smil_formats(self, *args, **kwargs):
2250         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2251         if subs:
2252             self.report_warning(bug_reports_message(
2253                 "Ignoring subtitle tracks found in the SMIL manifest; "
2254                 "if any subtitle tracks are missing,"
2255             ), only_once=True)
2256         return fmts
2257
2258     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2259         smil = self._download_smil(smil_url, video_id, fatal=fatal)
2260         if smil is False:
2261             return {}
2262         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2263
2264     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2265         return self._download_xml(
2266             smil_url, video_id, 'Downloading SMIL file',
2267             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2268
2269     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2270         namespace = self._parse_smil_namespace(smil)
2271
2272         formats = self._parse_smil_formats(
2273             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2274         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2275
2276         video_id = os.path.splitext(url_basename(smil_url))[0]
2277         title = None
2278         description = None
2279         upload_date = None
2280         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2281             name = meta.attrib.get('name')
2282             content = meta.attrib.get('content')
2283             if not name or not content:
2284                 continue
2285             if not title and name == 'title':
2286                 title = content
2287             elif not description and name in ('description', 'abstract'):
2288                 description = content
2289             elif not upload_date and name == 'date':
2290                 upload_date = unified_strdate(content)
2291
2292         thumbnails = [{
2293             'id': image.get('type'),
2294             'url': image.get('src'),
2295             'width': int_or_none(image.get('width')),
2296             'height': int_or_none(image.get('height')),
2297         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2298
2299         return {
2300             'id': video_id,
2301             'title': title or video_id,
2302             'description': description,
2303             'upload_date': upload_date,
2304             'thumbnails': thumbnails,
2305             'formats': formats,
2306             'subtitles': subtitles,
2307         }
2308
2309     def _parse_smil_namespace(self, smil):
2310         return self._search_regex(
2311             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2312
2313     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2314         base = smil_url
2315         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2316             b = meta.get('base') or meta.get('httpBase')
2317             if b:
2318                 base = b
2319                 break
2320
2321         formats = []
2322         rtmp_count = 0
2323         http_count = 0
2324         m3u8_count = 0
2325
2326         srcs = []
2327         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2328         for medium in media:
2329             src = medium.get('src')
2330             if not src or src in srcs:
2331                 continue
2332             srcs.append(src)
2333
2334             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2335             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2336             width = int_or_none(medium.get('width'))
2337             height = int_or_none(medium.get('height'))
2338             proto = medium.get('proto')
2339             ext = medium.get('ext')
2340             src_ext = determine_ext(src)
2341             streamer = medium.get('streamer') or base
2342
2343             if proto == 'rtmp' or streamer.startswith('rtmp'):
2344                 rtmp_count += 1
2345                 formats.append({
2346                     'url': streamer,
2347                     'play_path': src,
2348                     'ext': 'flv',
2349                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2350                     'tbr': bitrate,
2351                     'filesize': filesize,
2352                     'width': width,
2353                     'height': height,
2354                 })
2355                 if transform_rtmp_url:
2356                     streamer, src = transform_rtmp_url(streamer, src)
2357                     formats[-1].update({
2358                         'url': streamer,
2359                         'play_path': src,
2360                     })
2361                 continue
2362
2363             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2364             src_url = src_url.strip()
2365
2366             if proto == 'm3u8' or src_ext == 'm3u8':
2367                 m3u8_formats = self._extract_m3u8_formats(
2368                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2369                 if len(m3u8_formats) == 1:
2370                     m3u8_count += 1
2371                     m3u8_formats[0].update({
2372                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2373                         'tbr': bitrate,
2374                         'width': width,
2375                         'height': height,
2376                     })
2377                 formats.extend(m3u8_formats)
2378             elif src_ext == 'f4m':
2379                 f4m_url = src_url
2380                 if not f4m_params:
2381                     f4m_params = {
2382                         'hdcore': '3.2.0',
2383                         'plugin': 'flowplayer-3.2.0.1',
2384                     }
2385                 f4m_url += '&' if '?' in f4m_url else '?'
2386                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2387                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2388             elif src_ext == 'mpd':
2389                 formats.extend(self._extract_mpd_formats(
2390                     src_url, video_id, mpd_id='dash', fatal=False))
2391             elif re.search(r'\.ism/[Mm]anifest', src_url):
2392                 formats.extend(self._extract_ism_formats(
2393                     src_url, video_id, ism_id='mss', fatal=False))
2394             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2395                 http_count += 1
2396                 formats.append({
2397                     'url': src_url,
2398                     'ext': ext or src_ext or 'flv',
2399                     'format_id': 'http-%d' % (bitrate or http_count),
2400                     'tbr': bitrate,
2401                     'filesize': filesize,
2402                     'width': width,
2403                     'height': height,
2404                 })
2405
2406         return formats
2407
2408     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2409         urls = []
2410         subtitles = {}
2411         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2412             src = textstream.get('src')
2413             if not src or src in urls:
2414                 continue
2415             urls.append(src)
2416             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2417             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2418             subtitles.setdefault(lang, []).append({
2419                 'url': src,
2420                 'ext': ext,
2421             })
2422         return subtitles
2423
2424     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2425         xspf = self._download_xml(
2426             xspf_url, playlist_id, 'Downloading xpsf playlist',
2427             'Unable to download xspf manifest', fatal=fatal)
2428         if xspf is False:
2429             return []
2430         return self._parse_xspf(
2431             xspf, playlist_id, xspf_url=xspf_url,
2432             xspf_base_url=base_url(xspf_url))
2433
2434     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2435         NS_MAP = {
2436             'xspf': 'http://xspf.org/ns/0/',
2437             's1': 'http://static.streamone.nl/player/ns/0',
2438         }
2439
2440         entries = []
2441         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2442             title = xpath_text(
2443                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2444             description = xpath_text(
2445                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2446             thumbnail = xpath_text(
2447                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2448             duration = float_or_none(
2449                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2450
2451             formats = []
2452             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2453                 format_url = urljoin(xspf_base_url, location.text)
2454                 if not format_url:
2455                     continue
2456                 formats.append({
2457                     'url': format_url,
2458                     'manifest_url': xspf_url,
2459                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2460                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2461                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2462                 })
2463             self._sort_formats(formats)
2464
2465             entries.append({
2466                 'id': playlist_id,
2467                 'title': title,
2468                 'description': description,
2469                 'thumbnail': thumbnail,
2470                 'duration': duration,
2471                 'formats': formats,
2472             })
2473         return entries
2474
2475     def _extract_mpd_formats(self, *args, **kwargs):
2476         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2477         if subs:
2478             self.report_warning(bug_reports_message(
2479                 "Ignoring subtitle tracks found in the DASH manifest; "
2480                 "if any subtitle tracks are missing,"
2481             ), only_once=True)
2482         return fmts
2483
2484     def _extract_mpd_formats_and_subtitles(
2485             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2486             fatal=True, data=None, headers={}, query={}):
2487         res = self._download_xml_handle(
2488             mpd_url, video_id,
2489             note='Downloading MPD manifest' if note is None else note,
2490             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2491             fatal=fatal, data=data, headers=headers, query=query)
2492         if res is False:
2493             return [], {}
2494         mpd_doc, urlh = res
2495         if mpd_doc is None:
2496             return [], {}
2497         mpd_base_url = base_url(urlh.geturl())
2498
2499         return self._parse_mpd_formats_and_subtitles(
2500             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2501
2502     def _parse_mpd_formats(self, *args, **kwargs):
2503         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2504         if subs:
2505             self.report_warning(bug_reports_message(
2506                 "Ignoring subtitle tracks found in the DASH manifest; "
2507                 "if any subtitle tracks are missing,"
2508             ), only_once=True)
2509         return fmts
2510
2511     def _parse_mpd_formats_and_subtitles(
2512             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2513         """
2514         Parse formats from MPD manifest.
2515         References:
2516          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2517             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2518          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2519         """
2520         if not self.get_param('dynamic_mpd', True):
2521             if mpd_doc.get('type') == 'dynamic':
2522                 return [], {}
2523
2524         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2525
2526         def _add_ns(path):
2527             return self._xpath_ns(path, namespace)
2528
2529         def is_drm_protected(element):
2530             return element.find(_add_ns('ContentProtection')) is not None
2531
2532         def extract_multisegment_info(element, ms_parent_info):
2533             ms_info = ms_parent_info.copy()
2534
2535             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2536             # common attributes and elements.  We will only extract relevant
2537             # for us.
2538             def extract_common(source):
2539                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2540                 if segment_timeline is not None:
2541                     s_e = segment_timeline.findall(_add_ns('S'))
2542                     if s_e:
2543                         ms_info['total_number'] = 0
2544                         ms_info['s'] = []
2545                         for s in s_e:
2546                             r = int(s.get('r', 0))
2547                             ms_info['total_number'] += 1 + r
2548                             ms_info['s'].append({
2549                                 't': int(s.get('t', 0)),
2550                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2551                                 'd': int(s.attrib['d']),
2552                                 'r': r,
2553                             })
2554                 start_number = source.get('startNumber')
2555                 if start_number:
2556                     ms_info['start_number'] = int(start_number)
2557                 timescale = source.get('timescale')
2558                 if timescale:
2559                     ms_info['timescale'] = int(timescale)
2560                 segment_duration = source.get('duration')
2561                 if segment_duration:
2562                     ms_info['segment_duration'] = float(segment_duration)
2563
2564             def extract_Initialization(source):
2565                 initialization = source.find(_add_ns('Initialization'))
2566                 if initialization is not None:
2567                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2568
2569             segment_list = element.find(_add_ns('SegmentList'))
2570             if segment_list is not None:
2571                 extract_common(segment_list)
2572                 extract_Initialization(segment_list)
2573                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2574                 if segment_urls_e:
2575                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2576             else:
2577                 segment_template = element.find(_add_ns('SegmentTemplate'))
2578                 if segment_template is not None:
2579                     extract_common(segment_template)
2580                     media = segment_template.get('media')
2581                     if media:
2582                         ms_info['media'] = media
2583                     initialization = segment_template.get('initialization')
2584                     if initialization:
2585                         ms_info['initialization'] = initialization
2586                     else:
2587                         extract_Initialization(segment_template)
2588             return ms_info
2589
2590         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2591         formats, subtitles = [], {}
2592         stream_numbers = {'audio': 0, 'video': 0}
2593         for period in mpd_doc.findall(_add_ns('Period')):
2594             period_duration = parse_duration(period.get('duration')) or mpd_duration
2595             period_ms_info = extract_multisegment_info(period, {
2596                 'start_number': 1,
2597                 'timescale': 1,
2598             })
2599             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2600                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2601                 for representation in adaptation_set.findall(_add_ns('Representation')):
2602                     representation_attrib = adaptation_set.attrib.copy()
2603                     representation_attrib.update(representation.attrib)
2604                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2605                     mime_type = representation_attrib['mimeType']
2606                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2607
2608                     codecs = representation_attrib.get('codecs', '')
2609                     if content_type not in ('video', 'audio', 'text'):
2610                         if mime_type == 'image/jpeg':
2611                             content_type = mime_type
2612                         elif codecs.split('.')[0] == 'stpp':
2613                             content_type = 'text'
2614                         else:
2615                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2616                             continue
2617
2618                     base_url = ''
2619                     for element in (representation, adaptation_set, period, mpd_doc):
2620                         base_url_e = element.find(_add_ns('BaseURL'))
2621                         if base_url_e is not None:
2622                             base_url = base_url_e.text + base_url
2623                             if re.match(r'^https?://', base_url):
2624                                 break
2625                     if mpd_base_url and base_url.startswith('/'):
2626                         base_url = compat_urlparse.urljoin(mpd_base_url, base_url)
2627                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2628                         if not mpd_base_url.endswith('/'):
2629                             mpd_base_url += '/'
2630                         base_url = mpd_base_url + base_url
2631                     representation_id = representation_attrib.get('id')
2632                     lang = representation_attrib.get('lang')
2633                     url_el = representation.find(_add_ns('BaseURL'))
2634                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2635                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2636                     if representation_id is not None:
2637                         format_id = representation_id
2638                     else:
2639                         format_id = content_type
2640                     if mpd_id:
2641                         format_id = mpd_id + '-' + format_id
2642                     if content_type in ('video', 'audio'):
2643                         f = {
2644                             'format_id': format_id,
2645                             'manifest_url': mpd_url,
2646                             'ext': mimetype2ext(mime_type),
2647                             'width': int_or_none(representation_attrib.get('width')),
2648                             'height': int_or_none(representation_attrib.get('height')),
2649                             'tbr': float_or_none(bandwidth, 1000),
2650                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2651                             'fps': int_or_none(representation_attrib.get('frameRate')),
2652                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2653                             'format_note': 'DASH %s' % content_type,
2654                             'filesize': filesize,
2655                             'container': mimetype2ext(mime_type) + '_dash',
2656                             'manifest_stream_number': stream_numbers[content_type]
2657                         }
2658                         f.update(parse_codecs(codecs))
2659                         stream_numbers[content_type] += 1
2660                     elif content_type == 'text':
2661                         f = {
2662                             'ext': mimetype2ext(mime_type),
2663                             'manifest_url': mpd_url,
2664                             'filesize': filesize,
2665                         }
2666                     elif content_type == 'image/jpeg':
2667                         # See test case in VikiIE
2668                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2669                         f = {
2670                             'format_id': format_id,
2671                             'ext': 'mhtml',
2672                             'manifest_url': mpd_url,
2673                             'format_note': 'DASH storyboards (jpeg)',
2674                             'acodec': 'none',
2675                             'vcodec': 'none',
2676                         }
2677                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2678                         f['has_drm'] = True
2679                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2680
2681                     def prepare_template(template_name, identifiers):
2682                         tmpl = representation_ms_info[template_name]
2683                         # First of, % characters outside $...$ templates
2684                         # must be escaped by doubling for proper processing
2685                         # by % operator string formatting used further (see
2686                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2687                         t = ''
2688                         in_template = False
2689                         for c in tmpl:
2690                             t += c
2691                             if c == '$':
2692                                 in_template = not in_template
2693                             elif c == '%' and not in_template:
2694                                 t += c
2695                         # Next, $...$ templates are translated to their
2696                         # %(...) counterparts to be used with % operator
2697                         if representation_id is not None:
2698                             t = t.replace('$RepresentationID$', representation_id)
2699                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2700                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2701                         t.replace('$$', '$')
2702                         return t
2703
2704                     # @initialization is a regular template like @media one
2705                     # so it should be handled just the same way (see
2706                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2707                     if 'initialization' in representation_ms_info:
2708                         initialization_template = prepare_template(
2709                             'initialization',
2710                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2711                             # $Time$ shall not be included for @initialization thus
2712                             # only $Bandwidth$ remains
2713                             ('Bandwidth', ))
2714                         representation_ms_info['initialization_url'] = initialization_template % {
2715                             'Bandwidth': bandwidth,
2716                         }
2717
2718                     def location_key(location):
2719                         return 'url' if re.match(r'^https?://', location) else 'path'
2720
2721                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2722
2723                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2724                         media_location_key = location_key(media_template)
2725
2726                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2727                         # can't be used at the same time
2728                         if '%(Number' in media_template and 's' not in representation_ms_info:
2729                             segment_duration = None
2730                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2731                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2732                                 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2733                             representation_ms_info['fragments'] = [{
2734                                 media_location_key: media_template % {
2735                                     'Number': segment_number,
2736                                     'Bandwidth': bandwidth,
2737                                 },
2738                                 'duration': segment_duration,
2739                             } for segment_number in range(
2740                                 representation_ms_info['start_number'],
2741                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2742                         else:
2743                             # $Number*$ or $Time$ in media template with S list available
2744                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2745                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2746                             representation_ms_info['fragments'] = []
2747                             segment_time = 0
2748                             segment_d = None
2749                             segment_number = representation_ms_info['start_number']
2750
2751                             def add_segment_url():
2752                                 segment_url = media_template % {
2753                                     'Time': segment_time,
2754                                     'Bandwidth': bandwidth,
2755                                     'Number': segment_number,
2756                                 }
2757                                 representation_ms_info['fragments'].append({
2758                                     media_location_key: segment_url,
2759                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2760                                 })
2761
2762                             for num, s in enumerate(representation_ms_info['s']):
2763                                 segment_time = s.get('t') or segment_time
2764                                 segment_d = s['d']
2765                                 add_segment_url()
2766                                 segment_number += 1
2767                                 for r in range(s.get('r', 0)):
2768                                     segment_time += segment_d
2769                                     add_segment_url()
2770                                     segment_number += 1
2771                                 segment_time += segment_d
2772                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2773                         # No media template
2774                         # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2775                         # or any YouTube dashsegments video
2776                         fragments = []
2777                         segment_index = 0
2778                         timescale = representation_ms_info['timescale']
2779                         for s in representation_ms_info['s']:
2780                             duration = float_or_none(s['d'], timescale)
2781                             for r in range(s.get('r', 0) + 1):
2782                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
2783                                 fragments.append({
2784                                     location_key(segment_uri): segment_uri,
2785                                     'duration': duration,
2786                                 })
2787                                 segment_index += 1
2788                         representation_ms_info['fragments'] = fragments
2789                     elif 'segment_urls' in representation_ms_info:
2790                         # Segment URLs with no SegmentTimeline
2791                         # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2792                         # https://github.com/ytdl-org/youtube-dl/pull/14844
2793                         fragments = []
2794                         segment_duration = float_or_none(
2795                             representation_ms_info['segment_duration'],
2796                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2797                         for segment_url in representation_ms_info['segment_urls']:
2798                             fragment = {
2799                                 location_key(segment_url): segment_url,
2800                             }
2801                             if segment_duration:
2802                                 fragment['duration'] = segment_duration
2803                             fragments.append(fragment)
2804                         representation_ms_info['fragments'] = fragments
2805                     # If there is a fragments key available then we correctly recognized fragmented media.
2806                     # Otherwise we will assume unfragmented media with direct access. Technically, such
2807                     # assumption is not necessarily correct since we may simply have no support for
2808                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2809                     if 'fragments' in representation_ms_info:
2810                         f.update({
2811                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
2812                             'url': mpd_url or base_url,
2813                             'fragment_base_url': base_url,
2814                             'fragments': [],
2815                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2816                         })
2817                         if 'initialization_url' in representation_ms_info:
2818                             initialization_url = representation_ms_info['initialization_url']
2819                             if not f.get('url'):
2820                                 f['url'] = initialization_url
2821                             f['fragments'].append({location_key(initialization_url): initialization_url})
2822                         f['fragments'].extend(representation_ms_info['fragments'])
2823                     else:
2824                         # Assuming direct URL to unfragmented media.
2825                         f['url'] = base_url
2826                     if content_type in ('video', 'audio') or mime_type == 'image/jpeg':
2827                         formats.append(f)
2828                     elif content_type == 'text':
2829                         subtitles.setdefault(lang or 'und', []).append(f)
2830
2831         return formats, subtitles
2832
2833     def _extract_ism_formats(self, *args, **kwargs):
2834         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2835         if subs:
2836             self.report_warning(bug_reports_message(
2837                 "Ignoring subtitle tracks found in the ISM manifest; "
2838                 "if any subtitle tracks are missing,"
2839             ))
2840         return fmts
2841
2842     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2843         res = self._download_xml_handle(
2844             ism_url, video_id,
2845             note='Downloading ISM manifest' if note is None else note,
2846             errnote='Failed to download ISM manifest' if errnote is None else errnote,
2847             fatal=fatal, data=data, headers=headers, query=query)
2848         if res is False:
2849             return [], {}
2850         ism_doc, urlh = res
2851         if ism_doc is None:
2852             return [], {}
2853
2854         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
2855
2856     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
2857         """
2858         Parse formats from ISM manifest.
2859         References:
2860          1. [MS-SSTR]: Smooth Streaming Protocol,
2861             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2862         """
2863         if ism_doc.get('IsLive') == 'TRUE':
2864             return [], {}
2865
2866         duration = int(ism_doc.attrib['Duration'])
2867         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2868
2869         formats = []
2870         subtitles = {}
2871         for stream in ism_doc.findall('StreamIndex'):
2872             stream_type = stream.get('Type')
2873             if stream_type not in ('video', 'audio', 'text'):
2874                 continue
2875             url_pattern = stream.attrib['Url']
2876             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2877             stream_name = stream.get('Name')
2878             stream_language = stream.get('Language', 'und')
2879             for track in stream.findall('QualityLevel'):
2880                 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
2881                 # TODO: add support for WVC1 and WMAP
2882                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
2883                     self.report_warning('%s is not a supported codec' % fourcc)
2884                     continue
2885                 tbr = int(track.attrib['Bitrate']) // 1000
2886                 # [1] does not mention Width and Height attributes. However,
2887                 # they're often present while MaxWidth and MaxHeight are
2888                 # missing, so should be used as fallbacks
2889                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2890                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2891                 sampling_rate = int_or_none(track.get('SamplingRate'))
2892
2893                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2894                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2895
2896                 fragments = []
2897                 fragment_ctx = {
2898                     'time': 0,
2899                 }
2900                 stream_fragments = stream.findall('c')
2901                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2902                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2903                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2904                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2905                     if not fragment_ctx['duration']:
2906                         try:
2907                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2908                         except IndexError:
2909                             next_fragment_time = duration
2910                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2911                     for _ in range(fragment_repeat):
2912                         fragments.append({
2913                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2914                             'duration': fragment_ctx['duration'] / stream_timescale,
2915                         })
2916                         fragment_ctx['time'] += fragment_ctx['duration']
2917
2918                 format_id = []
2919                 if ism_id:
2920                     format_id.append(ism_id)
2921                 if stream_name:
2922                     format_id.append(stream_name)
2923                 format_id.append(compat_str(tbr))
2924
2925                 if stream_type == 'text':
2926                     subtitles.setdefault(stream_language, []).append({
2927                         'ext': 'ismt',
2928                         'protocol': 'ism',
2929                         'url': ism_url,
2930                         'manifest_url': ism_url,
2931                         'fragments': fragments,
2932                         '_download_params': {
2933                             'stream_type': stream_type,
2934                             'duration': duration,
2935                             'timescale': stream_timescale,
2936                             'fourcc': fourcc,
2937                             'language': stream_language,
2938                             'codec_private_data': track.get('CodecPrivateData'),
2939                         }
2940                     })
2941                 elif stream_type in ('video', 'audio'):
2942                     formats.append({
2943                         'format_id': '-'.join(format_id),
2944                         'url': ism_url,
2945                         'manifest_url': ism_url,
2946                         'ext': 'ismv' if stream_type == 'video' else 'isma',
2947                         'width': width,
2948                         'height': height,
2949                         'tbr': tbr,
2950                         'asr': sampling_rate,
2951                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
2952                         'acodec': 'none' if stream_type == 'video' else fourcc,
2953                         'protocol': 'ism',
2954                         'fragments': fragments,
2955                         'has_drm': ism_doc.find('Protection') is not None,
2956                         '_download_params': {
2957                             'stream_type': stream_type,
2958                             'duration': duration,
2959                             'timescale': stream_timescale,
2960                             'width': width or 0,
2961                             'height': height or 0,
2962                             'fourcc': fourcc,
2963                             'language': stream_language,
2964                             'codec_private_data': track.get('CodecPrivateData'),
2965                             'sampling_rate': sampling_rate,
2966                             'channels': int_or_none(track.get('Channels', 2)),
2967                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2968                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2969                         },
2970                     })
2971         return formats, subtitles
2972
2973     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
2974         def absolute_url(item_url):
2975             return urljoin(base_url, item_url)
2976
2977         def parse_content_type(content_type):
2978             if not content_type:
2979                 return {}
2980             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2981             if ctr:
2982                 mimetype, codecs = ctr.groups()
2983                 f = parse_codecs(codecs)
2984                 f['ext'] = mimetype2ext(mimetype)
2985                 return f
2986             return {}
2987
2988         def _media_formats(src, cur_media_type, type_info={}):
2989             full_url = absolute_url(src)
2990             ext = type_info.get('ext') or determine_ext(full_url)
2991             if ext == 'm3u8':
2992                 is_plain_url = False
2993                 formats = self._extract_m3u8_formats(
2994                     full_url, video_id, ext='mp4',
2995                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2996                     preference=preference, quality=quality, fatal=False)
2997             elif ext == 'mpd':
2998                 is_plain_url = False
2999                 formats = self._extract_mpd_formats(
3000                     full_url, video_id, mpd_id=mpd_id, fatal=False)
3001             else:
3002                 is_plain_url = True
3003                 formats = [{
3004                     'url': full_url,
3005                     'vcodec': 'none' if cur_media_type == 'audio' else None,
3006                 }]
3007             return is_plain_url, formats
3008
3009         entries = []
3010         # amp-video and amp-audio are very similar to their HTML5 counterparts
3011         # so we wll include them right here (see
3012         # https://www.ampproject.org/docs/reference/components/amp-video)
3013         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3014         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3015         media_tags = [(media_tag, media_tag_name, media_type, '')
3016                       for media_tag, media_tag_name, media_type
3017                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3018         media_tags.extend(re.findall(
3019             # We only allow video|audio followed by a whitespace or '>'.
3020             # Allowing more characters may end up in significant slow down (see
3021             # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
3022             # http://www.porntrex.com/maps/videositemap.xml).
3023             r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3024         for media_tag, _, media_type, media_content in media_tags:
3025             media_info = {
3026                 'formats': [],
3027                 'subtitles': {},
3028             }
3029             media_attributes = extract_attributes(media_tag)
3030             src = strip_or_none(media_attributes.get('src'))
3031             if src:
3032                 _, formats = _media_formats(src, media_type)
3033                 media_info['formats'].extend(formats)
3034             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3035             if media_content:
3036                 for source_tag in re.findall(r'<source[^>]+>', media_content):
3037                     s_attr = extract_attributes(source_tag)
3038                     # data-video-src and data-src are non standard but seen
3039                     # several times in the wild
3040                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
3041                     if not src:
3042                         continue
3043                     f = parse_content_type(s_attr.get('type'))
3044                     is_plain_url, formats = _media_formats(src, media_type, f)
3045                     if is_plain_url:
3046                         # width, height, res, label and title attributes are
3047                         # all not standard but seen several times in the wild
3048                         labels = [
3049                             s_attr.get(lbl)
3050                             for lbl in ('label', 'title')
3051                             if str_or_none(s_attr.get(lbl))
3052                         ]
3053                         width = int_or_none(s_attr.get('width'))
3054                         height = (int_or_none(s_attr.get('height'))
3055                                   or int_or_none(s_attr.get('res')))
3056                         if not width or not height:
3057                             for lbl in labels:
3058                                 resolution = parse_resolution(lbl)
3059                                 if not resolution:
3060                                     continue
3061                                 width = width or resolution.get('width')
3062                                 height = height or resolution.get('height')
3063                         for lbl in labels:
3064                             tbr = parse_bitrate(lbl)
3065                             if tbr:
3066                                 break
3067                         else:
3068                             tbr = None
3069                         f.update({
3070                             'width': width,
3071                             'height': height,
3072                             'tbr': tbr,
3073                             'format_id': s_attr.get('label') or s_attr.get('title'),
3074                         })
3075                         f.update(formats[0])
3076                         media_info['formats'].append(f)
3077                     else:
3078                         media_info['formats'].extend(formats)
3079                 for track_tag in re.findall(r'<track[^>]+>', media_content):
3080                     track_attributes = extract_attributes(track_tag)
3081                     kind = track_attributes.get('kind')
3082                     if not kind or kind in ('subtitles', 'captions'):
3083                         src = strip_or_none(track_attributes.get('src'))
3084                         if not src:
3085                             continue
3086                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3087                         media_info['subtitles'].setdefault(lang, []).append({
3088                             'url': absolute_url(src),
3089                         })
3090             for f in media_info['formats']:
3091                 f.setdefault('http_headers', {})['Referer'] = base_url
3092             if media_info['formats'] or media_info['subtitles']:
3093                 entries.append(media_info)
3094         return entries
3095
3096     def _extract_akamai_formats(self, *args, **kwargs):
3097         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3098         if subs:
3099             self.report_warning(bug_reports_message(
3100                 "Ignoring subtitle tracks found in the manifests; "
3101                 "if any subtitle tracks are missing,"
3102             ))
3103         return fmts
3104
    def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
        """Build HDS, HLS and (optionally) plain HTTP formats from an Akamai manifest URL.

        @param manifest_url  URL of either the HDS ("/z/.../manifest.f4m") or
                             the HLS ("/i/.../master.m3u8") manifest; the URL
                             of the other variant is derived from it
        @param video_id      Video id passed on to the manifest extractors
        @param hosts         Optional dict with the keys 'hds', 'hls' and
                             'http', each giving a host name to substitute
                             into the respective URLs (only read here)
        @returns             A (formats, subtitles) tuple; subtitles come
                             from the HLS manifest only
        """
        # URLs containing "hdnea=" are treated as signed and are not rewritten
        signed = 'hdnea=' in manifest_url
        if not signed:
            # Strip the b=, __a__/attributes=off and __b__= query parameters
            # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
            manifest_url = re.sub(
                r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
                '', manifest_url).strip('?')

        formats = []
        subtitles = {}

        # HDS: switch the path to the /z/ + manifest.f4m form and make sure
        # an hdcore parameter is present
        hdcore_sign = 'hdcore=3.7.0'
        f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
        hds_host = hosts.get('hds')
        if hds_host:
            f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
        if 'hdcore=' not in f4m_url:
            f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
        f4m_formats = self._extract_f4m_formats(
            f4m_url, video_id, f4m_id='hds', fatal=False)
        for entry in f4m_formats:
            # Recorded on each HDS format under 'extra_param_to_segment_url'
            entry.update({'extra_param_to_segment_url': hdcore_sign})
        formats.extend(f4m_formats)

        # HLS: switch the path to the /i/ + master.m3u8 form
        m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
        hls_host = hosts.get('hls')
        if hls_host:
            m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
        m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
            m3u8_url, video_id, 'mp4', 'm3u8_native',
            m3u8_id='hls', fatal=False)
        formats.extend(m3u8_formats)
        subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)

        # Plain HTTP(S): only attempted for unsigned URLs when an 'http' host
        # was supplied and the HLS manifest yielded formats
        http_host = hosts.get('http')
        if http_host and m3u8_formats and not signed:
            # Group 2 is the comma-separated quality list of the .csmil URL
            REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
            # NOTE(review): assumes m3u8_url matches REPL_REGEX; re.match()
            # returning None would raise AttributeError here
            qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
            qualities_length = len(qualities)
            # Qualities are paired with formats by position, so only proceed
            # when the counts line up (one extra HLS format is tolerated)
            if len(m3u8_formats) in (qualities_length, qualities_length + 1):
                i = 0
                for f in m3u8_formats:
                    # Formats with vcodec 'none' do not consume a quality slot
                    if f['vcodec'] != 'none':
                        for protocol in ('http', 'https'):
                            http_f = f.copy()
                            del http_f['manifest_url']
                            http_url = re.sub(
                                REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
                            http_f.update({
                                'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
                                'url': http_url,
                                'protocol': protocol,
                            })
                            formats.append(http_f)
                        i += 1

        return formats, subtitles
3162
3163     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3164         query = compat_urlparse.urlparse(url).query
3165         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3166         mobj = re.search(
3167             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3168         url_base = mobj.group('url')
3169         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3170         formats = []
3171
3172         def manifest_url(manifest):
3173             m_url = '%s/%s' % (http_base_url, manifest)
3174             if query:
3175                 m_url += '?%s' % query
3176             return m_url
3177
3178         if 'm3u8' not in skip_protocols:
3179             formats.extend(self._extract_m3u8_formats(
3180                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3181                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3182         if 'f4m' not in skip_protocols:
3183             formats.extend(self._extract_f4m_formats(
3184                 manifest_url('manifest.f4m'),
3185                 video_id, f4m_id='hds', fatal=False))
3186         if 'dash' not in skip_protocols:
3187             formats.extend(self._extract_mpd_formats(
3188                 manifest_url('manifest.mpd'),
3189                 video_id, mpd_id='dash', fatal=False))
3190         if re.search(r'(?:/smil:|\.smil)', url_base):
3191             if 'smil' not in skip_protocols:
3192                 rtmp_formats = self._extract_smil_formats(
3193                     manifest_url('jwplayer.smil'),
3194                     video_id, fatal=False)
3195                 for rtmp_format in rtmp_formats:
3196                     rtsp_format = rtmp_format.copy()
3197                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3198                     del rtsp_format['play_path']
3199                     del rtsp_format['ext']
3200                     rtsp_format.update({
3201                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3202                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3203                         'protocol': 'rtsp',
3204                     })
3205                     formats.extend([rtmp_format, rtsp_format])
3206         else:
3207             for protocol in ('rtmp', 'rtsp'):
3208                 if protocol not in skip_protocols:
3209                     formats.append({
3210                         'url': '%s:%s' % (protocol, url_base),
3211                         'format_id': protocol,
3212                         'protocol': protocol,
3213                     })
3214         return formats
3215
3216     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3217         mobj = re.search(
3218             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3219             webpage)
3220         if mobj:
3221             try:
3222                 jwplayer_data = self._parse_json(mobj.group('options'),
3223                                                  video_id=video_id,
3224                                                  transform_source=transform_source)
3225             except ExtractorError:
3226                 pass
3227             else:
3228                 if isinstance(jwplayer_data, dict):
3229                     return jwplayer_data
3230
3231     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3232         jwplayer_data = self._find_jwplayer_data(
3233             webpage, video_id, transform_source=js_to_json)
3234         return self._parse_jwplayer_data(
3235             jwplayer_data, video_id, *args, **kwargs)
3236
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Convert JWPlayer setup options into an info dict or a playlist.

        @param jwplayer_data  Dict of JWPlayer options. It may contain a
                              'playlist' (a list or a single item) or be a
                              single flattened item itself. Note that the
                              dict and its items may be modified in place
                              while being normalised.
        @param video_id       Id used for every entry; when falsy each item's
                              'mediaid' is used instead (KeyError if absent)
        @param require_title  When true, an item without 'title' raises
                              KeyError; otherwise the title may be None
        @param m3u8_id, mpd_id, rtmp_params
                              Forwarded to _parse_jwplayer_formats()
        @param base_url       Base for resolving source, track and thumbnail
                              URLs
        @returns              The single entry when exactly one item was
                              found, otherwise self.playlist_result(entries)
        """
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}

        entries = []

        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]

        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                # The item itself then acts as its only source
                video_data['sources'] = [video_data]

            this_video_id = video_id or video_data['mediaid']

            formats = self._parse_jwplayer_formats(
                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

            # Only tracks of kind "captions"/"subtitles" with a usable file
            # URL are kept; they are grouped by label ('en' when unlabelled)
            subtitles = {}
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    if not isinstance(track, dict):
                        continue
                    track_kind = track.get('kind')
                    if not track_kind or not isinstance(track_kind, compat_str):
                        continue
                    if track_kind.lower() not in ('captions', 'subtitles'):
                        continue
                    track_url = urljoin(base_url, track.get('file'))
                    if not track_url:
                        continue
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)
                    })

            entry = {
                'id': this_video_id,
                'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
                'description': clean_html(video_data.get('description')),
                'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
                'timestamp': int_or_none(video_data.get('pubdate')),
                # A top-level duration takes precedence over the item's own
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
            }
            # A lone YouTube URL is delegated to another extractor instead of
            # being treated as a direct format
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
            if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
                entry.update({
                    '_type': 'url_transparent',
                    'url': formats[0]['url'],
                })
            else:
                self._sort_formats(formats)
                entry['formats'] = formats
            entries.append(entry)
        if len(entries) == 1:
            return entries[0]
        else:
            return self.playlist_result(entries)
3304
    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Turn a list of JWPlayer source dicts into a list of formats.

        @param jwplayer_sources_data  Iterable of source dicts; entries that
                                      are not dicts, have no usable 'file'
                                      URL or repeat an earlier URL are skipped
        @param video_id               Video id passed to the manifest extractors
        @param m3u8_id, mpd_id        Format id prefixes for HLS/DASH formats
        @param rtmp_params            Optional dict merged into every RTMP format
        @param base_url               Base for resolving relative 'file' URLs
        @returns                      List of format dicts
        """
        # URLs already handled, used to drop duplicate sources
        urls = []
        formats = []
        for source in jwplayer_sources_data:
            if not isinstance(source, dict):
                continue
            source_url = urljoin(
                base_url, self._proto_relative_url(source.get('file')))
            if not source_url or source_url in urls:
                continue
            urls.append(source_url)
            source_type = source.get('type') or ''
            # The declared type wins over the extension guessed from the URL
            ext = mimetype2ext(source_type) or determine_ext(source_url)
            if source_type == 'hls' or ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id=m3u8_id, fatal=False))
            elif source_type == 'dash' or ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    source_url, video_id, mpd_id=mpd_id, fatal=False))
            elif ext == 'smil':
                formats.extend(self._extract_smil_formats(
                    source_url, video_id, fatal=False))
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
            elif source_type.startswith('audio') or ext in (
                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                # Audio-only source
                formats.append({
                    'url': source_url,
                    'vcodec': 'none',
                    'ext': ext,
                })
            else:
                height = int_or_none(source.get('height'))
                if height is None:
                    # Often no height is provided but there is a label in
                    # format like "1080p", "720p SD", or 1080.
                    height = int_or_none(self._search_regex(
                        r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
                        'height', default=None))
                a_format = {
                    'url': source_url,
                    'width': int_or_none(source.get('width')),
                    'height': height,
                    'tbr': int_or_none(source.get('bitrate')),
                    'ext': ext,
                }
                if source_url.startswith('rtmp'):
                    a_format['ext'] = 'flv'
                    # Split the URL at the first "mp4:"/"mp3:"/"flv:" prefix
                    # into the connection URL and the play path
                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                    # of jwplayer.flash.swf
                    rtmp_url_parts = re.split(
                        r'((?:mp4|mp3|flv):)', source_url, 1)
                    if len(rtmp_url_parts) == 3:
                        rtmp_url, prefix, play_path = rtmp_url_parts
                        a_format.update({
                            'url': rtmp_url,
                            'play_path': prefix + play_path,
                        })
                    if rtmp_params:
                        a_format.update(rtmp_params)
                formats.append(a_format)
        return formats
3368
3369     def _live_title(self, name):
3370         """ Generate the title for a live video """
3371         now = datetime.datetime.now()
3372         now_str = now.strftime('%Y-%m-%d %H:%M')
3373         return name + ' ' + now_str
3374
3375     def _int(self, v, name, fatal=False, **kwargs):
3376         res = int_or_none(v, **kwargs)
3377         if 'get_attr' in kwargs:
3378             print(getattr(v, kwargs['get_attr']))
3379         if res is None:
3380             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3381             if fatal:
3382                 raise ExtractorError(msg)
3383             else:
3384                 self.report_warning(msg)
3385         return res
3386
3387     def _float(self, v, name, fatal=False, **kwargs):
3388         res = float_or_none(v, **kwargs)
3389         if res is None:
3390             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3391             if fatal:
3392                 raise ExtractorError(msg)
3393             else:
3394                 self.report_warning(msg)
3395         return res
3396
3397     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3398                     path='/', secure=False, discard=False, rest={}, **kwargs):
3399         cookie = compat_cookiejar_Cookie(
3400             0, name, value, port, port is not None, domain, True,
3401             domain.startswith('.'), path, True, secure, expire_time,
3402             discard, None, None, rest)
3403         self._downloader.cookiejar.set_cookie(cookie)
3404
3405     def _get_cookies(self, url):
3406         """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3407         req = sanitized_Request(url)
3408         self._downloader.cookiejar.add_cookie_header(req)
3409         return compat_cookies_SimpleCookie(req.get_header('Cookie'))
3410
3411     def _apply_first_set_cookie_header(self, url_handle, cookie):
3412         """
3413         Apply first Set-Cookie header instead of the last. Experimental.
3414
3415         Some sites (e.g. [1-3]) may serve two cookies under the same name
3416         in Set-Cookie header and expect the first (old) one to be set rather
3417         than second (new). However, as of RFC6265 the newer one cookie
3418         should be set into cookie store what actually happens.
3419         We will workaround this issue by resetting the cookie to
3420         the first one manually.
3421         1. https://new.vk.com/
3422         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3423         3. https://learning.oreilly.com/
3424         """
3425         for header, cookies in url_handle.headers.items():
3426             if header.lower() != 'set-cookie':
3427                 continue
3428             if sys.version_info[0] >= 3:
3429                 cookies = cookies.encode('iso-8859-1')
3430             cookies = cookies.decode('utf-8')
3431             cookie_value = re.search(
3432                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3433             if cookie_value:
3434                 value, domain = cookie_value.groups()
3435                 self._set_cookie(domain, cookie, value)
3436                 break
3437
3438     def get_testcases(self, include_onlymatching=False):
3439         t = getattr(self, '_TEST', None)
3440         if t:
3441             assert not hasattr(self, '_TESTS'), \
3442                 '%s has _TEST and _TESTS' % type(self).__name__
3443             tests = [t]
3444         else:
3445             tests = getattr(self, '_TESTS', [])
3446         for t in tests:
3447             if not include_onlymatching and t.get('only_matching', False):
3448                 continue
3449             t['name'] = type(self).__name__[:-len('IE')]
3450             yield t
3451
3452     def is_suitable(self, age_limit):
3453         """ Test whether the extractor is generally suitable for the given
3454         age limit (i.e. pornographic sites are not, all others usually are) """
3455
3456         any_restricted = False
3457         for tc in self.get_testcases(include_onlymatching=False):
3458             if tc.get('playlist', []):
3459                 tc = tc['playlist'][0]
3460             is_restricted = age_restricted(
3461                 tc.get('info_dict', {}).get('age_limit'), age_limit)
3462             if not is_restricted:
3463                 return True
3464             any_restricted = any_restricted or is_restricted
3465         return not any_restricted
3466
3467     def extract_subtitles(self, *args, **kwargs):
3468         if (self.get_param('writesubtitles', False)
3469                 or self.get_param('listsubtitles')):
3470             return self._get_subtitles(*args, **kwargs)
3471         return {}
3472
3473     def _get_subtitles(self, *args, **kwargs):
3474         raise NotImplementedError('This method must be implemented by subclasses')
3475
3476     @staticmethod
3477     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3478         """ Merge subtitle items for one language. Items with duplicated URLs
3479         will be dropped. """
3480         list1_urls = set([item['url'] for item in subtitle_list1])
3481         ret = list(subtitle_list1)
3482         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
3483         return ret
3484
3485     @classmethod
3486     def _merge_subtitles(cls, *dicts, target=None):
3487         """ Merge subtitle dictionaries, language by language. """
3488         if target is None:
3489             target = {}
3490         for d in dicts:
3491             for lang, subs in d.items():
3492                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3493         return target
3494
3495     def extract_automatic_captions(self, *args, **kwargs):
3496         if (self.get_param('writeautomaticsub', False)
3497                 or self.get_param('listsubtitles')):
3498             return self._get_automatic_captions(*args, **kwargs)
3499         return {}
3500
3501     def _get_automatic_captions(self, *args, **kwargs):
3502         raise NotImplementedError('This method must be implemented by subclasses')
3503
3504     def mark_watched(self, *args, **kwargs):
3505         if (self.get_param('mark_watched', False)
3506                 and (self._get_login_info()[0] is not None
3507                      or self.get_param('cookiefile') is not None)):
3508             self._mark_watched(*args, **kwargs)
3509
3510     def _mark_watched(self, *args, **kwargs):
3511         raise NotImplementedError('This method must be implemented by subclasses')
3512
3513     def geo_verification_headers(self):
3514         headers = {}
3515         geo_verification_proxy = self.get_param('geo_verification_proxy')
3516         if geo_verification_proxy:
3517             headers['Ytdl-request-proxy'] = geo_verification_proxy
3518         return headers
3519
3520     def _generic_id(self, url):
3521         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3522
3523     def _generic_title(self, url):
3524         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3525
3526     @staticmethod
3527     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3528         all_known = all(map(
3529             lambda x: x is not None,
3530             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3531         return (
3532             'private' if is_private
3533             else 'premium_only' if needs_premium
3534             else 'subscriber_only' if needs_subscription
3535             else 'needs_auth' if needs_auth
3536             else 'unlisted' if is_unlisted
3537             else 'public' if all_known
3538             else None)
3539
3540     def _configuration_arg(self, key, default=NO_DEFAULT, casesense=False):
3541         '''
3542         @returns            A list of values for the extractor argument given by "key"
3543                             or "default" if no such key is present
3544         @param default      The default value to return when the key is not present (default: [])
3545         @param casesense    When false, the values are converted to lower case
3546         '''
3547         val = traverse_obj(
3548             self._downloader.params, ('extractor_args', self.ie_key().lower(), key))
3549         if val is None:
3550             return [] if default is NO_DEFAULT else default
3551         return list(val) if casesense else [x.lower() for x in val]
3552
3553
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY<prefix>:{query} where the
    prefix is empty (one result), a positive number or "all".
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex matching this extractor's search queries"""
        pattern = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return pattern % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query for this extractor"""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the search query and fetch the requested number of results.

        Raises ExtractorError for a malformed query; requests above
        _MAX_RESULTS are capped with a warning.
        """
        match = re.match(self._make_valid_url(), query)
        if match is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, search_terms = match.group('prefix', 'query')
        if prefix == '':
            wanted = 1
        elif prefix == 'all':
            wanted = self._MAX_RESULTS
        else:
            wanted = int(prefix)
            if wanted <= 0:
                raise ExtractorError('invalid download number %s for query "%s"' % (wanted, search_terms))
            if wanted > self._MAX_RESULTS:
                self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, wanted))
                wanted = self._MAX_RESULTS
        return self._get_n_results(search_terms, wanted)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        """The search key (_SEARCH_KEY) of this extractor"""
        return self._SEARCH_KEY