# coding: utf-8
from __future__ import unicode_literals

import base64
import datetime
import hashlib
import json
import netrc
import os
import random
import re
import sys
import time
import math

from ..compat import (
    compat_cookiejar_Cookie,
    compat_cookies_SimpleCookie,
    compat_etree_Element,
    compat_etree_fromstring,
    compat_getpass,
    compat_http_client,
    compat_os_name,
    compat_str,
    compat_urllib_error,
    compat_urllib_parse_unquote,
    compat_urllib_parse_urlencode,
    compat_urllib_request,
    compat_urlparse,
    compat_xml_parse_error,
)
from ..downloader import FileDownloader
from ..downloader.f4m import (
    get_base_url,
    remove_encrypted_media,
)
from ..utils import (
    age_restricted,
    base_url,
    bug_reports_message,
    clean_html,
    compiled_regex_type,
    determine_ext,
    determine_protocol,
    dict_get,
    error_to_compat_str,
    extract_attributes,
    ExtractorError,
    fix_xml_ampersands,
    float_or_none,
    format_field,
    GeoRestrictedError,
    GeoUtils,
    int_or_none,
    js_to_json,
    JSON_LD_RE,
    mimetype2ext,
    network_exceptions,
    NO_DEFAULT,
    orderedSet,
    parse_bitrate,
    parse_codecs,
    parse_duration,
    parse_iso8601,
    parse_m3u8_attributes,
    parse_resolution,
    RegexNotFoundError,
    sanitize_filename,
    sanitized_Request,
    str_or_none,
    str_to_int,
    strip_or_none,
    traverse_obj,
    unescapeHTML,
    unified_strdate,
    unified_timestamp,
    update_Request,
    update_url_query,
    url_basename,
    url_or_none,
    urljoin,
    variadic,
    xpath_element,
    xpath_text,
    xpath_with_ns,
)


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the YoutubeDL. The YoutubeDL processes this
    information, possibly downloading the video to the file system, among
    other possible outcomes.

    The type field determines the type of the result.
    By far the most common value (and the default if _type is missing) is
    "video", which indicates a single video.

    For a video, the dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.
                    Potential fields:
                    * url        The mandatory URL representing the media:
                                   for plain file media - HTTP URL of this file,
                                   for RTMP - RTMP URL,
                                   for HLS - URL of the M3U8 media playlist,
                                   for HDS - URL of the F4M manifest,
                                   for DASH
                                     - HTTP URL to plain file media (in case of
                                       unfragmented media)
                                     - URL of the MPD manifest or base URL
                                       representing the media if MPD manifest
                                       is parsed from a string (in case of
                                       fragmented media)
                                   for MSS - URL of the ISM manifest.
                    * manifest_url
                                 The URL of the manifest file in case of
                                 fragmented media:
                                   for HLS - URL of the M3U8 master playlist,
                                   for HDS - URL of the F4M manifest,
                                   for DASH - URL of the MPD manifest,
                                   for MSS - URL of the ISM manifest.
                    * ext        Will be calculated from URL if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * fps        Frame rate
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp", "rtmp_ffmpeg", "rtmpe",
                                 "m3u8", "m3u8_native" or "http_dash_segments".
                    * fragment_base_url
                                 Base URL for fragments. Each fragment's path
                                 value (if present) will be relative to
                                 this URL.
                    * fragments  A list of fragments of a fragmented media.
                                 Each fragment entry must contain either a url
                                 or a path. If a url is present, it should be
                                 used by a client. Otherwise both path and
                                 fragment_base_url must be present. Here is
                                 the list of all potential fields:
                                 * "url" - fragment's URL
                                 * "path" - fragment's path relative to
                                            fragment_base_url
                                 * "duration" (optional, int or float)
                                 * "filesize" (optional, int)
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                                 < -1000 to hide the format (if there is
                                    another one which is strictly better)
                    * language   Language code, e.g. "de" or "en-US".
                    * language_preference  Is this in the language mentioned in
                                 the URL?
                                 10 if it's what the URL is about,
                                 -1 for default (don't know),
                                 -10 otherwise, other values reserved for now.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * source_preference  Order number for this video source
                                 (quality takes higher priority)
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * http_headers  A dictionary of additional HTTP headers
                                 to add to the request.
                    * stretched_ratio  If given and not 1, indicates that the
                                 video's pixels are not square.
                                 width : height ratio as float.
                    * no_resume  The server does not support resuming the
                                 (HTTP or RTMP) download. Boolean.
                    * has_drm    The format has DRM and cannot be downloaded. Boolean
                    * downloader_options  A dictionary of downloader options as
                                 described in FileDownloader
                    RTMP formats can also have the additional fields: page_url,
                    app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
                    rtmp_protocol, rtmp_real_time
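
                    For illustration only (all values hypothetical), a single
                    entry in formats might look like:

                        {
                            'format_id': 'hls-1080p',
                            'url': 'https://example.com/video/master.m3u8',
                            'ext': 'mp4',
                            'protocol': 'm3u8_native',
                            'width': 1920,
                            'height': 1080,
                            'vcodec': 'avc1.640028',
                            'acodec': 'mp4a.40.2',
                            'tbr': 4500,
                        }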

    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    alt_title:      A secondary title of the video.
    display_id:     An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "id" (optional, string) - Thumbnail format ID
                        * "url"
                        * "preference" (optional, int) - quality of the image
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
                        * "filesize" (optional, int)
                        * "_test_url" (optional, bool) - If true, test the URL
    thumbnail:      Full URL to a video thumbnail image.
    description:    Full video description.
    uploader:       Full name of the video uploader.
    license:        License name the video is licensed under.
    creator:        The creator of the video.
    release_timestamp: UNIX timestamp of the moment the video was released.
    release_date:   The date (YYYYMMDD) when the video was released.
    timestamp:      UNIX timestamp of the moment the video was uploaded
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp.
    uploader_id:    Nickname or id of the video uploader.
    uploader_url:   Full URL to a personal webpage of the video uploader.
    channel:        Full name of the channel the video is uploaded on.
                    Note that channel fields may or may not repeat uploader
                    fields. This depends on a particular extractor.
    channel_id:     Id of the channel.
    channel_url:    Full URL to a channel webpage.
    location:       Physical location where the video was filmed.
    subtitles:      The available subtitles as a dictionary in the format
                    {tag: subformats}. "tag" is usually a language code, and
                    "subformats" is a list sorted from lower to higher
                    preference, each element is a dictionary with the "ext"
                    entry and one of:
                        * "data": The subtitles file contents
                        * "url": A URL pointing to the subtitles file
                    It can optionally also have:
                        * "name": Name or description of the subtitles
                    "ext" will be calculated from URL if missing
    automatic_captions: Like 'subtitles'; contains automatically generated
                    captions instead of normal subtitles
    duration:       Length of the video in seconds, as an integer or float.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
    comment_count:  Number of comments on the video
    comments:       A list of comments, each with one or more of the following
                    properties (all but one of text or html optional):
                        * "author" - human-readable name of the comment author
                        * "author_id" - user ID of the comment author
                        * "author_thumbnail" - The thumbnail of the comment author
                        * "id" - Comment ID
                        * "html" - Comment as HTML
                        * "text" - Plain text of the comment
                        * "timestamp" - UNIX timestamp of comment
                        * "parent" - ID of the comment this one is replying to.
                                     Set to "root" to indicate that this is a
                                     comment to the original video.
                        * "like_count" - Number of positive ratings of the comment
                        * "dislike_count" - Number of negative ratings of the comment
                        * "is_favorited" - Whether the comment is marked as
                                           favorite by the video uploader
                        * "author_is_uploader" - Whether the comment is made by
                                                 the video uploader
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage, if given to yt-dlp it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]
    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
    cast:           A list of the video cast
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.
    was_live:       True, False, or None (=unknown). Whether this video was
                    originally a live stream.
    live_status:    'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
                    If absent, automatically set from is_live, was_live
    start_time:     Time in seconds where the reproduction should start, as
                    specified in the URL.
    end_time:       Time in seconds where the reproduction should end, as
                    specified in the URL.
    chapters:       A list of dictionaries, with the following entries:
                        * "start_time" - The start time of the chapter in seconds
                        * "end_time" - The end time of the chapter in seconds
                        * "title" (optional, string)
    playable_in_embed: Whether this video is allowed to play in embedded
                    players on other sites. Can be True (=always allowed),
                    False (=never allowed), None (=unknown), or a string
                    specifying the criteria for embedability (Eg: 'whitelist')
    availability:   Under what condition the video is available. One of
                    'private', 'premium_only', 'subscriber_only', 'needs_auth',
                    'unlisted' or 'public'. Use 'InfoExtractor._availability'
                    to set it
    __post_extractor: A function to be called just before the metadata is
                    written to either disk, logger or console. The function
                    must return a dict which will be added to the info_dict.
                    This is useful for additional information that is
                    time-consuming to extract. Note that the fields thus
                    extracted will not be available to output template and
                    match_filter. So, only "comments" and "comment_count" are
                    currently allowed to be extracted via this method.

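    For illustration only (all values hypothetical), a minimal dict returned
    by _real_extract for a single video might look like:

        {
            'id': '4234987',
            'title': 'Dancing naked mole rats',
            'url': 'https://example.com/video.mp4',
            'ext': 'mp4',
        }
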
    The following fields should only be used when the video belongs to some logical
    chapter or section:

    chapter:        Name or title of the chapter the video belongs to.
    chapter_number: Number of the chapter the video belongs to, as an integer.
    chapter_id:     Id of the chapter the video belongs to, as a unicode string.

    The following fields should only be used when the video is an episode of some
    series, programme or podcast:

    series:         Title of the series or programme the video episode belongs to.
    season:         Title of the season the video episode belongs to.
    season_number:  Number of the season the video episode belongs to, as an integer.
    season_id:      Id of the season the video episode belongs to, as a unicode string.
    episode:        Title of the video episode. Unlike mandatory video title field,
                    this field should denote the exact title of the video episode
                    without any kind of decoration.
    episode_number: Number of the video episode within a season, as an integer.
    episode_id:     Id of the video episode, as a unicode string.

    The following fields should only be used when the media is a track or a part of
    a music album:

    track:          Title of the track.
    track_number:   Number of the track within an album or a disc, as an integer.
    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
                    as a unicode string.
    artist:         Artist(s) of the track.
    genre:          Genre(s) of the track.
    album:          Title of the album the track belongs to.
    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
    album_artist:   List of all artists appeared on the album (e.g.
                    "Ash Borer / Fell Voices" or "Various Artists", useful for splits
                    and compilations).
    disc_number:    Number of the disc or other physical medium the track belongs to,
                    as an integer.
    release_year:   Year (YYYY) when the album was released.

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.


    _type "playlist" indicates multiple videos.
    There must be a key "entries", which is a list, an iterable, or a PagedList
    object, each element of which is a valid dictionary by this specification.

    Additionally, playlists can have "id", "title", and any other relevant
    attributes with the same semantics as videos (see above).


    _type "multi_video" indicates that there are multiple videos that
    form a single show, for example, multiple acts of an opera or TV episode.
    It must have an entries key like a playlist and contain all the keys
    required for a video at the same time.


    _type "url" indicates that the video must be extracted from another
    location, possibly by a different extractor. Its only required key is:
    "url" - the next URL to extract.
    The key "ie_key" can be set to the class name (minus the trailing "IE",
    e.g. "Youtube") if the extractor class is known in advance.
    Additionally, the dictionary may have any properties of the resolved entity
    known in advance, for example "title" if the title of the referred video is
    known ahead of time.


    _type "url_transparent" entities have the same specification as "url", but
    indicate that the given additional information is more precise than the one
    associated with the resolved URL.
    This is useful when a site employs a video service that hosts the video and
    its technical metadata, but that video service does not embed a useful
    title, description etc.
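
    For illustration only (all values hypothetical), such an entry might
    look like:

        {
            '_type': 'url_transparent',
            'url': 'https://videohost.example.com/embed/4234987',
            'ie_key': 'VideoHost',
            'title': 'Dancing naked mole rats',
        }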


    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _GEO_BYPASS attribute may be set to False in order to disable
    geo restriction bypass mechanisms for a particular extractor.
    Though it won't disable explicit geo restriction bypass based on
    country code provided with geo_bypass_country.

    _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
    countries for this extractor. One of these countries will be used by
    geo restriction bypass mechanism right away in order to bypass
    geo restriction, of course, if the mechanism is not disabled.

    _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
    IP blocks in CIDR notation for this extractor. One of these IP blocks
    will be used by geo restriction bypass mechanism similarly
    to _GEO_COUNTRIES.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _x_forwarded_for_ip = None
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    _WORKING = True

    _LOGIN_HINTS = {
        'any': 'Use --cookies, --username and --password or --netrc to provide account credentials',
        'cookies': (
            'Use --cookies for the authentication. '
            'See https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl for how to pass cookies'),
        'password': 'Use --username and --password or --netrc to provide account credentials',
    }

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self._x_forwarded_for_ip = None
        self._printed_messages = set()
        self.set_downloader(downloader)

    @classmethod
    def _match_valid_url(cls, url):
        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # This function must import everything it needs (except other extractors),
        # so that lazy_extractors works correctly
        return cls._match_valid_url(url) is not None

    @classmethod
    def _match_id(cls, url):
        return cls._match_valid_url(url).group('id')

    @classmethod
    def get_temp_id(cls, url):
        try:
            return cls._match_id(url)
        except (IndexError, AttributeError):
            return None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._printed_messages = set()
        self._initialize_geo_bypass({
            'countries': self._GEO_COUNTRIES,
            'ip_blocks': self._GEO_IP_BLOCKS,
        })
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def _initialize_geo_bypass(self, geo_bypass_context):
        """
        Initialize geo restriction bypass mechanism.

        This method is used to initialize geo bypass mechanism based on faking
        X-Forwarded-For HTTP header. A random country from provided country list
        is selected and a random IP belonging to this country is generated. This
        IP will be passed as X-Forwarded-For HTTP header in all subsequent
        HTTP requests.

        This method will be used for initial geo bypass mechanism initialization
        during the instance initialization with _GEO_COUNTRIES and
        _GEO_IP_BLOCKS.

        You may also manually call it from extractor's code if geo bypass
        information is not available beforehand (e.g. obtained during
        extraction) or due to some other reason. In this case you should pass
        this information in geo bypass context passed as first argument. It may
        contain following fields:

        countries:  List of geo unrestricted countries (similar
                    to _GEO_COUNTRIES)
        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                    (similar to _GEO_IP_BLOCKS)
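
        For example (country code is illustrative only), an extractor that
        learns mid-extraction that a site is only available in Germany could
        call:

            self._initialize_geo_bypass({
                'countries': ['DE'],
            })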

        """
        if not self._x_forwarded_for_ip:

            # Geo bypass mechanism is explicitly disabled by user
            if not self.get_param('geo_bypass', True):
                return

            if not geo_bypass_context:
                geo_bypass_context = {}

            # Backward compatibility: previously _initialize_geo_bypass
            # expected a list of countries, some 3rd party code may still use
            # it this way
            if isinstance(geo_bypass_context, (list, tuple)):
                geo_bypass_context = {
                    'countries': geo_bypass_context,
                }

            # The whole point of geo bypass mechanism is to fake IP
            # as X-Forwarded-For HTTP header based on some IP block or
            # country code.

            # Path 1: bypassing based on IP block in CIDR notation

            # Explicit IP block specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            ip_block = self.get_param('geo_bypass_ip_block', None)

            # Otherwise use random IP block from geo bypass context but only
            # if extractor is known as geo bypassable
            if not ip_block:
                ip_blocks = geo_bypass_context.get('ip_blocks')
                if self._GEO_BYPASS and ip_blocks:
                    ip_block = random.choice(ip_blocks)

            if ip_block:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
                self._downloader.write_debug(
                    'Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
                return

            # Path 2: bypassing based on country code

            # Explicit country code specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            country = self.get_param('geo_bypass_country', None)

            # Otherwise use random country code from geo bypass context but
            # only if extractor is known as geo bypassable
            if not country:
                countries = geo_bypass_context.get('countries')
                if self._GEO_BYPASS and countries:
                    country = random.choice(countries)

            if country:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
                self._downloader.write_debug(
                    'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        try:
            for _ in range(2):
                try:
                    self.initialize()
                    self.write_debug('Extracting URL: %s' % url)
                    ie_result = self._real_extract(url)
                    if ie_result is None:
                        return None
                    if self._x_forwarded_for_ip:
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    subtitles = ie_result.get('subtitles')
                    if (subtitles and 'live_chat' in subtitles
                            and 'no-live-chat' in self.get_param('compat_opts', [])):
                        del subtitles['live_chat']
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except ExtractorError as e:
            video_id = e.video_id or self.get_temp_id(url)
            raise ExtractorError(
                e.msg, video_id=video_id, ie=self.IE_NAME, tb=e.traceback, expected=e.expected, cause=e.cause)
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))

    def __maybe_fake_ip_and_retry(self, countries):
        if (not self.get_param('geo_bypass_country', None)
                and self._GEO_BYPASS
                and self.get_param('geo_bypass', True)
                and not self._x_forwarded_for_ip
                and countries):
            country_code = random.choice(countries)
            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
            if self._x_forwarded_for_ip:
                self.report_warning(
                    'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
                    % (self._x_forwarded_for_ip, country_code.upper()))
                return True
        return False

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        return compat_str(type(self).__name__[:-2])

    @staticmethod
    def __can_accept_status_code(err, expected_status):
        assert isinstance(err, compat_urllib_error.HTTPError)
        if expected_status is None:
            return False
        elif callable(expected_status):
            return expected_status(err.code) is True
        else:
            return err.code in variadic(expected_status)

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
        """
        Return the response handle.

        See _download_webpage docstring for arguments specification.
        """
        if not self._downloader._first_webpage_request:
            sleep_interval = float_or_none(self.get_param('sleep_interval_requests')) or 0
            if sleep_interval > 0:
                self.to_screen('Sleeping %s seconds ...' % sleep_interval)
                time.sleep(sleep_interval)
        else:
            self._downloader._first_webpage_request = False

        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen('%s' % (note,))
            else:
                self.to_screen('%s: %s' % (video_id, note))

        # Some sites check X-Forwarded-For HTTP header in order to figure out
        # the origin of the client behind proxy. This allows bypassing geo
        # restriction by faking this header's value to IP that belongs to some
        # geo unrestricted country. We will do so once we encounter any
        # geo restriction error.
        if self._x_forwarded_for_ip:
            if 'X-Forwarded-For' not in headers:
                headers['X-Forwarded-For'] = self._x_forwarded_for_ip

        if isinstance(url_or_request, compat_urllib_request.Request):
            url_or_request = update_Request(
                url_or_request, data=data, headers=headers, query=query)
        else:
            if query:
                url_or_request = update_url_query(url_or_request, query)
            if data is not None or headers:
                url_or_request = sanitized_Request(url_or_request, data, headers)
        try:
            return self._downloader.urlopen(url_or_request)
        except network_exceptions as err:
            if isinstance(err, compat_urllib_error.HTTPError):
                if self.__can_accept_status_code(err, expected_status):
                    # Retain reference to error to prevent file object from
                    # being closed before it can be read. Works around the
                    # effects of <https://bugs.python.org/issue15002>
                    # introduced in Python 3.4.1.
                    err.fp._error = err
                    return err.fp

            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
        """
        Return a tuple (page content as string, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
        if urlh is False:
            assert not fatal
            return False
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
        return (content, urlh)

    @staticmethod
    def _guess_encoding_from_content(content_type, webpage_bytes):
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'

        return encoding

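    # For illustration (header value hypothetical): a Content-Type of
    # 'text/html; charset=iso-8859-1' yields 'iso-8859-1'; failing that, a
    # <meta charset> tag in the first KiB of the body is used; a UTF-16 BOM
    # yields 'utf-16'; otherwise 'utf-8' is assumed.
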
    def __check_blocked(self, content):
        first_block = content[:512]
        if ('<title>Access to this site is blocked</title>' in content
                and 'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
                and 'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)

    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self.get_param('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self.get_param('write_pages', False):
            basen = '%s_%s' % (video_id, urlh.geturl())
            if len(basen) > 240:
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content

    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=5, encoding=None, data=None,
            headers={}, query={}, expected_status=None):
        """
        Return the data of the page as a string.

        Arguments:
        url_or_request -- plain text URL as a string or
            a compat_urllib_request.Request object
        video_id -- Video/playlist/item identifier (string)

        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractorError to be raised,
            otherwise a warning will be reported and extraction continued
        tries -- number of tries
        timeout -- sleep interval between tries
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows accepting failed HTTP requests (non 2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                  accept
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx)
            which are always accepted.
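
        For illustration (URL and codes hypothetical):

            # accept a single failed status code
            self._download_webpage(url, video_id, expected_status=404)
            # accept any of several failed status codes
            self._download_webpage(url, video_id, expected_status=(403, 404))
            # accept any non-5xx failure
            self._download_webpage(url, video_id, expected_status=lambda c: c < 500)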
851 """
852
853 success = False
854 try_count = 0
855 while success is False:
856 try:
857 res = self._download_webpage_handle(
858 url_or_request, video_id, note, errnote, fatal,
859 encoding=encoding, data=data, headers=headers, query=query,
860 expected_status=expected_status)
861 success = True
862 except compat_http_client.IncompleteRead as e:
863 try_count += 1
864 if try_count >= tries:
865 raise e
866 self._sleep(timeout, video_id)
867 if res is False:
868 return res
869 else:
870 content, _ = res
871 return content
872
    def _download_xml_handle(
            self, url_or_request, video_id, note='Downloading XML',
            errnote='Unable to download XML', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (xml as a compat_etree_Element, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        xml_string, urlh = res
        return self._parse_xml(
            xml_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_xml(
            self, url_or_request, video_id,
            note='Downloading XML', errnote='Unable to download XML',
            transform_source=None, fatal=True, encoding=None,
            data=None, headers={}, query={}, expected_status=None):
        """
        Return the xml as a compat_etree_Element.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_xml_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            xml_string = transform_source(xml_string)
        try:
            return compat_etree_fromstring(xml_string.encode('utf-8'))
        except compat_xml_parse_error as ve:
            errmsg = '%s: Failed to parse XML ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

    def _download_json_handle(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (JSON object, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        json_string, urlh = res
        return self._parse_json(
            json_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_json(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return the JSON object as a dict.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_json_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            json_string = transform_source(json_string)
        try:
            return json.loads(json_string)
        except ValueError as ve:
            errmsg = '%s: Failed to parse JSON ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

    def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
        return self._parse_json(
            data[data.find('{'):data.rfind('}') + 1],
            video_id, transform_source, fatal)

    def _download_socket_json_handle(
            self, url_or_request, video_id, note='Polling socket',
            errnote='Unable to poll socket', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (JSON object, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        webpage, urlh = res
        return self._parse_socket_response_as_json(
            webpage, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_socket_json(
            self, url_or_request, video_id, note='Polling socket',
            errnote='Unable to poll socket', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return the JSON object as a dict.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_socket_json_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
        idstr = format_field(video_id, template='%s: ')
        msg = f'[{self.IE_NAME}] {idstr}{msg}'
        if only_once:
            if f'WARNING: {msg}' in self._printed_messages:
                return
            self._printed_messages.add(f'WARNING: {msg}')
        self._downloader.report_warning(msg, *args, **kwargs)

    def to_screen(self, msg, *args, **kwargs):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)

    def write_debug(self, msg, *args, **kwargs):
        self._downloader.write_debug('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)

    def get_param(self, name, default=None, *args, **kwargs):
        if self._downloader:
            return self._downloader.params.get(name, default, *args, **kwargs)
        return default

    def report_drm(self, video_id, partial=False):
        self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen('%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen('%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')

    def raise_login_required(
            self, msg='This video is only available for registered users',
            metadata_available=False, method='any'):
        if metadata_available and self.get_param('ignore_no_formats_error'):
            self.report_warning(msg)
        if method is not None:
            msg = '%s. %s' % (msg, self._LOGIN_HINTS[method])
        raise ExtractorError(msg, expected=True)

    def raise_geo_restricted(
            self, msg='This video is not available from your location due to geo restriction',
            countries=None, metadata_available=False):
        if metadata_available and self.get_param('ignore_no_formats_error'):
            self.report_warning(msg)
        else:
            raise GeoRestrictedError(msg, countries=countries)

    def raise_no_formats(self, msg, expected=False, video_id=None):
        if expected and self.get_param('ignore_no_formats_error'):
            self.report_warning(msg, video_id)
        elif isinstance(msg, ExtractorError):
            raise msg
        else:
            raise ExtractorError(msg, expected=expected, video_id=video_id)

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None, video_title=None):
        """Returns a URL that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        if video_title is not None:
            video_info['title'] = video_title
        return video_info

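    # For illustration (URL and extractor name hypothetical):
    #     self.url_result('https://example.com/watch/4234987', ie='Example')
    # yields {'_type': 'url', 'url': ..., 'ie_key': 'Example'}, which tells
    # YoutubeDL to hand the URL to the extractor whose ie_key() is 'Example'.
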
    def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
        urls = orderedSet(
            self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
            for m in matches)
        return self.playlist_result(
            urls, playlist_id=playlist_id, playlist_title=playlist_title)

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        video_info.update(kwargs)
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        if playlist_description is not None:
            video_info['description'] = playlist_description
        return video_info

    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single pattern or a
        list of patterns, returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
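
        For example (pattern and input hypothetical):
            self._search_regex(r'id=(\d+)', webpage, 'video id')
        returns '123' if webpage contains 'id=123'.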
1122 """
1123 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
1124 mobj = re.search(pattern, string, flags)
1125 else:
1126 for p in pattern:
1127 mobj = re.search(p, string, flags)
1128 if mobj:
1129 break
1130
1131 if not self.get_param('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
1132 _name = '\033[0;34m%s\033[0m' % name
1133 else:
1134 _name = name
1135
1136 if mobj:
1137 if group is None:
1138 # return the first matching group
1139 return next(g for g in mobj.groups() if g is not None)
1140 elif isinstance(group, (list, tuple)):
1141 return tuple(mobj.group(g) for g in group)
1142 else:
1143 return mobj.group(group)
1144 elif default is not NO_DEFAULT:
1145 return default
1146 elif fatal:
1147 raise RegexNotFoundError('Unable to extract %s' % _name)
1148 else:
1149 self.report_warning('unable to extract %s' % _name + bug_reports_message())
1150 return None
1151
1152 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1153 """
1154 Like _search_regex, but strips HTML tags and unescapes entities.
1155 """
1156 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1157 if res:
1158 return clean_html(res).strip()
1159 else:
1160 return res
1161
    def _get_netrc_login_info(self, netrc_machine=None):
        username = None
        password = None
        netrc_machine = netrc_machine or self._NETRC_MACHINE

        if self.get_param('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(netrc_machine)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError(
                        'No authenticators for %s' % netrc_machine)
            except (IOError, netrc.NetrcParseError) as err:
                self.report_warning(
                    'parsing .netrc: %s' % error_to_compat_str(err))

        return username, password

    def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
        """
        Get the login info as (username, password)
        First look for the manually specified credentials using username_option
        and password_option as keys in params dictionary. If no such credentials
        are available, look in the netrc file using the netrc_machine or
        _NETRC_MACHINE value.
        If there's no info available, return (None, None)
        """

        # Attempt to use provided username and password or .netrc data
        username = self.get_param(username_option)
        if username is not None:
            password = self.get_param(password_option)
        else:
            username, password = self._get_netrc_login_info(netrc_machine)

        return username, password

    def _get_tfa_info(self, note='two-factor verification code'):
        """
        Get the two-factor authentication info
        TODO - asking the user will be required for sms/phone verify
        currently just uses the command line option
        If there's no info available, return None
        """

        tfa = self.get_param('twofactor')
        if tfa is not None:
            return tfa

        return compat_getpass('Type %s and press [Return]: ' % note)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
        property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
                       % {'prop': re.escape(prop)})
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

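    # For illustration: _og_regexes('title') produces two patterns so that
    # both attribute orders are matched, e.g. (markup hypothetical)
    #     <meta property="og:title" content="Some title">
    #     <meta content="Some title" property="og:title">
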
    @staticmethod
    def _meta_regex(prop):
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        prop = variadic(prop)
        if name is None:
            name = 'OpenGraph %s' % prop[0]
        og_regexes = []
        for p in prop:
            og_regexes.extend(self._og_regexes(p))
        escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _og_search_url(self, html, **kargs):
        return self._og_search_property('url', html, **kargs)

    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        name = variadic(name)
        if display_name is None:
            display_name = name[0]
        return self._html_search_regex(
            [self._meta_regex(n) for n in name],
            html, display_name, fatal=fatal, group='content', **kwargs)

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower())

    def _family_friendly_search(self, html):
        # See http://schema.org/VideoObject
        family_friendly = self._html_search_meta(
            'isFamilyFriendly', html, default=None)

        if not family_friendly:
            return None

        RATING_TABLE = {
            '1': 0,
            'true': 0,
            '0': 18,
            'false': 18,
        }
        return RATING_TABLE.get(family_friendly.lower())

    def _twitter_search_player(self, html):
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')

    def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
        json_ld_list = list(re.finditer(JSON_LD_RE, html))
        default = kwargs.get('default', NO_DEFAULT)
        # JSON-LD may be malformed and thus `fatal` should be respected.
        # At the same time `default` may be passed that assumes `fatal=False`
        # for _search_regex. Let's simulate the same behavior here as well.
        fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
        json_ld = []
        for mobj in json_ld_list:
            json_ld_item = self._parse_json(
                mobj.group('json_ld'), video_id, fatal=fatal)
            if not json_ld_item:
                continue
            if isinstance(json_ld_item, dict):
                json_ld.append(json_ld_item)
            elif isinstance(json_ld_item, (list, tuple)):
                json_ld.extend(json_ld_item)
            if json_ld:
                json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
        if json_ld:
            return json_ld
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract JSON-LD')
        else:
            self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
            return {}

    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            json_ld = [json_ld]

        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def extract_interaction_type(e):
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            interaction_statistic = e.get('interactionStatistic')
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not isinstance(is_e, dict):
                    continue
                if is_e.get('@type') != 'InteractionCounter':
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For interaction count some sites provide string instead of
                # an integer (as per spec) with non-digit characters (e.g. ","),
                # so extract the count with the more relaxed str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_video_object(e):
            assert e['@type'] == 'VideoObject'
            author = e.get('author')
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of 'Organization' or 'Person' types.
                # both types can have 'name' property (inherited from 'Thing' type). [1]
                # however some websites are using 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
                'filesize': float_or_none(e.get('contentSize')),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })
            extract_interaction_statistic(e)

        for e in json_ld:
            if '@context' in e:
                item_type = e.get('@type')
                if expected_type is not None and expected_type != item_type:
                    continue
                if item_type in ('TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type == 'Movie':
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif item_type in ('Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody')),
                    })
                elif item_type == 'VideoObject':
                    extract_video_object(e)
                    if expected_type is None:
                        continue
                    else:
                        break
                video = e.get('video')
                if isinstance(video, dict) and video.get('@type') == 'VideoObject':
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break
        return dict((k, v) for k, v in info.items() if v is not None)

1477 @staticmethod
1478 def _hidden_inputs(html):
1479 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1480 hidden_inputs = {}
1481 for input in re.findall(r'(?i)(<input[^>]+>)', html):
1482 attrs = extract_attributes(input)
1483 if not attrs:
1484 continue
1485 if attrs.get('type') not in ('hidden', 'submit'):
1486 continue
1487 name = attrs.get('name') or attrs.get('id')
1488 value = attrs.get('value')
1489 if name and value is not None:
1490 hidden_inputs[name] = value
1491 return hidden_inputs
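# Usage sketch (editor's note, hypothetical markup): for html containing
#   <input type="hidden" name="csrf_token" value="abc123">
#   <input type="text" name="user" value="x">
# _hidden_inputs(html) returns {'csrf_token': 'abc123'}; the text input
# is skipped because its type is neither 'hidden' nor 'submit'.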
1492
1493 def _form_hidden_inputs(self, form_id, html):
1494 form = self._search_regex(
1495 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1496 html, '%s form' % form_id, group='form')
1497 return self._hidden_inputs(form)
1498
1499 class FormatSort:
1500 regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
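# Illustrative matches for the regex above (editor's note):
#   >>> re.match(InfoExtractor.FormatSort.regex, '+res:480').groupdict()
#   {'reverse': '+', 'field': 'res', 'separator': ':', 'limit': '480'}
#   >>> re.match(InfoExtractor.FormatSort.regex, 'br~2000').group('separator')
#   '~'
# i.e. '+' reverses the order, ':' caps at a limit and '~' prefers the
# value closest to the limit.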
1501
1502 default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
1503 'res', 'fps', 'codec:vp9.2', 'size', 'br', 'asr',
1504 'proto', 'ext', 'hasaud', 'source', 'format_id') # These must not be aliases
1505 ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
1506 'height', 'width', 'proto', 'vext', 'abr', 'aext',
1507 'fps', 'fs_approx', 'source', 'format_id')
1508
1509 settings = {
1510 'vcodec': {'type': 'ordered', 'regex': True,
1511 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1512 'acodec': {'type': 'ordered', 'regex': True,
1513 'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']},
1514 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1515 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', '.*dash', 'ws|websocket', '', 'mms|rtsp', 'none', 'f4']},
1516 'vext': {'type': 'ordered', 'field': 'video_ext',
1517 'order': ('mp4', 'webm', 'flv', '', 'none'),
1518 'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1519 'aext': {'type': 'ordered', 'field': 'audio_ext',
1520 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1521 'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1522 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1523 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
1524 'field': ('vcodec', 'acodec'),
1525 'function': lambda it: int(any(v != 'none' for v in it))},
1526 'ie_pref': {'priority': True, 'type': 'extractor'},
1527 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1528 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1529 'lang': {'convert': 'ignore', 'field': 'language_preference'},
1530 'quality': {'convert': 'float_none', 'default': -1},
1531 'filesize': {'convert': 'bytes'},
1532 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1533 'id': {'convert': 'string', 'field': 'format_id'},
1534 'height': {'convert': 'float_none'},
1535 'width': {'convert': 'float_none'},
1536 'fps': {'convert': 'float_none'},
1537 'tbr': {'convert': 'float_none'},
1538 'vbr': {'convert': 'float_none'},
1539 'abr': {'convert': 'float_none'},
1540 'asr': {'convert': 'float_none'},
1541 'source': {'convert': 'ignore', 'field': 'source_preference'},
1542
1543 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1544 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1545 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1546 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1547 'res': {'type': 'multiple', 'field': ('height', 'width'),
1548 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
1549
1550 # Most of these exist only for compatibility reasons
1551 'dimension': {'type': 'alias', 'field': 'res'},
1552 'resolution': {'type': 'alias', 'field': 'res'},
1553 'extension': {'type': 'alias', 'field': 'ext'},
1554 'bitrate': {'type': 'alias', 'field': 'br'},
1555 'total_bitrate': {'type': 'alias', 'field': 'tbr'},
1556 'video_bitrate': {'type': 'alias', 'field': 'vbr'},
1557 'audio_bitrate': {'type': 'alias', 'field': 'abr'},
1558 'framerate': {'type': 'alias', 'field': 'fps'},
1559 'language_preference': {'type': 'alias', 'field': 'lang'}, # not named as 'language' because such a field exists
1560 'protocol': {'type': 'alias', 'field': 'proto'},
1561 'source_preference': {'type': 'alias', 'field': 'source'},
1562 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1563 'filesize_estimate': {'type': 'alias', 'field': 'size'},
1564 'samplerate': {'type': 'alias', 'field': 'asr'},
1565 'video_ext': {'type': 'alias', 'field': 'vext'},
1566 'audio_ext': {'type': 'alias', 'field': 'aext'},
1567 'video_codec': {'type': 'alias', 'field': 'vcodec'},
1568 'audio_codec': {'type': 'alias', 'field': 'acodec'},
1569 'video': {'type': 'alias', 'field': 'hasvid'},
1570 'has_video': {'type': 'alias', 'field': 'hasvid'},
1571 'audio': {'type': 'alias', 'field': 'hasaud'},
1572 'has_audio': {'type': 'alias', 'field': 'hasaud'},
1573 'extractor': {'type': 'alias', 'field': 'ie_pref'},
1574 'preference': {'type': 'alias', 'field': 'ie_pref'},
1575 'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
1576 'format_id': {'type': 'alias', 'field': 'id'},
1577 }
1578
1579 _order = []
1580
1581 def _get_field_setting(self, field, key):
1582 if field not in self.settings:
1583 self.settings[field] = {}
1584 propObj = self.settings[field]
1585 if key not in propObj:
1586 type = propObj.get('type')
1587 if key == 'field':
1588 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1589 elif key == 'convert':
1590 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1591 else:
1592 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1593 propObj[key] = default
1594 return propObj[key]
1595
1596 def _resolve_field_value(self, field, value, convertNone=False):
1597 if value is None:
1598 if not convertNone:
1599 return None
1600 else:
1601 value = value.lower()
1602 conversion = self._get_field_setting(field, 'convert')
1603 if conversion == 'ignore':
1604 return None
1605 if conversion == 'string':
1606 return value
1607 elif conversion == 'float_none':
1608 return float_or_none(value)
1609 elif conversion == 'bytes':
1610 return FileDownloader.parse_bytes(value)
1611 elif conversion == 'order':
1612 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1613 use_regex = self._get_field_setting(field, 'regex')
1614 list_length = len(order_list)
1615 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1616 if use_regex and value is not None:
1617 for i, regex in enumerate(order_list):
1618 if regex and re.match(regex, value):
1619 return list_length - i
1620 return list_length - empty_pos # not in list
1621 else: # not regex or value = None
1622 return list_length - (order_list.index(value) if value in order_list else empty_pos)
1623 else:
1624 if value.isnumeric():
1625 return float(value)
1626 else:
1627 self.settings[field]['convert'] = 'string'
1628 return value
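# Worked example for the 'order' conversion (editor's note): for the
# 'vext' field the order list is ('mp4', 'webm', 'flv', '', 'none'), so
# list_length is 5 and a value of 'webm' (index 1) resolves to 5 - 1 = 4,
# 'mp4' to 5, and an unlisted extension falls back to the '' slot
# (index 3), resolving to 5 - 3 = 2; larger resolved values rank as
# better quality.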
1629
1630 def evaluate_params(self, params, sort_extractor):
1631 self._use_free_order = params.get('prefer_free_formats', False)
1632 self._sort_user = params.get('format_sort', [])
1633 self._sort_extractor = sort_extractor
1634
1635 def add_item(field, reverse, closest, limit_text):
1636 field = field.lower()
1637 if field in self._order:
1638 return
1639 self._order.append(field)
1640 limit = self._resolve_field_value(field, limit_text)
1641 data = {
1642 'reverse': reverse,
1643 'closest': False if limit is None else closest,
1644 'limit_text': limit_text,
1645 'limit': limit}
1646 if field in self.settings:
1647 self.settings[field].update(data)
1648 else:
1649 self.settings[field] = data
1650
1651 sort_list = (
1652 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1653 + (tuple() if params.get('format_sort_force', False)
1654 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1655 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1656
1657 for item in sort_list:
1658 match = re.match(self.regex, item)
1659 if match is None:
1660 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1661 field = match.group('field')
1662 if field is None:
1663 continue
1664 if self._get_field_setting(field, 'type') == 'alias':
1665 field = self._get_field_setting(field, 'field')
1666 reverse = match.group('reverse') is not None
1667 closest = match.group('separator') == '~'
1668 limit_text = match.group('limit')
1669
1670 has_limit = limit_text is not None
1671 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1672 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1673
1674 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1675 limits = limit_text.split(":") if has_multiple_limits else (limit_text,) if has_limit else tuple()
1676 limit_count = len(limits)
1677 for (i, f) in enumerate(fields):
1678 add_item(f, reverse, closest,
1679 limits[i] if i < limit_count
1680 else limits[0] if has_limit and not has_multiple_limits
1681 else None)
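# Worked example (editor's note): a user sort of 'br:1000' hits the
# combined 'br' field, which expands to ('tbr', 'vbr', 'abr'); since
# 'br' has same_limit set, add_item is called for each of the three
# fields with the single limit '1000'.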
1682
1683 def print_verbose_info(self, write_debug):
1684 if self._sort_user:
1685 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1686 if self._sort_extractor:
1687 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1688 write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1689 '+' if self._get_field_setting(field, 'reverse') else '', field,
1690 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1691 self._get_field_setting(field, 'limit_text'),
1692 self._get_field_setting(field, 'limit'))
1693 if self._get_field_setting(field, 'limit_text') is not None else '')
1694 for field in self._order if self._get_field_setting(field, 'visible')]))
1695
1696 def _calculate_field_preference_from_value(self, format, field, type, value):
1697 reverse = self._get_field_setting(field, 'reverse')
1698 closest = self._get_field_setting(field, 'closest')
1699 limit = self._get_field_setting(field, 'limit')
1700
1701 if type == 'extractor':
1702 maximum = self._get_field_setting(field, 'max')
1703 if value is None or (maximum is not None and value >= maximum):
1704 value = -1
1705 elif type == 'boolean':
1706 in_list = self._get_field_setting(field, 'in_list')
1707 not_in_list = self._get_field_setting(field, 'not_in_list')
1708 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1709 elif type == 'ordered':
1710 value = self._resolve_field_value(field, value, True)
1711
1712 # try to convert to number
1713 val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1714 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1715 if is_num:
1716 value = val_num
1717
1718 return ((-10, 0) if value is None
1719 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
1720 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1721 else (0, value, 0) if not reverse and (limit is None or value <= limit)
1722 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1723 else (-1, value, 0))
1724
1725 def _calculate_field_preference(self, format, field):
1726 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
1727 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1728 if type == 'multiple':
1729 type = 'field' # Only 'field' is allowed in multiple for now
1730 actual_fields = self._get_field_setting(field, 'field')
1731
1732 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1733 else:
1734 value = get_value(field)
1735 return self._calculate_field_preference_from_value(format, field, type, value)
1736
1737 def calculate_preference(self, format):
1738 # Determine missing protocol
1739 if not format.get('protocol'):
1740 format['protocol'] = determine_protocol(format)
1741
1742 # Determine missing ext
1743 if not format.get('ext') and 'url' in format:
1744 format['ext'] = determine_ext(format['url'])
1745 if format.get('vcodec') == 'none':
1746 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1747 format['video_ext'] = 'none'
1748 else:
1749 format['video_ext'] = format['ext']
1750 format['audio_ext'] = 'none'
1751 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
1752 # format['preference'] = -1000
1753
1754 # Determine missing bitrates
1755 if format.get('tbr') is None:
1756 if format.get('vbr') is not None and format.get('abr') is not None:
1757 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1758 else:
1759 if format.get('vcodec') != "none" and format.get('vbr') is None:
1760 format['vbr'] = format.get('tbr') - format.get('abr', 0)
1761 if format.get('acodec') != "none" and format.get('abr') is None:
1762 format['abr'] = format.get('tbr') - format.get('vbr', 0)
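# E.g. (editor's note) a format with vbr=1500 and abr=128 gets
# tbr=1628, while one with tbr=1628 and abr=128 gets vbr=1500.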
1763
1764 return tuple(self._calculate_field_preference(format, field) for field in self._order)
1765
1766 def _sort_formats(self, formats, field_preference=[]):
1767 if not formats:
1768 return
1769 format_sort = self.FormatSort() # params and to_screen are taken from the downloader
1770 format_sort.evaluate_params(self._downloader.params, field_preference)
1771 if self.get_param('verbose', False):
1772 format_sort.print_verbose_info(self._downloader.write_debug)
1773 formats.sort(key=lambda f: format_sort.calculate_preference(f))
1774
1775 def _check_formats(self, formats, video_id):
1776 if formats:
1777 formats[:] = filter(
1778 lambda f: self._is_valid_url(
1779 f['url'], video_id,
1780 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1781 formats)
1782
1783 @staticmethod
1784 def _remove_duplicate_formats(formats):
1785 format_urls = set()
1786 unique_formats = []
1787 for f in formats:
1788 if f['url'] not in format_urls:
1789 format_urls.add(f['url'])
1790 unique_formats.append(f)
1791 formats[:] = unique_formats
1792
1793 def _is_valid_url(self, url, video_id, item='video', headers={}):
1794 url = self._proto_relative_url(url, scheme='http:')
1795 # For now, assume non-HTTP(S) URLs are always valid
1796 if not (url.startswith('http://') or url.startswith('https://')):
1797 return True
1798 try:
1799 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1800 return True
1801 except ExtractorError as e:
1802 self.to_screen(
1803 '%s: %s URL is invalid, skipping: %s'
1804 % (video_id, item, error_to_compat_str(e.cause)))
1805 return False
1806
1807 def http_scheme(self):
1808 """ Either "http:" or "https:", depending on the user's preferences """
1809 return (
1810 'http:'
1811 if self.get_param('prefer_insecure', False)
1812 else 'https:')
1813
1814 def _proto_relative_url(self, url, scheme=None):
1815 if url is None:
1816 return url
1817 if url.startswith('//'):
1818 if scheme is None:
1819 scheme = self.http_scheme()
1820 return scheme + url
1821 else:
1822 return url
1823
1824 def _sleep(self, timeout, video_id, msg_template=None):
1825 if msg_template is None:
1826 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1827 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1828 self.to_screen(msg)
1829 time.sleep(timeout)
1830
1831 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1832 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1833 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1834 manifest = self._download_xml(
1835 manifest_url, video_id, 'Downloading f4m manifest',
1836 'Unable to download f4m manifest',
1837 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1838 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1839 transform_source=transform_source,
1840 fatal=fatal, data=data, headers=headers, query=query)
1841
1842 if manifest is False:
1843 return []
1844
1845 return self._parse_f4m_formats(
1846 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1847 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1848
1849 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1850 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1851 fatal=True, m3u8_id=None):
1852 if not isinstance(manifest, compat_etree_Element) and not fatal:
1853 return []
1854
1855 # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1856 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1857 if akamai_pv is not None and ';' in akamai_pv.text:
1858 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1859 if playerVerificationChallenge.strip() != '':
1860 return []
1861
1862 formats = []
1863 manifest_version = '1.0'
1864 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1865 if not media_nodes:
1866 manifest_version = '2.0'
1867 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1868 # Remove unsupported DRM-protected media renditions from the final
1869 # formats (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1870 media_nodes = remove_encrypted_media(media_nodes)
1871 if not media_nodes:
1872 return formats
1873
1874 manifest_base_url = get_base_url(manifest)
1875
1876 bootstrap_info = xpath_element(
1877 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1878 'bootstrap info', default=None)
1879
1880 vcodec = None
1881 mime_type = xpath_text(
1882 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1883 'base URL', default=None)
1884 if mime_type and mime_type.startswith('audio/'):
1885 vcodec = 'none'
1886
1887 for i, media_el in enumerate(media_nodes):
1888 tbr = int_or_none(media_el.attrib.get('bitrate'))
1889 width = int_or_none(media_el.attrib.get('width'))
1890 height = int_or_none(media_el.attrib.get('height'))
1891 format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1892 # If <bootstrapInfo> is present, the specified f4m is a
1893 # stream-level manifest, and only set-level manifests may refer to
1894 # external resources. See sections 11.4 and 4 of the F4M spec
1895 if bootstrap_info is None:
1896 media_url = None
1897 # @href is introduced in 2.0, see section 11.6 of F4M spec
1898 if manifest_version == '2.0':
1899 media_url = media_el.attrib.get('href')
1900 if media_url is None:
1901 media_url = media_el.attrib.get('url')
1902 if not media_url:
1903 continue
1904 manifest_url = (
1905 media_url if media_url.startswith('http://') or media_url.startswith('https://')
1906 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1907 # If media_url is itself an f4m manifest, do the recursive extraction,
1908 # since bitrates in the parent manifest (this one) and the media_url
1909 # manifest may differ, making it impossible to resolve the format by
1910 # the requested bitrate in the f4m downloader
1911 ext = determine_ext(manifest_url)
1912 if ext == 'f4m':
1913 f4m_formats = self._extract_f4m_formats(
1914 manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1915 transform_source=transform_source, fatal=fatal)
1916 # Sometimes a stream-level manifest contains a single media entry that
1917 # lacks any quality metadata (e.g. http://matchtv.ru/#live-player).
1918 # At the same time, the parent's media entry in the set-level manifest
1919 # may contain it, so we copy it from the parent in such cases.
1920 if len(f4m_formats) == 1:
1921 f = f4m_formats[0]
1922 f.update({
1923 'tbr': f.get('tbr') or tbr,
1924 'width': f.get('width') or width,
1925 'height': f.get('height') or height,
1926 'format_id': f.get('format_id') if not tbr else format_id,
1927 'vcodec': vcodec,
1928 })
1929 formats.extend(f4m_formats)
1930 continue
1931 elif ext == 'm3u8':
1932 formats.extend(self._extract_m3u8_formats(
1933 manifest_url, video_id, 'mp4', preference=preference,
1934 quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1935 continue
1936 formats.append({
1937 'format_id': format_id,
1938 'url': manifest_url,
1939 'manifest_url': manifest_url,
1940 'ext': 'flv' if bootstrap_info is not None else None,
1941 'protocol': 'f4m',
1942 'tbr': tbr,
1943 'width': width,
1944 'height': height,
1945 'vcodec': vcodec,
1946 'preference': preference,
1947 'quality': quality,
1948 })
1949 return formats
1950
1951 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1952 return {
1953 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1954 'url': m3u8_url,
1955 'ext': ext,
1956 'protocol': 'm3u8',
1957 'preference': preference - 100 if preference else -100,
1958 'quality': quality,
1959 'resolution': 'multiple',
1960 'format_note': 'Quality selection URL',
1961 }
1962
1963 def _extract_m3u8_formats(self, *args, **kwargs):
1964 fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
1965 if subs:
1966 self.report_warning(bug_reports_message(
1967 "Ignoring subtitle tracks found in the HLS manifest; "
1968 "if any subtitle tracks are missing,"
1969 ), only_once=True)
1970 return fmts
1971
1972 def _extract_m3u8_formats_and_subtitles(
1973 self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
1974 preference=None, quality=None, m3u8_id=None, note=None,
1975 errnote=None, fatal=True, live=False, data=None, headers={},
1976 query={}):
1977
1978 res = self._download_webpage_handle(
1979 m3u8_url, video_id,
1980 note='Downloading m3u8 information' if note is None else note,
1981 errnote='Failed to download m3u8 information' if errnote is None else errnote,
1982 fatal=fatal, data=data, headers=headers, query=query)
1983
1984 if res is False:
1985 return [], {}
1986
1987 m3u8_doc, urlh = res
1988 m3u8_url = urlh.geturl()
1989
1990 return self._parse_m3u8_formats_and_subtitles(
1991 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1992 preference=preference, quality=quality, m3u8_id=m3u8_id,
1993 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
1994 headers=headers, query=query, video_id=video_id)
1995
1996 def _parse_m3u8_formats_and_subtitles(
1997 self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8_native',
1998 preference=None, quality=None, m3u8_id=None, live=False, note=None,
1999 errnote=None, fatal=True, data=None, headers={}, query={},
2000 video_id=None):
2001 formats, subtitles = [], {}
2002
2003 if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
2004 return formats, subtitles
2005
2006 has_drm = re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc)
2007
2008 def format_url(url):
2009 return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
2010
2011 if self.get_param('hls_split_discontinuity', False):
2012 def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2013 if not m3u8_doc:
2014 if not manifest_url:
2015 return []
2016 m3u8_doc = self._download_webpage(
2017 manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2018 note=False, errnote='Failed to download m3u8 playlist information')
2019 if m3u8_doc is False:
2020 return []
2021 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2022
2023 else:
2024 def _extract_m3u8_playlist_indices(*args, **kwargs):
2025 return [None]
2026
2027 # References:
2028 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2029 # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2030 # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2031
2032 # We should try extracting formats only from master playlists [1, 4.3.4],
2033 # i.e. playlists that describe the available qualities. On the other hand,
2034 # media playlists [1, 4.3.3] should be returned as is since they contain
2035 # just the media without quality renditions.
2036 # Fortunately, a master playlist can easily be distinguished from a media
2037 # playlist based on the availability of particular tags. As per [1, 4.3.3,
2038 # 4.3.4], master playlist tags MUST NOT appear in a media playlist and
2039 # vice versa. As per [1, 4.3.3.1], the #EXT-X-TARGETDURATION tag is
2040 # REQUIRED for every media playlist and MUST NOT appear in a master
2041 # playlist, so we can reliably detect a media playlist by this criterion.
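# A minimal sketch of the two playlist kinds (editor's note): a master
# playlist looks like
#   #EXTM3U
#   #EXT-X-STREAM-INF:BANDWIDTH=1280000,RESOLUTION=640x360
#   low/index.m3u8
# whereas a media playlist carries #EXT-X-TARGETDURATION and the segments
# themselves:
#   #EXTM3U
#   #EXT-X-TARGETDURATION:10
#   #EXTINF:9.009,
#   segment0.ts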
2042
2043 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
2044 formats = [{
2045 'format_id': '-'.join(map(str, filter(None, [m3u8_id, idx]))),
2046 'format_index': idx,
2047 'url': m3u8_url,
2048 'ext': ext,
2049 'protocol': entry_protocol,
2050 'preference': preference,
2051 'quality': quality,
2052 'has_drm': has_drm,
2053 } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2054
2055 return formats, subtitles
2056
2057 groups = {}
2058 last_stream_inf = {}
2059
2060 def extract_media(x_media_line):
2061 media = parse_m3u8_attributes(x_media_line)
2062 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2063 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2064 if not (media_type and group_id and name):
2065 return
2066 groups.setdefault(group_id, []).append(media)
2067 # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2068 if media_type == 'SUBTITLES':
2069 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2070 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2071 # However, lack of URI has been spotted in the wild.
2072 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2073 if not media.get('URI'):
2074 return
2075 url = format_url(media['URI'])
2076 sub_info = {
2077 'url': url,
2078 'ext': determine_ext(url),
2079 }
2080 if sub_info['ext'] == 'm3u8':
2081 # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2082 # files may contain is WebVTT:
2083 # <https://tools.ietf.org/html/rfc8216#section-3.1>
2084 sub_info['ext'] = 'vtt'
2085 sub_info['protocol'] = 'm3u8_native'
2086 lang = media.get('LANGUAGE') or 'und'
2087 subtitles.setdefault(lang, []).append(sub_info)
2088 if media_type not in ('VIDEO', 'AUDIO'):
2089 return
2090 media_url = media.get('URI')
2091 if media_url:
2092 manifest_url = format_url(media_url)
2093 formats.extend({
2094 'format_id': '-'.join(map(str, filter(None, (m3u8_id, group_id, name, idx)))),
2095 'format_note': name,
2096 'format_index': idx,
2097 'url': manifest_url,
2098 'manifest_url': m3u8_url,
2099 'language': media.get('LANGUAGE'),
2100 'ext': ext,
2101 'protocol': entry_protocol,
2102 'preference': preference,
2103 'quality': quality,
2104 'vcodec': 'none' if media_type == 'AUDIO' else None,
2105 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2106
2107 def build_stream_name():
2108 # Despite specification does not mention NAME attribute for
2109 # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2110 # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2111 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2112 stream_name = last_stream_inf.get('NAME')
2113 if stream_name:
2114 return stream_name
2115 # If there is no NAME in EXT-X-STREAM-INF, it will be obtained
2116 # from the corresponding rendition group
2117 stream_group_id = last_stream_inf.get('VIDEO')
2118 if not stream_group_id:
2119 return
2120 stream_group = groups.get(stream_group_id)
2121 if not stream_group:
2122 return stream_group_id
2123 rendition = stream_group[0]
2124 return rendition.get('NAME') or stream_group_id
2125
2126 # Parse EXT-X-MEDIA tags before EXT-X-STREAM-INF so that video-only
2127 # formats can be detected when EXT-X-STREAM-INF tags precede
2128 # EXT-X-MEDIA tags in the HLS manifest, as in [3].
2129 for line in m3u8_doc.splitlines():
2130 if line.startswith('#EXT-X-MEDIA:'):
2131 extract_media(line)
2132
2133 for line in m3u8_doc.splitlines():
2134 if line.startswith('#EXT-X-STREAM-INF:'):
2135 last_stream_inf = parse_m3u8_attributes(line)
2136 elif line.startswith('#') or not line.strip():
2137 continue
2138 else:
2139 tbr = float_or_none(
2140 last_stream_inf.get('AVERAGE-BANDWIDTH')
2141 or last_stream_inf.get('BANDWIDTH'), scale=1000)
2142 manifest_url = format_url(line.strip())
2143
2144 for idx in _extract_m3u8_playlist_indices(manifest_url):
2145 format_id = [m3u8_id, None, idx]
2146 # The bandwidth of live streams may differ over time, making
2147 # format_id unpredictable, so it's better to keep the provided
2148 # format_id intact.
2149 if not live:
2150 stream_name = build_stream_name()
2151 format_id[1] = stream_name if stream_name else '%d' % (tbr if tbr else len(formats))
2152 f = {
2153 'format_id': '-'.join(map(str, filter(None, format_id))),
2154 'format_index': idx,
2155 'url': manifest_url,
2156 'manifest_url': m3u8_url,
2157 'tbr': tbr,
2158 'ext': ext,
2159 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2160 'protocol': entry_protocol,
2161 'preference': preference,
2162 'quality': quality,
2163 }
2164 resolution = last_stream_inf.get('RESOLUTION')
2165 if resolution:
2166 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2167 if mobj:
2168 f['width'] = int(mobj.group('width'))
2169 f['height'] = int(mobj.group('height'))
2170 # Unified Streaming Platform
2171 mobj = re.search(
2172 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2173 if mobj:
2174 abr, vbr = mobj.groups()
2175 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2176 f.update({
2177 'vbr': vbr,
2178 'abr': abr,
2179 })
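# E.g. (editor's note, hypothetical URL): a URL containing
# 'audio=128000-video=2500000' yields abr=128.0 and vbr=2500.0.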
2180 codecs = parse_codecs(last_stream_inf.get('CODECS'))
2181 f.update(codecs)
2182 audio_group_id = last_stream_inf.get('AUDIO')
2183 # As per [1, 4.3.4.1.1], any EXT-X-STREAM-INF tag which
2184 # references a rendition group MUST have a CODECS attribute.
2185 # However, this is not always respected: for example, [2]
2186 # contains an EXT-X-STREAM-INF tag which references an AUDIO
2187 # rendition group but has no CODECS and, despite
2188 # referencing an audio group, represents a complete
2189 # (with audio and video) format. So, for such cases we
2190 # ignore references to rendition groups and treat them
2191 # as complete formats.
2192 if audio_group_id and codecs and f.get('vcodec') != 'none':
2193 audio_group = groups.get(audio_group_id)
2194 if audio_group and audio_group[0].get('URI'):
2195 # TODO: update acodec for audio only formats with
2196 # the same GROUP-ID
2197 f['acodec'] = 'none'
2198 if not f.get('ext'):
2199 f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2200 formats.append(f)
2201
2202 # for DailyMotion
2203 progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2204 if progressive_uri:
2205 http_f = f.copy()
2206 del http_f['manifest_url']
2207 http_f.update({
2208 'format_id': f['format_id'].replace('hls-', 'http-'),
2209 'protocol': 'http',
2210 'url': progressive_uri,
2211 })
2212 formats.append(http_f)
2213
2214 last_stream_inf = {}
2215 return formats, subtitles
2216
2217 @staticmethod
2218 def _xpath_ns(path, namespace=None):
2219 if not namespace:
2220 return path
2221 out = []
2222 for c in path.split('/'):
2223 if not c or c == '.':
2224 out.append(c)
2225 else:
2226 out.append('{%s}%s' % (namespace, c))
2227 return '/'.join(out)
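# E.g. (editor's note, with a placeholder namespace):
#   >>> InfoExtractor._xpath_ns('./head/meta', 'urn:example')
#   './{urn:example}head/{urn:example}meta'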
2228
2229 def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2230 smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2231
2232 if smil is False:
2233 assert not fatal
2234 return [], {}
2235
2236 namespace = self._parse_smil_namespace(smil)
2237
2238 fmts = self._parse_smil_formats(
2239 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2240 subs = self._parse_smil_subtitles(
2241 smil, namespace=namespace)
2242
2243 return fmts, subs
2244
2245 def _extract_smil_formats(self, *args, **kwargs):
2246 fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2247 if subs:
2248 self.report_warning(bug_reports_message(
2249 "Ignoring subtitle tracks found in the SMIL manifest; "
2250 "if any subtitle tracks are missing,"
2251 ), only_once=True)
2252 return fmts
2253
2254 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2255 smil = self._download_smil(smil_url, video_id, fatal=fatal)
2256 if smil is False:
2257 return {}
2258 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2259
2260 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2261 return self._download_xml(
2262 smil_url, video_id, 'Downloading SMIL file',
2263 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2264
2265 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2266 namespace = self._parse_smil_namespace(smil)
2267
2268 formats = self._parse_smil_formats(
2269 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2270 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2271
2272 video_id = os.path.splitext(url_basename(smil_url))[0]
2273 title = None
2274 description = None
2275 upload_date = None
2276 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2277 name = meta.attrib.get('name')
2278 content = meta.attrib.get('content')
2279 if not name or not content:
2280 continue
2281 if not title and name == 'title':
2282 title = content
2283 elif not description and name in ('description', 'abstract'):
2284 description = content
2285 elif not upload_date and name == 'date':
2286 upload_date = unified_strdate(content)
2287
2288 thumbnails = [{
2289 'id': image.get('type'),
2290 'url': image.get('src'),
2291 'width': int_or_none(image.get('width')),
2292 'height': int_or_none(image.get('height')),
2293 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2294
2295 return {
2296 'id': video_id,
2297 'title': title or video_id,
2298 'description': description,
2299 'upload_date': upload_date,
2300 'thumbnails': thumbnails,
2301 'formats': formats,
2302 'subtitles': subtitles,
2303 }
2304
2305 def _parse_smil_namespace(self, smil):
2306 return self._search_regex(
2307 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2308
2309 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2310 base = smil_url
2311 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2312 b = meta.get('base') or meta.get('httpBase')
2313 if b:
2314 base = b
2315 break
2316
2317 formats = []
2318 rtmp_count = 0
2319 http_count = 0
2320 m3u8_count = 0
2321
2322 srcs = []
2323 media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2324 for medium in media:
2325 src = medium.get('src')
2326 if not src or src in srcs:
2327 continue
2328 srcs.append(src)
2329
2330 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2331 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2332 width = int_or_none(medium.get('width'))
2333 height = int_or_none(medium.get('height'))
2334 proto = medium.get('proto')
2335 ext = medium.get('ext')
2336 src_ext = determine_ext(src)
2337 streamer = medium.get('streamer') or base
2338
2339 if proto == 'rtmp' or streamer.startswith('rtmp'):
2340 rtmp_count += 1
2341 formats.append({
2342 'url': streamer,
2343 'play_path': src,
2344 'ext': 'flv',
2345 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2346 'tbr': bitrate,
2347 'filesize': filesize,
2348 'width': width,
2349 'height': height,
2350 })
2351 if transform_rtmp_url:
2352 streamer, src = transform_rtmp_url(streamer, src)
2353 formats[-1].update({
2354 'url': streamer,
2355 'play_path': src,
2356 })
2357 continue
2358
2359 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2360 src_url = src_url.strip()
2361
2362 if proto == 'm3u8' or src_ext == 'm3u8':
2363 m3u8_formats = self._extract_m3u8_formats(
2364 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2365 if len(m3u8_formats) == 1:
2366 m3u8_count += 1
2367 m3u8_formats[0].update({
2368 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2369 'tbr': bitrate,
2370 'width': width,
2371 'height': height,
2372 })
2373 formats.extend(m3u8_formats)
2374 elif src_ext == 'f4m':
2375 f4m_url = src_url
2376 if not f4m_params:
2377 f4m_params = {
2378 'hdcore': '3.2.0',
2379 'plugin': 'flowplayer-3.2.0.1',
2380 }
2381 f4m_url += '&' if '?' in f4m_url else '?'
2382 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2383 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2384 elif src_ext == 'mpd':
2385 formats.extend(self._extract_mpd_formats(
2386 src_url, video_id, mpd_id='dash', fatal=False))
2387 elif re.search(r'\.ism/[Mm]anifest', src_url):
2388 formats.extend(self._extract_ism_formats(
2389 src_url, video_id, ism_id='mss', fatal=False))
2390 elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2391 http_count += 1
2392 formats.append({
2393 'url': src_url,
2394 'ext': ext or src_ext or 'flv',
2395 'format_id': 'http-%d' % (bitrate or http_count),
2396 'tbr': bitrate,
2397 'filesize': filesize,
2398 'width': width,
2399 'height': height,
2400 })
2401
2402 return formats
2403
2404 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2405 urls = []
2406 subtitles = {}
2407 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2408 src = textstream.get('src')
2409 if not src or src in urls:
2410 continue
2411 urls.append(src)
2412 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2413 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2414 subtitles.setdefault(lang, []).append({
2415 'url': src,
2416 'ext': ext,
2417 })
2418 return subtitles
2419
2420 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2421 xspf = self._download_xml(
2422 xspf_url, playlist_id, 'Downloading xspf playlist',
2423 'Unable to download xspf manifest', fatal=fatal)
2424 if xspf is False:
2425 return []
2426 return self._parse_xspf(
2427 xspf, playlist_id, xspf_url=xspf_url,
2428 xspf_base_url=base_url(xspf_url))
2429
2430 def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2431 NS_MAP = {
2432 'xspf': 'http://xspf.org/ns/0/',
2433 's1': 'http://static.streamone.nl/player/ns/0',
2434 }
2435
2436 entries = []
2437 for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2438 title = xpath_text(
2439 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2440 description = xpath_text(
2441 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2442 thumbnail = xpath_text(
2443 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2444 duration = float_or_none(
2445 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2446
2447 formats = []
2448 for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2449 format_url = urljoin(xspf_base_url, location.text)
2450 if not format_url:
2451 continue
2452 formats.append({
2453 'url': format_url,
2454 'manifest_url': xspf_url,
2455 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2456 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2457 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2458 })
2459 self._sort_formats(formats)
2460
2461 entries.append({
2462 'id': playlist_id,
2463 'title': title,
2464 'description': description,
2465 'thumbnail': thumbnail,
2466 'duration': duration,
2467 'formats': formats,
2468 })
2469 return entries
2470
2471 def _extract_mpd_formats(self, *args, **kwargs):
2472 fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2473 if subs:
2474 self.report_warning(bug_reports_message(
2475 "Ignoring subtitle tracks found in the DASH manifest; "
2476 "if any subtitle tracks are missing,"
2477 ), only_once=True)
2478 return fmts
2479
2480 def _extract_mpd_formats_and_subtitles(
2481 self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2482 fatal=True, data=None, headers={}, query={}):
2483 res = self._download_xml_handle(
2484 mpd_url, video_id,
2485 note='Downloading MPD manifest' if note is None else note,
2486 errnote='Failed to download MPD manifest' if errnote is None else errnote,
2487 fatal=fatal, data=data, headers=headers, query=query)
2488 if res is False:
2489 return [], {}
2490 mpd_doc, urlh = res
2491 if mpd_doc is None:
2492 return [], {}
2493 mpd_base_url = base_url(urlh.geturl())
2494
2495 return self._parse_mpd_formats_and_subtitles(
2496 mpd_doc, mpd_id, mpd_base_url, mpd_url)
2497
2498 def _parse_mpd_formats(self, *args, **kwargs):
2499 fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2500 if subs:
2501 self.report_warning(bug_reports_message(
2502 "Ignoring subtitle tracks found in the DASH manifest; "
2503 "if any subtitle tracks are missing,"
2504 ), only_once=True)
2505 return fmts
2506
2507 def _parse_mpd_formats_and_subtitles(
2508 self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2509 """
2510 Parse formats from MPD manifest.
2511 References:
2512 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2513 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2514 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2515 """
2516 if not self.get_param('dynamic_mpd', True):
2517 if mpd_doc.get('type') == 'dynamic':
2518 return [], {}
2519
2520 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2521
2522 def _add_ns(path):
2523 return self._xpath_ns(path, namespace)
2524
2525 def is_drm_protected(element):
2526 return element.find(_add_ns('ContentProtection')) is not None
2527
2528 def extract_multisegment_info(element, ms_parent_info):
2529 ms_info = ms_parent_info.copy()
2530
2531 # As per [1, 5.3.9.2.2], SegmentList and SegmentTemplate share some
2532 # common attributes and elements; we only extract those
2533 # relevant for us.
2534 def extract_common(source):
2535 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2536 if segment_timeline is not None:
2537 s_e = segment_timeline.findall(_add_ns('S'))
2538 if s_e:
2539 ms_info['total_number'] = 0
2540 ms_info['s'] = []
2541 for s in s_e:
2542 r = int(s.get('r', 0))
2543 ms_info['total_number'] += 1 + r
2544 ms_info['s'].append({
2545 't': int(s.get('t', 0)),
2546 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2547 'd': int(s.attrib['d']),
2548 'r': r,
2549 })
2550 start_number = source.get('startNumber')
2551 if start_number:
2552 ms_info['start_number'] = int(start_number)
2553 timescale = source.get('timescale')
2554 if timescale:
2555 ms_info['timescale'] = int(timescale)
2556 segment_duration = source.get('duration')
2557 if segment_duration:
2558 ms_info['segment_duration'] = float(segment_duration)
2559
2560 def extract_Initialization(source):
2561 initialization = source.find(_add_ns('Initialization'))
2562 if initialization is not None:
2563 ms_info['initialization_url'] = initialization.attrib['sourceURL']
2564
2565 segment_list = element.find(_add_ns('SegmentList'))
2566 if segment_list is not None:
2567 extract_common(segment_list)
2568 extract_Initialization(segment_list)
2569 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2570 if segment_urls_e:
2571 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2572 else:
2573 segment_template = element.find(_add_ns('SegmentTemplate'))
2574 if segment_template is not None:
2575 extract_common(segment_template)
2576 media = segment_template.get('media')
2577 if media:
2578 ms_info['media'] = media
2579 initialization = segment_template.get('initialization')
2580 if initialization:
2581 ms_info['initialization'] = initialization
2582 else:
2583 extract_Initialization(segment_template)
2584 return ms_info
2585
2586 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2587 formats, subtitles = [], {}
2588 stream_numbers = {'audio': 0, 'video': 0}
2589 for period in mpd_doc.findall(_add_ns('Period')):
2590 period_duration = parse_duration(period.get('duration')) or mpd_duration
2591 period_ms_info = extract_multisegment_info(period, {
2592 'start_number': 1,
2593 'timescale': 1,
2594 })
2595 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2596 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2597 for representation in adaptation_set.findall(_add_ns('Representation')):
2598 representation_attrib = adaptation_set.attrib.copy()
2599 representation_attrib.update(representation.attrib)
2600 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2601 mime_type = representation_attrib['mimeType']
2602 content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2603
2604 codecs = representation_attrib.get('codecs', '')
2605 if content_type not in ('video', 'audio', 'text'):
2606 if mime_type == 'image/jpeg':
2607 content_type = mime_type
2608 elif codecs.split('.')[0] == 'stpp':
2609 content_type = 'text'
2610 else:
2611 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2612 continue
2613
2614 base_url = ''
2615 for element in (representation, adaptation_set, period, mpd_doc):
2616 base_url_e = element.find(_add_ns('BaseURL'))
2617 if base_url_e is not None:
2618 base_url = base_url_e.text + base_url
2619 if re.match(r'^https?://', base_url):
2620 break
2621 if mpd_base_url and not re.match(r'^https?://', base_url):
2622 if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
2623 mpd_base_url += '/'
2624 base_url = mpd_base_url + base_url
2625 representation_id = representation_attrib.get('id')
2626 lang = representation_attrib.get('lang')
2627 url_el = representation.find(_add_ns('BaseURL'))
2628 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2629 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2630 if representation_id is not None:
2631 format_id = representation_id
2632 else:
2633 format_id = content_type
2634 if mpd_id:
2635 format_id = mpd_id + '-' + format_id
2636 if content_type in ('video', 'audio'):
2637 f = {
2638 'format_id': format_id,
2639 'manifest_url': mpd_url,
2640 'ext': mimetype2ext(mime_type),
2641 'width': int_or_none(representation_attrib.get('width')),
2642 'height': int_or_none(representation_attrib.get('height')),
2643 'tbr': float_or_none(bandwidth, 1000),
2644 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2645 'fps': int_or_none(representation_attrib.get('frameRate')),
2646 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2647 'format_note': 'DASH %s' % content_type,
2648 'filesize': filesize,
2649 'container': mimetype2ext(mime_type) + '_dash',
2650 'manifest_stream_number': stream_numbers[content_type]
2651 }
2652 f.update(parse_codecs(codecs))
2653 stream_numbers[content_type] += 1
2654 elif content_type == 'text':
2655 f = {
2656 'ext': mimetype2ext(mime_type),
2657 'manifest_url': mpd_url,
2658 'filesize': filesize,
2659 }
2660 elif content_type == 'image/jpeg':
2661 # See test case in VikiIE
2662 # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2663 f = {
2664 'format_id': format_id,
2665 'ext': 'mhtml',
2666 'manifest_url': mpd_url,
2667 'format_note': 'DASH storyboards (jpeg)',
2668 'acodec': 'none',
2669 'vcodec': 'none',
2670 }
2671 if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2672 f['has_drm'] = True
2673 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2674
2675 def prepare_template(template_name, identifiers):
2676 tmpl = representation_ms_info[template_name]
2677 # First of all, % characters outside $...$ templates
2678 # must be escaped by doubling for proper processing
2679 # by the %-operator string formatting used below (see
2680 # https://github.com/ytdl-org/youtube-dl/issues/16867).
2681 t = ''
2682 in_template = False
2683 for c in tmpl:
2684 t += c
2685 if c == '$':
2686 in_template = not in_template
2687 elif c == '%' and not in_template:
2688 t += c
2689 # Next, $...$ templates are translated to their
2690 # %(...) counterparts to be used with % operator
2691 if representation_id is not None:
2692 t = t.replace('$RepresentationID$', representation_id)
2693 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2694 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2695 t = t.replace('$$', '$')
2696 return t
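# Worked example (editor's note): a media template of
# 'seg_$Number%05d$.mp4' becomes 'seg_%(Number)05d.mp4', so
#   >>> 'seg_%(Number)05d.mp4' % {'Number': 3}
#   'seg_00003.mp4'
# A literal '%' outside $...$ is doubled first so that the % operator
# leaves it intact.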
2697
2698 # @initialization is a regular template like @media one
2699 # so it should be handled just the same way (see
2700 # https://github.com/ytdl-org/youtube-dl/issues/11605)
2701 if 'initialization' in representation_ms_info:
2702 initialization_template = prepare_template(
2703 'initialization',
2704 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2705 # $Time$ shall not be included for @initialization thus
2706 # only $Bandwidth$ remains
2707 ('Bandwidth', ))
2708 representation_ms_info['initialization_url'] = initialization_template % {
2709 'Bandwidth': bandwidth,
2710 }
2711
2712 def location_key(location):
2713 return 'url' if re.match(r'^https?://', location) else 'path'
2714
2715 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2716
2717 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2718 media_location_key = location_key(media_template)
2719
2720 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2721 # can't be used at the same time
2722 if '%(Number' in media_template and 's' not in representation_ms_info:
2723 segment_duration = None
2724 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2725 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2726 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2727 representation_ms_info['fragments'] = [{
2728 media_location_key: media_template % {
2729 'Number': segment_number,
2730 'Bandwidth': bandwidth,
2731 },
2732 'duration': segment_duration,
2733 } for segment_number in range(
2734 representation_ms_info['start_number'],
2735 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2736 else:
2737 # $Number*$ or $Time$ in media template with S list available
2738 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2739 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2740 representation_ms_info['fragments'] = []
2741 segment_time = 0
2742 segment_d = None
2743 segment_number = representation_ms_info['start_number']
2744
2745 def add_segment_url():
2746 segment_url = media_template % {
2747 'Time': segment_time,
2748 'Bandwidth': bandwidth,
2749 'Number': segment_number,
2750 }
2751 representation_ms_info['fragments'].append({
2752 media_location_key: segment_url,
2753 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2754 })
2755
2756 for num, s in enumerate(representation_ms_info['s']):
2757 segment_time = s.get('t') or segment_time
2758 segment_d = s['d']
2759 add_segment_url()
2760 segment_number += 1
2761 for r in range(s.get('r', 0)):
2762 segment_time += segment_d
2763 add_segment_url()
2764 segment_number += 1
2765 segment_time += segment_d
2766 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2767 # No media template
2768 # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2769 # or any YouTube dashsegments video
2770 fragments = []
2771 segment_index = 0
2772 timescale = representation_ms_info['timescale']
2773 for s in representation_ms_info['s']:
2774 duration = float_or_none(s['d'], timescale)
2775 for r in range(s.get('r', 0) + 1):
2776 segment_uri = representation_ms_info['segment_urls'][segment_index]
2777 fragments.append({
2778 location_key(segment_uri): segment_uri,
2779 'duration': duration,
2780 })
2781 segment_index += 1
2782 representation_ms_info['fragments'] = fragments
2783 elif 'segment_urls' in representation_ms_info:
2784 # Segment URLs with no SegmentTimeline
2785 # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2786 # https://github.com/ytdl-org/youtube-dl/pull/14844
2787 fragments = []
2788 segment_duration = float_or_none(
2789 representation_ms_info['segment_duration'],
2790 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2791 for segment_url in representation_ms_info['segment_urls']:
2792 fragment = {
2793 location_key(segment_url): segment_url,
2794 }
2795 if segment_duration:
2796 fragment['duration'] = segment_duration
2797 fragments.append(fragment)
2798 representation_ms_info['fragments'] = fragments
2799 # If there is a fragments key available, then we correctly recognized fragmented media.
2800 # Otherwise, we will assume unfragmented media with direct access. Technically, such
2801 # an assumption is not necessarily correct since we may simply have no support for
2802 # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2803 if 'fragments' in representation_ms_info:
2804 f.update({
2805 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2806 'url': mpd_url or base_url,
2807 'fragment_base_url': base_url,
2808 'fragments': [],
2809 'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2810 })
2811 if 'initialization_url' in representation_ms_info:
2812 initialization_url = representation_ms_info['initialization_url']
2813 if not f.get('url'):
2814 f['url'] = initialization_url
2815 f['fragments'].append({location_key(initialization_url): initialization_url})
2816 f['fragments'].extend(representation_ms_info['fragments'])
2817 else:
2818 # Assuming direct URL to unfragmented media.
2819 f['url'] = base_url
2820 if content_type in ('video', 'audio') or mime_type == 'image/jpeg':
2821 formats.append(f)
2822 elif content_type == 'text':
2823 subtitles.setdefault(lang or 'und', []).append(f)
2824
2825 return formats, subtitles
2826
2827 def _extract_ism_formats(self, *args, **kwargs):
2828 fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2829 if subs:
2830 self.report_warning(bug_reports_message(
2831 "Ignoring subtitle tracks found in the ISM manifest; "
2832 "if any subtitle tracks are missing,"
2833 ))
2834 return fmts
2835
2836 def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2837 res = self._download_xml_handle(
2838 ism_url, video_id,
2839 note='Downloading ISM manifest' if note is None else note,
2840 errnote='Failed to download ISM manifest' if errnote is None else errnote,
2841 fatal=fatal, data=data, headers=headers, query=query)
2842 if res is False:
2843 return [], {}
2844 ism_doc, urlh = res
2845 if ism_doc is None:
2846 return [], {}
2847
2848 return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
2849
2850 def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
2851 """
2852 Parse formats from ISM manifest.
2853 References:
2854 1. [MS-SSTR]: Smooth Streaming Protocol,
2855 https://msdn.microsoft.com/en-us/library/ff469518.aspx
2856 """
2857 if ism_doc.get('IsLive') == 'TRUE':
2858 return [], {}
2859
2860 duration = int(ism_doc.attrib['Duration'])
2861 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2862
2863 formats = []
2864 subtitles = {}
2865 for stream in ism_doc.findall('StreamIndex'):
2866 stream_type = stream.get('Type')
2867 if stream_type not in ('video', 'audio', 'text'):
2868 continue
2869 url_pattern = stream.attrib['Url']
2870 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2871 stream_name = stream.get('Name')
2872 stream_language = stream.get('Language', 'und')
2873 for track in stream.findall('QualityLevel'):
2874 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
2875 # TODO: add support for WVC1 and WMAP
2876 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
2877 self.report_warning('%s is not a supported codec' % fourcc)
2878 continue
2879 tbr = int(track.attrib['Bitrate']) // 1000
# [1] does not mention the Width and Height attributes. However,
# they're often present while MaxWidth and MaxHeight are
# missing, so they should be used as fallbacks
2883 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2884 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2885 sampling_rate = int_or_none(track.get('SamplingRate'))
2886
2887 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2888 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2889
2890 fragments = []
2891 fragment_ctx = {
2892 'time': 0,
2893 }
2894 stream_fragments = stream.findall('c')
2895 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2896 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2897 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2898 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
if not fragment_ctx['duration']:
# no explicit duration; derive it from the start time of the next chunk
try:
next_fragment_time = int(stream_fragments[stream_fragment_index + 1].attrib['t'])
2902 except IndexError:
2903 next_fragment_time = duration
2904 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2905 for _ in range(fragment_repeat):
2906 fragments.append({
2907 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2908 'duration': fragment_ctx['duration'] / stream_timescale,
2909 })
2910 fragment_ctx['time'] += fragment_ctx['duration']
2911
2912 format_id = []
2913 if ism_id:
2914 format_id.append(ism_id)
2915 if stream_name:
2916 format_id.append(stream_name)
2917 format_id.append(compat_str(tbr))
2918
2919 if stream_type == 'text':
2920 subtitles.setdefault(stream_language, []).append({
2921 'ext': 'ismt',
2922 'protocol': 'ism',
2923 'url': ism_url,
2924 'manifest_url': ism_url,
2925 'fragments': fragments,
2926 '_download_params': {
2927 'stream_type': stream_type,
2928 'duration': duration,
2929 'timescale': stream_timescale,
2930 'fourcc': fourcc,
2931 'language': stream_language,
2932 'codec_private_data': track.get('CodecPrivateData'),
2933 }
2934 })
2935 elif stream_type in ('video', 'audio'):
2936 formats.append({
2937 'format_id': '-'.join(format_id),
2938 'url': ism_url,
2939 'manifest_url': ism_url,
2940 'ext': 'ismv' if stream_type == 'video' else 'isma',
2941 'width': width,
2942 'height': height,
2943 'tbr': tbr,
2944 'asr': sampling_rate,
2945 'vcodec': 'none' if stream_type == 'audio' else fourcc,
2946 'acodec': 'none' if stream_type == 'video' else fourcc,
2947 'protocol': 'ism',
2948 'fragments': fragments,
2949 'has_drm': ism_doc.find('Protection') is not None,
2950 '_download_params': {
2951 'stream_type': stream_type,
2952 'duration': duration,
2953 'timescale': stream_timescale,
2954 'width': width or 0,
2955 'height': height or 0,
2956 'fourcc': fourcc,
2957 'language': stream_language,
2958 'codec_private_data': track.get('CodecPrivateData'),
2959 'sampling_rate': sampling_rate,
2960 'channels': int_or_none(track.get('Channels', 2)),
2961 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2962 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2963 },
2964 })
2965 return formats, subtitles
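# Illustrative example (assumed values, not from a real manifest): with the
# default timescale of 10000000, a chunk element <c t="0" d="20000000" r="2"/>
# expands in the loop above into two 2.0-second fragments, the second one
# starting at t=20000000.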
2966
2967 def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
2968 def absolute_url(item_url):
2969 return urljoin(base_url, item_url)
2970
2971 def parse_content_type(content_type):
2972 if not content_type:
2973 return {}
2974 ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2975 if ctr:
2976 mimetype, codecs = ctr.groups()
2977 f = parse_codecs(codecs)
2978 f['ext'] = mimetype2ext(mimetype)
2979 return f
2980 return {}
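# For example (illustrative input), parse_content_type(
#     'video/mp4; codecs="avc1.42E01E, mp4a.40.2"')
# returns roughly {'ext': 'mp4', 'vcodec': 'avc1.42E01E', 'acodec': 'mp4a.40.2'}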
2981
2982 def _media_formats(src, cur_media_type, type_info={}):
2983 full_url = absolute_url(src)
2984 ext = type_info.get('ext') or determine_ext(full_url)
2985 if ext == 'm3u8':
2986 is_plain_url = False
2987 formats = self._extract_m3u8_formats(
2988 full_url, video_id, ext='mp4',
2989 entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2990 preference=preference, quality=quality, fatal=False)
2991 elif ext == 'mpd':
2992 is_plain_url = False
2993 formats = self._extract_mpd_formats(
2994 full_url, video_id, mpd_id=mpd_id, fatal=False)
2995 else:
2996 is_plain_url = True
2997 formats = [{
2998 'url': full_url,
2999 'vcodec': 'none' if cur_media_type == 'audio' else None,
3000 }]
3001 return is_plain_url, formats
3002
3003 entries = []
3004 # amp-video and amp-audio are very similar to their HTML5 counterparts
# so we will include them right here (see
3006 # https://www.ampproject.org/docs/reference/components/amp-video)
3007 # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3008 _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3009 media_tags = [(media_tag, media_tag_name, media_type, '')
3010 for media_tag, media_tag_name, media_type
3011 in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3012 media_tags.extend(re.findall(
# We only allow video|audio followed by whitespace or '>'.
# Allowing more characters may cause a significant slowdown (see
3015 # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
3016 # http://www.porntrex.com/maps/videositemap.xml).
3017 r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
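# e.g. (illustrative): '<video src="v.mp4"><source src="s.mp4"></video>' yields
# the tuple ('<video src="v.mp4">', 'video', 'video', '<source src="s.mp4">')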
3018 for media_tag, _, media_type, media_content in media_tags:
3019 media_info = {
3020 'formats': [],
3021 'subtitles': {},
3022 }
3023 media_attributes = extract_attributes(media_tag)
3024 src = strip_or_none(media_attributes.get('src'))
3025 if src:
3026 _, formats = _media_formats(src, media_type)
3027 media_info['formats'].extend(formats)
3028 media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3029 if media_content:
3030 for source_tag in re.findall(r'<source[^>]+>', media_content):
3031 s_attr = extract_attributes(source_tag)
# data-video-src and data-src are non-standard but seen
# several times in the wild
3034 src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
3035 if not src:
3036 continue
3037 f = parse_content_type(s_attr.get('type'))
3038 is_plain_url, formats = _media_formats(src, media_type, f)
3039 if is_plain_url:
# the width, height, res, label and title attributes are
# all non-standard but seen several times in the wild
3042 labels = [
3043 s_attr.get(lbl)
3044 for lbl in ('label', 'title')
3045 if str_or_none(s_attr.get(lbl))
3046 ]
3047 width = int_or_none(s_attr.get('width'))
3048 height = (int_or_none(s_attr.get('height'))
3049 or int_or_none(s_attr.get('res')))
3050 if not width or not height:
3051 for lbl in labels:
3052 resolution = parse_resolution(lbl)
3053 if not resolution:
3054 continue
3055 width = width or resolution.get('width')
3056 height = height or resolution.get('height')
3057 for lbl in labels:
3058 tbr = parse_bitrate(lbl)
3059 if tbr:
3060 break
3061 else:
3062 tbr = None
3063 f.update({
3064 'width': width,
3065 'height': height,
3066 'tbr': tbr,
3067 'format_id': s_attr.get('label') or s_attr.get('title'),
3068 })
3069 f.update(formats[0])
3070 media_info['formats'].append(f)
3071 else:
3072 media_info['formats'].extend(formats)
3073 for track_tag in re.findall(r'<track[^>]+>', media_content):
3074 track_attributes = extract_attributes(track_tag)
3075 kind = track_attributes.get('kind')
3076 if not kind or kind in ('subtitles', 'captions'):
3077 src = strip_or_none(track_attributes.get('src'))
3078 if not src:
3079 continue
3080 lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3081 media_info['subtitles'].setdefault(lang, []).append({
3082 'url': absolute_url(src),
3083 })
3084 for f in media_info['formats']:
3085 f.setdefault('http_headers', {})['Referer'] = base_url
3086 if media_info['formats'] or media_info['subtitles']:
3087 entries.append(media_info)
3088 return entries
3089
3090 def _extract_akamai_formats(self, *args, **kwargs):
3091 fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3092 if subs:
3093 self.report_warning(bug_reports_message(
3094 "Ignoring subtitle tracks found in the manifests; "
3095 "if any subtitle tracks are missing,"
3096 ))
3097 return fmts
3098
3099 def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3100 signed = 'hdnea=' in manifest_url
3101 if not signed:
3102 # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3103 manifest_url = re.sub(
3104 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3105 '', manifest_url).strip('?')
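# e.g. (hypothetical URL) '.../master.m3u8?b=100-1000&__a__=off'
# is cleaned up to '.../master.m3u8'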
3106
3107 formats = []
3108 subtitles = {}
3109
3110 hdcore_sign = 'hdcore=3.7.0'
3111 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3112 hds_host = hosts.get('hds')
3113 if hds_host:
3114 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3115 if 'hdcore=' not in f4m_url:
3116 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3117 f4m_formats = self._extract_f4m_formats(
3118 f4m_url, video_id, f4m_id='hds', fatal=False)
3119 for entry in f4m_formats:
3120 entry.update({'extra_param_to_segment_url': hdcore_sign})
3121 formats.extend(f4m_formats)
3122
3123 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3124 hls_host = hosts.get('hls')
3125 if hls_host:
3126 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3127 m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3128 m3u8_url, video_id, 'mp4', 'm3u8_native',
3129 m3u8_id='hls', fatal=False)
3130 formats.extend(m3u8_formats)
3131 subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3132
3133 http_host = hosts.get('http')
3134 if http_host and m3u8_formats and not signed:
3135 REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3136 qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3137 qualities_length = len(qualities)
3138 if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3139 i = 0
3140 for f in m3u8_formats:
3141 if f['vcodec'] != 'none':
3142 for protocol in ('http', 'https'):
3143 http_f = f.copy()
3144 del http_f['manifest_url']
3145 http_url = re.sub(
3146 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
3147 http_f.update({
3148 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3149 'url': http_url,
3150 'protocol': protocol,
3151 })
3152 formats.append(http_f)
3153 i += 1
3154
3155 return formats, subtitles
3156
3157 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3158 query = compat_urlparse.urlparse(url).query
3159 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3160 mobj = re.search(
3161 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3162 url_base = mobj.group('url')
3163 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3164 formats = []
3165
3166 def manifest_url(manifest):
3167 m_url = '%s/%s' % (http_base_url, manifest)
3168 if query:
3169 m_url += '?%s' % query
3170 return m_url
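# e.g. (illustrative): with http_base_url = 'http://ex.com/vod/mp4:video.mp4'
# and query = 'token=abc', manifest_url('playlist.m3u8') returns
# 'http://ex.com/vod/mp4:video.mp4/playlist.m3u8?token=abc'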
3171
3172 if 'm3u8' not in skip_protocols:
3173 formats.extend(self._extract_m3u8_formats(
3174 manifest_url('playlist.m3u8'), video_id, 'mp4',
3175 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3176 if 'f4m' not in skip_protocols:
3177 formats.extend(self._extract_f4m_formats(
3178 manifest_url('manifest.f4m'),
3179 video_id, f4m_id='hds', fatal=False))
3180 if 'dash' not in skip_protocols:
3181 formats.extend(self._extract_mpd_formats(
3182 manifest_url('manifest.mpd'),
3183 video_id, mpd_id='dash', fatal=False))
3184 if re.search(r'(?:/smil:|\.smil)', url_base):
3185 if 'smil' not in skip_protocols:
3186 rtmp_formats = self._extract_smil_formats(
3187 manifest_url('jwplayer.smil'),
3188 video_id, fatal=False)
3189 for rtmp_format in rtmp_formats:
3190 rtsp_format = rtmp_format.copy()
3191 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3192 del rtsp_format['play_path']
3193 del rtsp_format['ext']
3194 rtsp_format.update({
3195 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3196 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3197 'protocol': 'rtsp',
3198 })
3199 formats.extend([rtmp_format, rtsp_format])
3200 else:
3201 for protocol in ('rtmp', 'rtsp'):
3202 if protocol not in skip_protocols:
3203 formats.append({
3204 'url': '%s:%s' % (protocol, url_base),
3205 'format_id': protocol,
3206 'protocol': protocol,
3207 })
3208 return formats
3209
3210 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3211 mobj = re.search(
3212 r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3213 webpage)
3214 if mobj:
3215 try:
3216 jwplayer_data = self._parse_json(mobj.group('options'),
3217 video_id=video_id,
3218 transform_source=transform_source)
3219 except ExtractorError:
3220 pass
3221 else:
3222 if isinstance(jwplayer_data, dict):
3223 return jwplayer_data
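# e.g. (illustrative): this picks up embeds such as
#   jwplayer("player").setup({"file": "video.mp4", "title": "Example"});
# and returns the parsed setup options as a dict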
3224
3225 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3226 jwplayer_data = self._find_jwplayer_data(
3227 webpage, video_id, transform_source=js_to_json)
3228 return self._parse_jwplayer_data(
3229 jwplayer_data, video_id, *args, **kwargs)
3230
3231 def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3232 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3233 # JWPlayer backward compatibility: flattened playlists
3234 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3235 if 'playlist' not in jwplayer_data:
3236 jwplayer_data = {'playlist': [jwplayer_data]}
3237
3238 entries = []
3239
3240 # JWPlayer backward compatibility: single playlist item
3241 # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3242 if not isinstance(jwplayer_data['playlist'], list):
3243 jwplayer_data['playlist'] = [jwplayer_data['playlist']]
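# e.g. (illustrative): a flattened config like {'file': 'video.mp4'} has by
# now been normalized to {'playlist': [{'file': 'video.mp4'}]}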
3244
3245 for video_data in jwplayer_data['playlist']:
3246 # JWPlayer backward compatibility: flattened sources
3247 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3248 if 'sources' not in video_data:
3249 video_data['sources'] = [video_data]
3250
3251 this_video_id = video_id or video_data['mediaid']
3252
3253 formats = self._parse_jwplayer_formats(
3254 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3255 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3256
3257 subtitles = {}
3258 tracks = video_data.get('tracks')
3259 if tracks and isinstance(tracks, list):
3260 for track in tracks:
3261 if not isinstance(track, dict):
3262 continue
3263 track_kind = track.get('kind')
3264 if not track_kind or not isinstance(track_kind, compat_str):
3265 continue
3266 if track_kind.lower() not in ('captions', 'subtitles'):
3267 continue
3268 track_url = urljoin(base_url, track.get('file'))
3269 if not track_url:
3270 continue
3271 subtitles.setdefault(track.get('label') or 'en', []).append({
3272 'url': self._proto_relative_url(track_url)
3273 })
3274
3275 entry = {
3276 'id': this_video_id,
3277 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3278 'description': clean_html(video_data.get('description')),
3279 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3280 'timestamp': int_or_none(video_data.get('pubdate')),
3281 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3282 'subtitles': subtitles,
3283 }
3284 # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3285 if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3286 entry.update({
3287 '_type': 'url_transparent',
3288 'url': formats[0]['url'],
3289 })
3290 else:
3291 self._sort_formats(formats)
3292 entry['formats'] = formats
3293 entries.append(entry)
3294 if len(entries) == 1:
3295 return entries[0]
3296 else:
3297 return self.playlist_result(entries)
3298
3299 def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3300 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3301 urls = []
3302 formats = []
3303 for source in jwplayer_sources_data:
3304 if not isinstance(source, dict):
3305 continue
3306 source_url = urljoin(
3307 base_url, self._proto_relative_url(source.get('file')))
3308 if not source_url or source_url in urls:
3309 continue
3310 urls.append(source_url)
3311 source_type = source.get('type') or ''
3312 ext = mimetype2ext(source_type) or determine_ext(source_url)
3313 if source_type == 'hls' or ext == 'm3u8':
3314 formats.extend(self._extract_m3u8_formats(
3315 source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3316 m3u8_id=m3u8_id, fatal=False))
3317 elif source_type == 'dash' or ext == 'mpd':
3318 formats.extend(self._extract_mpd_formats(
3319 source_url, video_id, mpd_id=mpd_id, fatal=False))
3320 elif ext == 'smil':
3321 formats.extend(self._extract_smil_formats(
3322 source_url, video_id, fatal=False))
3323 # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3324 elif source_type.startswith('audio') or ext in (
3325 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3326 formats.append({
3327 'url': source_url,
3328 'vcodec': 'none',
3329 'ext': ext,
3330 })
3331 else:
3332 height = int_or_none(source.get('height'))
3333 if height is None:
# Often no height is provided, but there is a label in
# a format like "1080p", "720p SD", or 1080.
3336 height = int_or_none(self._search_regex(
3337 r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3338 'height', default=None))
3339 a_format = {
3340 'url': source_url,
3341 'width': int_or_none(source.get('width')),
3342 'height': height,
3343 'tbr': int_or_none(source.get('bitrate')),
3344 'ext': ext,
3345 }
3346 if source_url.startswith('rtmp'):
3347 a_format['ext'] = 'flv'
3348 # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3349 # of jwplayer.flash.swf
3350 rtmp_url_parts = re.split(
3351 r'((?:mp4|mp3|flv):)', source_url, 1)
3352 if len(rtmp_url_parts) == 3:
3353 rtmp_url, prefix, play_path = rtmp_url_parts
3354 a_format.update({
3355 'url': rtmp_url,
3356 'play_path': prefix + play_path,
3357 })
3358 if rtmp_params:
3359 a_format.update(rtmp_params)
3360 formats.append(a_format)
3361 return formats
3362
3363 def _live_title(self, name):
3364 """ Generate the title for a live video """
3365 now = datetime.datetime.now()
3366 now_str = now.strftime('%Y-%m-%d %H:%M')
3367 return name + ' ' + now_str
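# e.g. _live_title('Channel Live') -> 'Channel Live 2021-07-15 12:30'
# (illustrative; the suffix is the current local time)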
3368
3369 def _int(self, v, name, fatal=False, **kwargs):
3370 res = int_or_none(v, **kwargs)
3373 if res is None:
3374 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3375 if fatal:
3376 raise ExtractorError(msg)
3377 else:
3378 self.report_warning(msg)
3379 return res
3380
3381 def _float(self, v, name, fatal=False, **kwargs):
3382 res = float_or_none(v, **kwargs)
3383 if res is None:
3384 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3385 if fatal:
3386 raise ExtractorError(msg)
3387 else:
3388 self.report_warning(msg)
3389 return res
3390
3391 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3392 path='/', secure=False, discard=False, rest={}, **kwargs):
3393 cookie = compat_cookiejar_Cookie(
3394 0, name, value, port, port is not None, domain, True,
3395 domain.startswith('.'), path, True, secure, expire_time,
3396 discard, None, None, rest)
3397 self._downloader.cookiejar.set_cookie(cookie)
3398
3399 def _get_cookies(self, url):
3400 """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3401 req = sanitized_Request(url)
3402 self._downloader.cookiejar.add_cookie_header(req)
3403 return compat_cookies_SimpleCookie(req.get_header('Cookie'))
3404
3405 def _apply_first_set_cookie_header(self, url_handle, cookie):
3406 """
Apply the first Set-Cookie header instead of the last. Experimental.

Some sites (e.g. [1-3]) may serve two cookies under the same name
in the Set-Cookie header and expect the first (old) one to be set
rather than the second (new) one. However, per RFC 6265 the newer
cookie should be stored, and that is what actually happens.
We work around this issue by manually resetting the cookie to
the first one.
3415 1. https://new.vk.com/
3416 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3417 3. https://learning.oreilly.com/
3418 """
3419 for header, cookies in url_handle.headers.items():
3420 if header.lower() != 'set-cookie':
3421 continue
if sys.version_info[0] >= 3:
# Python 3 decodes HTTP headers as latin-1; re-encode and decode as
# UTF-8 to recover any non-ASCII characters in the cookie value
cookies = cookies.encode('iso-8859-1').decode('utf-8')
3425 cookie_value = re.search(
3426 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3427 if cookie_value:
3428 value, domain = cookie_value.groups()
3429 self._set_cookie(domain, cookie, value)
3430 break
3431
3432 def get_testcases(self, include_onlymatching=False):
3433 t = getattr(self, '_TEST', None)
3434 if t:
3435 assert not hasattr(self, '_TESTS'), \
3436 '%s has _TEST and _TESTS' % type(self).__name__
3437 tests = [t]
3438 else:
3439 tests = getattr(self, '_TESTS', [])
3440 for t in tests:
3441 if not include_onlymatching and t.get('only_matching', False):
3442 continue
3443 t['name'] = type(self).__name__[:-len('IE')]
3444 yield t
3445
3446 def is_suitable(self, age_limit):
3447 """ Test whether the extractor is generally suitable for the given
3448 age limit (i.e. pornographic sites are not, all others usually are) """
3449
3450 any_restricted = False
3451 for tc in self.get_testcases(include_onlymatching=False):
3452 if tc.get('playlist', []):
3453 tc = tc['playlist'][0]
3454 is_restricted = age_restricted(
3455 tc.get('info_dict', {}).get('age_limit'), age_limit)
3456 if not is_restricted:
3457 return True
3458 any_restricted = any_restricted or is_restricted
3459 return not any_restricted
3460
3461 def extract_subtitles(self, *args, **kwargs):
3462 if (self.get_param('writesubtitles', False)
3463 or self.get_param('listsubtitles')):
3464 return self._get_subtitles(*args, **kwargs)
3465 return {}
3466
3467 def _get_subtitles(self, *args, **kwargs):
3468 raise NotImplementedError('This method must be implemented by subclasses')
3469
3470 @staticmethod
3471 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3472 """ Merge subtitle items for one language. Items with duplicated URLs
3473 will be dropped. """
list1_urls = {item['url'] for item in subtitle_list1}
ret = list(subtitle_list1)
ret.extend(item for item in subtitle_list2 if item['url'] not in list1_urls)
3477 return ret
3478
3479 @classmethod
3480 def _merge_subtitles(cls, *dicts, target=None):
3481 """ Merge subtitle dictionaries, language by language. """
3482 if target is None:
3483 target = {}
3484 for d in dicts:
3485 for lang, subs in d.items():
3486 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3487 return target
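# e.g. (illustrative):
#   _merge_subtitles({'en': [{'url': 'https://example.com/a.vtt'}]},
#                    {'en': [{'url': 'https://example.com/a.vtt'}],
#                     'de': [{'url': 'https://example.com/b.vtt'}]})
# returns {'en': [{'url': 'https://example.com/a.vtt'}],
#          'de': [{'url': 'https://example.com/b.vtt'}]}
# (the duplicate English URL is dropped)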
3488
3489 def extract_automatic_captions(self, *args, **kwargs):
3490 if (self.get_param('writeautomaticsub', False)
3491 or self.get_param('listsubtitles')):
3492 return self._get_automatic_captions(*args, **kwargs)
3493 return {}
3494
3495 def _get_automatic_captions(self, *args, **kwargs):
3496 raise NotImplementedError('This method must be implemented by subclasses')
3497
3498 def mark_watched(self, *args, **kwargs):
3499 if (self.get_param('mark_watched', False)
3500 and (self._get_login_info()[0] is not None
3501 or self.get_param('cookiefile') is not None)):
3502 self._mark_watched(*args, **kwargs)
3503
3504 def _mark_watched(self, *args, **kwargs):
3505 raise NotImplementedError('This method must be implemented by subclasses')
3506
3507 def geo_verification_headers(self):
3508 headers = {}
3509 geo_verification_proxy = self.get_param('geo_verification_proxy')
3510 if geo_verification_proxy:
3511 headers['Ytdl-request-proxy'] = geo_verification_proxy
3512 return headers
3513
3514 def _generic_id(self, url):
3515 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3516
3517 def _generic_title(self, url):
3518 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
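# e.g. (illustrative):
#   _generic_id('https://example.com/media/some%20video.mp4/') -> 'some video'
#   _generic_title('https://example.com/media/some%20video.mp4') -> 'some video'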
3519
3520 @staticmethod
3521 def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3522 all_known = all(map(
3523 lambda x: x is not None,
3524 (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3525 return (
3526 'private' if is_private
3527 else 'premium_only' if needs_premium
3528 else 'subscriber_only' if needs_subscription
3529 else 'needs_auth' if needs_auth
3530 else 'unlisted' if is_unlisted
3531 else 'public' if all_known
3532 else None)
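# e.g. (illustrative): a video behind a plain login wall, with all flags known:
#   _availability(is_private=False, needs_premium=False,
#                 needs_subscription=False, needs_auth=True,
#                 is_unlisted=False) == 'needs_auth'
# while all-None arguments yield None (availability unknown)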
3533
3534 def _configuration_arg(self, key, default=NO_DEFAULT, casesense=False):
3535 '''
3536 @returns A list of values for the extractor argument given by "key"
3537 or "default" if no such key is present
3538 @param default The default value to return when the key is not present (default: [])
3539 @param casesense When false, the values are converted to lower case
3540 '''
3541 val = traverse_obj(
3542 self._downloader.params, ('extractor_args', self.ie_key().lower(), key))
3543 if val is None:
3544 return [] if default is NO_DEFAULT else default
3545 return list(val) if casesense else [x.lower() for x in val]
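# e.g. (illustrative): with --extractor-args "youtube:player_client=android,web"
# the youtube extractor sees
#   self._configuration_arg('player_client') == ['android', 'web']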
3546
3547
3548 class SearchInfoExtractor(InfoExtractor):
3549 """
Base class for paged search query extractors.
3551 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
3552 Instances should define _SEARCH_KEY and _MAX_RESULTS.
3553 """
3554
3555 @classmethod
3556 def _make_valid_url(cls):
3557 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
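# e.g. (illustrative): for _SEARCH_KEY = 'ytsearch', the URL 'ytsearch5:cats'
# matches with prefix '5' and query 'cats'; 'ytsearchall:cats' requests up to
# _MAX_RESULTS results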
3558
3559 @classmethod
3560 def suitable(cls, url):
3561 return re.match(cls._make_valid_url(), url) is not None
3562
3563 def _real_extract(self, query):
3564 mobj = re.match(self._make_valid_url(), query)
3565 if mobj is None:
3566 raise ExtractorError('Invalid search query "%s"' % query)
3567
3568 prefix = mobj.group('prefix')
3569 query = mobj.group('query')
3570 if prefix == '':
3571 return self._get_n_results(query, 1)
3572 elif prefix == 'all':
3573 return self._get_n_results(query, self._MAX_RESULTS)
3574 else:
3575 n = int(prefix)
3576 if n <= 0:
3577 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
3578 elif n > self._MAX_RESULTS:
3579 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3580 n = self._MAX_RESULTS
3581 return self._get_n_results(query, n)
3582
3583 def _get_n_results(self, query, n):
3584 """Get a specified number of results for a query"""
3585 raise NotImplementedError('This method must be implemented by subclasses')
3586
3587 @property
3588 def SEARCH_KEY(self):
3589 return self._SEARCH_KEY