# coding: utf-8
from __future__ import unicode_literals

import base64
import datetime
import hashlib
import json
import math
import netrc
import os
import random
import re
import sys
import time

from ..compat import (
    compat_cookiejar_Cookie,
    compat_cookies_SimpleCookie,
    compat_etree_Element,
    compat_etree_fromstring,
    compat_getpass,
    compat_http_client,
    compat_os_name,
    compat_str,
    compat_urllib_error,
    compat_urllib_parse_unquote,
    compat_urllib_parse_urlencode,
    compat_urllib_request,
    compat_urlparse,
    compat_xml_parse_error,
)
from ..downloader import FileDownloader
from ..downloader.f4m import (
    get_base_url,
    remove_encrypted_media,
)
from ..utils import (
    age_restricted,
    base_url,
    bug_reports_message,
    clean_html,
    compiled_regex_type,
    determine_ext,
    determine_protocol,
    dict_get,
    error_to_compat_str,
    extract_attributes,
    ExtractorError,
    fix_xml_ampersands,
    float_or_none,
    format_field,
    GeoRestrictedError,
    GeoUtils,
    int_or_none,
    js_to_json,
    JSON_LD_RE,
    mimetype2ext,
    network_exceptions,
    NO_DEFAULT,
    orderedSet,
    parse_bitrate,
    parse_codecs,
    parse_duration,
    parse_iso8601,
    parse_m3u8_attributes,
    parse_resolution,
    RegexNotFoundError,
    sanitize_filename,
    sanitized_Request,
    str_or_none,
    str_to_int,
    strip_or_none,
    traverse_obj,
    unescapeHTML,
    unified_strdate,
    unified_timestamp,
    update_Request,
    update_url_query,
    url_basename,
    url_or_none,
    urljoin,
    variadic,
    xpath_element,
    xpath_text,
    xpath_with_ns,
)


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the YoutubeDL. The YoutubeDL processes this
    information, possibly downloading the video to the file system, among
    other possible outcomes.

    The type field determines the type of the result.
    By far the most common value (and the default if _type is missing) is
    "video", which indicates a single video.

    For a video, the dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        The mandatory URL representing the media:
                                   for plain file media - HTTP URL of this file,
                                   for RTMP - RTMP URL,
                                   for HLS - URL of the M3U8 media playlist,
                                   for HDS - URL of the F4M manifest,
                                   for DASH
                                     - HTTP URL to plain file media (in case of
                                       unfragmented media)
                                     - URL of the MPD manifest or base URL
                                       representing the media if MPD manifest
                                       is parsed from a string (in case of
                                       fragmented media)
                                   for MSS - URL of the ISM manifest.
                    * manifest_url
                                 The URL of the manifest file in case of
                                 fragmented media:
                                   for HLS - URL of the M3U8 master playlist,
                                   for HDS - URL of the F4M manifest,
                                   for DASH - URL of the MPD manifest,
                                   for MSS - URL of the ISM manifest.
                    * ext        Will be calculated from URL if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * fps        Frame rate
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp", "rtmp_ffmpeg",
                                 "rtmpe", "m3u8", "m3u8_native" or "http_dash_segments".
                    * fragment_base_url
                                 Base URL for fragments. Each fragment's path
                                 value (if present) will be relative to
                                 this URL.
                    * fragments  A list of fragments of a fragmented media.
                                 Each fragment entry must contain either a url
                                 or a path. If a url is present, it should be
                                 considered by a client. Otherwise both path and
                                 fragment_base_url must be present. Here is
                                 the list of all potential fields:
                                 * "url" - fragment's URL
                                 * "path" - fragment's path relative to
                                            fragment_base_url
                                 * "duration" (optional, int or float)
                                 * "filesize" (optional, int)
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                                 < -1000 to hide the format (if there is
                                 another one which is strictly better)
                    * language   Language code, e.g. "de" or "en-US".
                    * language_preference  Is this in the language mentioned in
                                 the URL?
                                 10 if it's what the URL is about,
                                 -1 for default (don't know),
                                 -10 otherwise, other values reserved for now.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * source_preference  Order number for this video source
                                 (quality takes higher priority)
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * http_headers  A dictionary of additional HTTP headers
                                 to add to the request.
                    * stretched_ratio  If given and not 1, indicates that the
                                 video's pixels are not square.
                                 width : height ratio as float.
                    * no_resume  The server does not support resuming the
                                 (HTTP or RTMP) download. Boolean.
                    * downloader_options  A dictionary of downloader options as
                                 described in FileDownloader
                    RTMP formats can also have the additional fields: page_url,
                    app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
                    rtmp_protocol, rtmp_real_time

    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    alt_title:      A secondary title of the video.
    display_id:     An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "id" (optional, string) - Thumbnail format ID
                        * "url"
                        * "preference" (optional, int) - quality of the image
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
                        * "filesize" (optional, int)
                        * "_test_url" (optional, bool) - If true, test the URL
    thumbnail:      Full URL to a video thumbnail image.
    description:    Full video description.
    uploader:       Full name of the video uploader.
    license:        License name the video is licensed under.
    creator:        The creator of the video.
    release_timestamp: UNIX timestamp of the moment the video was released.
    release_date:   The date (YYYYMMDD) when the video was released.
    timestamp:      UNIX timestamp of the moment the video was uploaded
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp.
    uploader_id:    Nickname or id of the video uploader.
    uploader_url:   Full URL to a personal webpage of the video uploader.
    channel:        Full name of the channel the video is uploaded on.
                    Note that channel fields may or may not repeat uploader
                    fields. This depends on a particular extractor.
    channel_id:     Id of the channel.
    channel_url:    Full URL to a channel webpage.
    location:       Physical location where the video was filmed.
    subtitles:      The available subtitles as a dictionary in the format
                    {tag: subformats}. "tag" is usually a language code, and
                    "subformats" is a list sorted from lower to higher
                    preference, each element is a dictionary with the "ext"
                    entry and one of:
                        * "data": The subtitles file contents
                        * "url": A URL pointing to the subtitles file
                    It can optionally also have:
                        * "name": Name or description of the subtitles
                    "ext" will be calculated from URL if missing
    automatic_captions: Like 'subtitles'; contains automatically generated
                    captions instead of normal subtitles
    duration:       Length of the video in seconds, as an integer or float.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
    comment_count:  Number of comments on the video
    comments:       A list of comments, each with one or more of the following
                    properties (all optional, except that at least one of
                    text or html must be present):
                        * "author" - human-readable name of the comment author
                        * "author_id" - user ID of the comment author
                        * "author_thumbnail" - The thumbnail of the comment author
                        * "id" - Comment ID
                        * "html" - Comment as HTML
                        * "text" - Plain text of the comment
                        * "timestamp" - UNIX timestamp of comment
                        * "parent" - ID of the comment this one is replying to.
                                     Set to "root" to indicate that this is a
                                     comment to the original video.
                        * "like_count" - Number of positive ratings of the comment
                        * "dislike_count" - Number of negative ratings of the comment
                        * "is_favorited" - Whether the comment is marked as
                                           favorite by the video uploader
                        * "author_is_uploader" - Whether the comment is made by
                                                 the video uploader
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage; if given to yt-dlp it
                    should allow getting the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]
    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
    cast:           A list of the video cast
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.
    was_live:       True, False, or None (=unknown). Whether this video was
                    originally a live stream.
    live_status:    'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
                    If absent, automatically set from is_live, was_live
    start_time:     Time in seconds where the reproduction should start, as
                    specified in the URL.
    end_time:       Time in seconds where the reproduction should end, as
                    specified in the URL.
    chapters:       A list of dictionaries, with the following entries:
                        * "start_time" - The start time of the chapter in seconds
                        * "end_time" - The end time of the chapter in seconds
                        * "title" (optional, string)
    playable_in_embed: Whether this video is allowed to play in embedded
                    players on other sites. Can be True (=always allowed),
                    False (=never allowed), None (=unknown), or a string
                    specifying the criteria for embedability (e.g. 'whitelist')
    availability:   Under what condition the video is available. One of
                    'private', 'premium_only', 'subscriber_only', 'needs_auth',
                    'unlisted' or 'public'. Use 'InfoExtractor._availability'
                    to set it
    __post_extractor: A function to be called just before the metadata is
                    written to either disk, logger or console. The function
                    must return a dict which will be added to the info_dict.
                    This is useful for additional information that is
                    time-consuming to extract. Note that the fields thus
                    extracted will not be available to output template and
                    match_filter. So, only "comments" and "comment_count" are
                    currently allowed to be extracted via this method.

    The following fields should only be used when the video belongs to some logical
    chapter or section:

    chapter:        Name or title of the chapter the video belongs to.
    chapter_number: Number of the chapter the video belongs to, as an integer.
    chapter_id:     Id of the chapter the video belongs to, as a unicode string.

    The following fields should only be used when the video is an episode of some
    series, programme or podcast:

    series:         Title of the series or programme the video episode belongs to.
    season:         Title of the season the video episode belongs to.
    season_number:  Number of the season the video episode belongs to, as an integer.
    season_id:      Id of the season the video episode belongs to, as a unicode string.
    episode:        Title of the video episode. Unlike the mandatory video title field,
                    this field should denote the exact title of the video episode
                    without any kind of decoration.
    episode_number: Number of the video episode within a season, as an integer.
    episode_id:     Id of the video episode, as a unicode string.

    The following fields should only be used when the media is a track or a part of
    a music album:

    track:          Title of the track.
    track_number:   Number of the track within an album or a disc, as an integer.
    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
                    as a unicode string.
    artist:         Artist(s) of the track.
    genre:          Genre(s) of the track.
    album:          Title of the album the track belongs to.
    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
    album_artist:   List of all artists that appeared on the album (e.g.
                    "Ash Borer / Fell Voices" or "Various Artists", useful for splits
                    and compilations).
    disc_number:    Number of the disc or other physical medium the track belongs to,
                    as an integer.
    release_year:   Year (YYYY) when the album was released.

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.
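
    For example, a minimal single-video result could look like this (all
    values here are purely illustrative, not taken from any real site):

        {
            'id': '42',
            'title': 'Example video',
            'url': 'https://example.com/media/42.mp4',
            'ext': 'mp4',
        }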


    _type "playlist" indicates multiple videos.
    There must be a key "entries", which is a list, an iterable, or a PagedList
    object, each element of which is a valid dictionary by this specification.

    Additionally, playlists can have "id", "title", and any other relevant
    attributes with the same semantics as videos (see above).
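
    For example (illustrative values only):

        {
            '_type': 'playlist',
            'id': 'example-playlist-id',
            'title': 'Example playlist',
            'entries': [],  # video info dicts as specified above
        }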


    _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
    It must have an entries key like a playlist and contain all the keys
    required for a video at the same time.


    _type "url" indicates that the video must be extracted from another
    location, possibly by a different extractor. Its only required key is:
    "url" - the next URL to extract.
    The key "ie_key" can be set to the class name (minus the trailing "IE",
    e.g. "Youtube") if the extractor class is known in advance.
    Additionally, the dictionary may have any properties of the resolved entity
    known in advance, for example "title" if the title of the referred video is
    known ahead of time.
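
    For example, handing a video over to the YouTube extractor could look
    like this (the URL is illustrative):

        {
            '_type': 'url',
            'url': 'https://www.youtube.com/watch?v=<video id>',
            'ie_key': 'Youtube',
        }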


    _type "url_transparent" entities have the same specification as "url", but
    indicate that the given additional information is more precise than the one
    associated with the resolved URL.
    This is useful when a site employs a video service that hosts the video and
    its technical metadata, but that video service does not embed a useful
    title, description etc.


    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _GEO_BYPASS attribute may be set to False in order to disable
    geo restriction bypass mechanisms for a particular extractor.
    This, however, won't disable explicit geo restriction bypass based on
    the country code provided with geo_bypass_country.

    _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
    countries for this extractor. One of these countries will be used by the
    geo restriction bypass mechanism right away in order to bypass
    geo restriction, of course, if the mechanism is not disabled.

    _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
    IP blocks in CIDR notation for this extractor. One of these IP blocks
    will be used by the geo restriction bypass mechanism similarly
    to _GEO_COUNTRIES.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _x_forwarded_for_ip = None
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    _WORKING = True

    _LOGIN_HINTS = {
        'any': 'Use --cookies, --username and --password or --netrc to provide account credentials',
        'cookies': (
            'Use --cookies for the authentication. '
            'See https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl for how to pass cookies'),
        'password': 'Use --username and --password or --netrc to provide account credentials',
    }

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self._x_forwarded_for_ip = None
        self._printed_messages = set()
        self.set_downloader(downloader)

    @classmethod
    def _match_valid_url(cls, url):
        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # This function must import everything it needs (except other extractors),
        # so that lazy_extractors works correctly
        return cls._match_valid_url(url) is not None

    @classmethod
    def _match_id(cls, url):
        return cls._match_valid_url(url).group('id')
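
    # A minimal subclass sketch showing how _VALID_URL, suitable() and
    # _match_id() fit together (the class and URL below are hypothetical):
    #
    #   class ExampleIE(InfoExtractor):
    #       _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>[0-9]+)'
    #
    #       def _real_extract(self, url):
    #           video_id = self._match_id(url)  # uses the 'id' group above
    #           ...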

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._printed_messages = set()
        self._initialize_geo_bypass({
            'countries': self._GEO_COUNTRIES,
            'ip_blocks': self._GEO_IP_BLOCKS,
        })
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def _initialize_geo_bypass(self, geo_bypass_context):
        """
        Initialize geo restriction bypass mechanism.

        This method is used to initialize the geo bypass mechanism, which is
        based on faking the X-Forwarded-For HTTP header. A random country from
        the provided country list is selected and a random IP belonging to
        this country is generated. This IP will be passed as the
        X-Forwarded-For HTTP header in all subsequent HTTP requests.

        This method is used for the initial geo bypass mechanism
        initialization during instance initialization, with _GEO_COUNTRIES
        and _GEO_IP_BLOCKS.

        You may also manually call it from an extractor's code if geo bypass
        information is not available beforehand (e.g. it is obtained during
        extraction) or due to some other reason. In this case you should pass
        this information in the geo bypass context passed as the first
        argument. It may contain the following fields:

        countries:  List of geo unrestricted countries (similar
                    to _GEO_COUNTRIES)
        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                    (similar to _GEO_IP_BLOCKS)
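
        For example, a manual call could look like this (the country codes
        are illustrative):

            self._initialize_geo_bypass({
                'countries': ['DE', 'FR'],
            })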

        """
        if not self._x_forwarded_for_ip:

            # Geo bypass mechanism is explicitly disabled by user
            if not self.get_param('geo_bypass', True):
                return

            if not geo_bypass_context:
                geo_bypass_context = {}

            # Backward compatibility: previously _initialize_geo_bypass
            # expected a list of countries, some 3rd party code may still use
            # it this way
            if isinstance(geo_bypass_context, (list, tuple)):
                geo_bypass_context = {
                    'countries': geo_bypass_context,
                }

            # The whole point of the geo bypass mechanism is to fake the IP
            # (via the X-Forwarded-For HTTP header) based on some IP block or
            # country code.

            # Path 1: bypassing based on IP block in CIDR notation

            # Explicit IP block specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            ip_block = self.get_param('geo_bypass_ip_block', None)

            # Otherwise use random IP block from geo bypass context but only
            # if extractor is known as geo bypassable
            if not ip_block:
                ip_blocks = geo_bypass_context.get('ip_blocks')
                if self._GEO_BYPASS and ip_blocks:
                    ip_block = random.choice(ip_blocks)

            if ip_block:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
                self._downloader.write_debug(
                    'Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
                return

            # Path 2: bypassing based on country code

            # Explicit country code specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            country = self.get_param('geo_bypass_country', None)

            # Otherwise use random country code from geo bypass context but
            # only if extractor is known as geo bypassable
            if not country:
                countries = geo_bypass_context.get('countries')
                if self._GEO_BYPASS and countries:
                    country = random.choice(countries)

            if country:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
                self._downloader.write_debug(
                    'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))

    def extract(self, url):
        """Extracts URL information and returns it as a list of dicts."""
        try:
            for _ in range(2):
                try:
                    self.initialize()
                    self.write_debug('Extracting URL: %s' % url)
                    ie_result = self._real_extract(url)
                    if ie_result is None:
                        return None
                    if self._x_forwarded_for_ip:
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    subtitles = ie_result.get('subtitles')
                    if (subtitles and 'live_chat' in subtitles
                            and 'no-live-chat' in self.get_param('compat_opts', [])):
                        del subtitles['live_chat']
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except ExtractorError:
            raise
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True)
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e)

    def __maybe_fake_ip_and_retry(self, countries):
        if (not self.get_param('geo_bypass_country', None)
                and self._GEO_BYPASS
                and self.get_param('geo_bypass', True)
                and not self._x_forwarded_for_ip
                and countries):
            country_code = random.choice(countries)
            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
            if self._x_forwarded_for_ip:
                self.report_warning(
                    'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
                    % (self._x_forwarded_for_ip, country_code.upper()))
                return True
        return False

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        return compat_str(type(self).__name__[:-2])

    @staticmethod
    def __can_accept_status_code(err, expected_status):
        assert isinstance(err, compat_urllib_error.HTTPError)
        if expected_status is None:
            return False
        elif callable(expected_status):
            return expected_status(err.code) is True
        else:
            return err.code in variadic(expected_status)

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
        """
        Return the response handle.

        See _download_webpage docstring for arguments specification.
        """
        if not self._downloader._first_webpage_request:
            sleep_interval = float_or_none(self.get_param('sleep_interval_requests')) or 0
            if sleep_interval > 0:
                self.to_screen('Sleeping %s seconds ...' % sleep_interval)
                time.sleep(sleep_interval)
        else:
            self._downloader._first_webpage_request = False

        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen('%s' % (note,))
            else:
                self.to_screen('%s: %s' % (video_id, note))

        # Some sites check the X-Forwarded-For HTTP header in order to figure
        # out the origin of the client behind a proxy. This allows bypassing
        # geo restriction by faking this header's value to an IP that belongs
        # to some geo unrestricted country. We will do so once we encounter
        # any geo restriction error.
        if self._x_forwarded_for_ip:
            if 'X-Forwarded-For' not in headers:
                headers['X-Forwarded-For'] = self._x_forwarded_for_ip

        if isinstance(url_or_request, compat_urllib_request.Request):
            url_or_request = update_Request(
                url_or_request, data=data, headers=headers, query=query)
        else:
            if query:
                url_or_request = update_url_query(url_or_request, query)
            if data is not None or headers:
                url_or_request = sanitized_Request(url_or_request, data, headers)
        try:
            return self._downloader.urlopen(url_or_request)
        except network_exceptions as err:
            if isinstance(err, compat_urllib_error.HTTPError):
                if self.__can_accept_status_code(err, expected_status):
                    # Retain reference to error to prevent file object from
                    # being closed before it can be read. Works around the
                    # effects of <https://bugs.python.org/issue15002>
                    # introduced in Python 3.4.1.
                    err.fp._error = err
                    return err.fp

            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
        """
        Return a tuple (page content as string, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
        if urlh is False:
            assert not fatal
            return False
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
        return (content, urlh)

    @staticmethod
    def _guess_encoding_from_content(content_type, webpage_bytes):
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'

        return encoding

    def __check_blocked(self, content):
        first_block = content[:512]
        if ('<title>Access to this site is blocked</title>' in content
                and 'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
                and 'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)

    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self.get_param('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self.get_param('write_pages', False):
            basen = '%s_%s' % (video_id, urlh.geturl())
            if len(basen) > 240:
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content

    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=5, encoding=None, data=None,
            headers={}, query={}, expected_status=None):
        """
        Return the data of the page as a string.

        Arguments:
        url_or_request -- plain text URL as a string or
            a compat_urllib_request.Request object
        video_id -- Video/playlist/item identifier (string)

        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractorError to be raised,
            otherwise a warning will be reported and extraction continued
        tries -- number of tries
        timeout -- sleep interval between tries
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows accepting failed HTTP requests (non-2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                  accept
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx)
            which are always accepted.
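
            For example (illustrative values):
                expected_status=404
                expected_status=(403, 404)
                expected_status=lambda status: status < 500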
        """

        success = False
        try_count = 0
        while success is False:
            try:
                res = self._download_webpage_handle(
                    url_or_request, video_id, note, errnote, fatal,
                    encoding=encoding, data=data, headers=headers, query=query,
                    expected_status=expected_status)
                success = True
            except compat_http_client.IncompleteRead as e:
                try_count += 1
                if try_count >= tries:
                    raise e
                self._sleep(timeout, video_id)
        if res is False:
            return res
        else:
            content, _ = res
            return content

    def _download_xml_handle(
            self, url_or_request, video_id, note='Downloading XML',
            errnote='Unable to download XML', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (xml as a compat_etree_Element, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        xml_string, urlh = res
        return self._parse_xml(
            xml_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_xml(
            self, url_or_request, video_id,
            note='Downloading XML', errnote='Unable to download XML',
            transform_source=None, fatal=True, encoding=None,
            data=None, headers={}, query={}, expected_status=None):
        """
        Return the xml as a compat_etree_Element.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_xml_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            xml_string = transform_source(xml_string)
        try:
            return compat_etree_fromstring(xml_string.encode('utf-8'))
        except compat_xml_parse_error as ve:
            errmsg = '%s: Failed to parse XML ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

    def _download_json_handle(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (JSON object, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        json_string, urlh = res
        return self._parse_json(
            json_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_json(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return the JSON object as a dict.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_json_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            json_string = transform_source(json_string)
        try:
            return json.loads(json_string)
        except ValueError as ve:
            errmsg = '%s: Failed to parse JSON ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))
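
    # A typical call from an extractor's _real_extract (a sketch only; the
    # endpoint and the response shape are assumptions for illustration):
    #
    #   data = self._download_json(
    #       'https://api.example.com/videos/%s' % video_id, video_id,
    #       note='Downloading video metadata', fatal=False) or {}
    #   title = data.get('title')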

    def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
        return self._parse_json(
            data[data.find('{'):data.rfind('}') + 1],
            video_id, transform_source, fatal)

    def _download_socket_json_handle(
            self, url_or_request, video_id, note='Polling socket',
            errnote='Unable to poll socket', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (JSON object, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        webpage, urlh = res
        return self._parse_socket_response_as_json(
            webpage, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_socket_json(
            self, url_or_request, video_id, note='Polling socket',
            errnote='Unable to poll socket', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return the JSON object as a dict.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_socket_json_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
        idstr = format_field(video_id, template='%s: ')
        msg = f'[{self.IE_NAME}] {idstr}{msg}'
        if only_once:
            if f'WARNING: {msg}' in self._printed_messages:
                return
            self._printed_messages.add(f'WARNING: {msg}')
        self._downloader.report_warning(msg, *args, **kwargs)

    def to_screen(self, msg, *args, **kwargs):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)

    def write_debug(self, msg, *args, **kwargs):
        self._downloader.write_debug('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)

    def get_param(self, name, default=None, *args, **kwargs):
        if self._downloader:
            return self._downloader.params.get(name, default, *args, **kwargs)
        return default

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen('%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen('%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')

    def raise_login_required(
            self, msg='This video is only available for registered users',
            metadata_available=False, method='any'):
        if metadata_available and self.get_param('ignore_no_formats_error'):
            self.report_warning(msg)
        if method is not None:
            msg = '%s. %s' % (msg, self._LOGIN_HINTS[method])
        raise ExtractorError(msg, expected=True)

    def raise_geo_restricted(
            self, msg='This video is not available from your location due to geo restriction',
            countries=None, metadata_available=False):
        if metadata_available and self.get_param('ignore_no_formats_error'):
            self.report_warning(msg)
        else:
            raise GeoRestrictedError(msg, countries=countries)

    def raise_no_formats(self, msg, expected=False, video_id=None):
        if expected and self.get_param('ignore_no_formats_error'):
            self.report_warning(msg, video_id)
        elif isinstance(msg, ExtractorError):
            raise msg
        else:
            raise ExtractorError(msg, expected=expected, video_id=video_id)

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None, video_title=None):
        """Returns a URL that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        if video_title is not None:
            video_info['title'] = video_title
        return video_info

    def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
        urls = orderedSet(
            self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
            for m in matches)
        return self.playlist_result(
            urls, playlist_id=playlist_id, playlist_title=playlist_title)

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        video_info.update(kwargs)
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        if playlist_description is not None:
            video_info['description'] = playlist_description
        return video_info

    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single pattern or
        a list of patterns, returning the first matching group.
        In case of failure, return a default value, report a warning or raise
        a RegexNotFoundError (depending on fatal), specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        if not self.get_param('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
            _name = '\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            if group is None:
                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
            elif isinstance(group, (list, tuple)):
                return tuple(mobj.group(g) for g in group)
            else:
                return mobj.group(group)
        elif default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract %s' % _name)
        else:
            self.report_warning('unable to extract %s' % _name + bug_reports_message())
            return None
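
    # Usage sketch (the pattern and the webpage variable are illustrative):
    #
    #   title = self._search_regex(
    #       r'<h1[^>]*>([^<]+)</h1>', webpage, 'title', default=None)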

    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_netrc_login_info(self, netrc_machine=None):
        username = None
        password = None
        netrc_machine = netrc_machine or self._NETRC_MACHINE

        if self.get_param('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(netrc_machine)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError(
                        'No authenticators for %s' % netrc_machine)
            except (IOError, netrc.NetrcParseError) as err:
                self.report_warning(
                    'parsing .netrc: %s' % error_to_compat_str(err))

        return username, password

    def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
        """
        Get the login info as (username, password)
        First look for the manually specified credentials using username_option
        and password_option as keys in the params dictionary. If no such
        credentials are available, look in the netrc file using the
        netrc_machine or _NETRC_MACHINE value.
        If there's no info available, return (None, None)
        """

        # Attempt to use provided username and password or .netrc data
        username = self.get_param(username_option)
        if username is not None:
            password = self.get_param(password_option)
        else:
            username, password = self._get_netrc_login_info(netrc_machine)

        return username, password

    def _get_tfa_info(self, note='two-factor verification code'):
        """
        Get the two-factor authentication info
        TODO - asking the user will be required for sms/phone verify
        currently just uses the command line option
        If there's no info available, return None
        """

        tfa = self.get_param('twofactor')
        if tfa is not None:
            return tfa

        return compat_getpass('Type %s and press [Return]: ' % note)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
        property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
                       % {'prop': re.escape(prop)})
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    @staticmethod
    def _meta_regex(prop):
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        prop = variadic(prop)
        if name is None:
            name = 'OpenGraph %s' % prop[0]
        og_regexes = []
        for p in prop:
            og_regexes.extend(self._og_regexes(p))
        escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _og_search_url(self, html, **kargs):
        return self._og_search_property('url', html, **kargs)

    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        name = variadic(name)
        if display_name is None:
            display_name = name[0]
        return self._html_search_regex(
            [self._meta_regex(n) for n in name],
            html, display_name, fatal=fatal, group='content', **kwargs)

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower())

    def _family_friendly_search(self, html):
        # See http://schema.org/VideoObject
        family_friendly = self._html_search_meta(
            'isFamilyFriendly', html, default=None)

        if not family_friendly:
            return None

        RATING_TABLE = {
            '1': 0,
            'true': 0,
            '0': 18,
            'false': 18,
        }
        return RATING_TABLE.get(family_friendly.lower())

    def _twitter_search_player(self, html):
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')

    def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
        json_ld_list = list(re.finditer(JSON_LD_RE, html))
        default = kwargs.get('default', NO_DEFAULT)
        # JSON-LD may be malformed and thus `fatal` should be respected.
        # At the same time `default` may be passed that assumes `fatal=False`
        # for _search_regex. Let's simulate the same behavior here as well.
        fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
        json_ld = []
        for mobj in json_ld_list:
            json_ld_item = self._parse_json(
                mobj.group('json_ld'), video_id, fatal=fatal)
            if not json_ld_item:
                continue
            if isinstance(json_ld_item, dict):
                json_ld.append(json_ld_item)
            elif isinstance(json_ld_item, (list, tuple)):
                json_ld.extend(json_ld_item)
        if json_ld:
            json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
        if json_ld:
            return json_ld
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract JSON-LD')
        else:
            self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
            return {}

    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            json_ld = [json_ld]

        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def extract_interaction_type(e):
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            interaction_statistic = e.get('interactionStatistic')
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not isinstance(is_e, dict):
                    continue
                if is_e.get('@type') != 'InteractionCounter':
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For the interaction count some sites provide a string instead
                # of an integer (as per spec) with non-digit characters
                # (e.g. ","), so extract the count with the more relaxed
                # str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_video_object(e):
            assert e['@type'] == 'VideoObject'
            author = e.get('author')
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of 'Organization' or 'Person' types.
                # both types can have a 'name' property (inherited from the
                # 'Thing' type) [1]; however, some websites are using the
                # 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
                'filesize': float_or_none(e.get('contentSize')),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })
            extract_interaction_statistic(e)

        for e in json_ld:
            if '@context' in e:
                item_type = e.get('@type')
                if expected_type is not None and expected_type != item_type:
                    continue
                if item_type in ('TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type == 'Movie':
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif item_type in ('Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody')),
                    })
                elif item_type == 'VideoObject':
                    extract_video_object(e)
                    if expected_type is None:
                        continue
                    else:
                        break
                video = e.get('video')
                if isinstance(video, dict) and video.get('@type') == 'VideoObject':
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break
        return dict((k, v) for k, v in info.items() if v is not None)

    @staticmethod
    def _hidden_inputs(html):
        html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
        hidden_inputs = {}
        for input in re.findall(r'(?i)(<input[^>]+>)', html):
            attrs = extract_attributes(input)
            if not attrs:
                continue
            if attrs.get('type') not in ('hidden', 'submit'):
                continue
            name = attrs.get('name') or attrs.get('id')
            value = attrs.get('value')
            if name and value is not None:
                hidden_inputs[name] = value
        return hidden_inputs

    def _form_hidden_inputs(self, form_id, html):
        form = self._search_regex(
            r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
            html, '%s form' % form_id, group='form')
        return self._hidden_inputs(form)
1485
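    # Illustrative usage sketch (not part of the library; the form id and field
    # names below are assumptions): a subclass could combine the two helpers
    # above to submit a login form, e.g.
    #
    #   data = self._form_hidden_inputs('login-form', webpage)
    #   data.update({'username': username, 'password': password})
    #   self._download_webpage(
    #       login_url, video_id, data=urlencode_postdata(data))
    #
    # (urlencode_postdata is the usual yt_dlp.utils helper for POST bodies.)
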
    class FormatSort:
        regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

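        # Illustrative reading of the token syntax accepted by `regex` (the
        # example tokens are assumptions, not taken from the code):
        #   'res:720'  -> field='res',  separator=':', limit='720'  (at most 720p)
        #   'br~3000'  -> field='br',   separator='~', limit='3000' (closest to 3000)
        #   '+size'    -> field='size', reverse=True                (prefer smallest)
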
        default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
                   'res', 'fps', 'codec:vp9.2', 'size', 'br', 'asr',
                   'proto', 'ext', 'hasaud', 'source', 'format_id')  # These must not be aliases
        ytdl_default = ('hasaud', 'quality', 'tbr', 'filesize', 'vbr',
                        'height', 'width', 'proto', 'vext', 'abr', 'aext',
                        'fps', 'fs_approx', 'source', 'format_id')

        settings = {
            'vcodec': {'type': 'ordered', 'regex': True,
                       'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
            'acodec': {'type': 'ordered', 'regex': True,
                       'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']},
            'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                      'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', '.*dash', 'ws|websocket', '', 'mms|rtsp', 'none', 'f4']},
            'vext': {'type': 'ordered', 'field': 'video_ext',
                     'order': ('mp4', 'webm', 'flv', '', 'none'),
                     'order_free': ('webm', 'mp4', 'flv', '', 'none')},
            'aext': {'type': 'ordered', 'field': 'audio_ext',
                     'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
                     'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
            'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
            'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                           'field': ('vcodec', 'acodec'),
                           'function': lambda it: int(any(v != 'none' for v in it))},
            'ie_pref': {'priority': True, 'type': 'extractor'},
            'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'lang': {'priority': True, 'convert': 'ignore', 'field': 'language_preference'},
            'quality': {'convert': 'float_none', 'default': -1},
            'filesize': {'convert': 'bytes'},
            'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
            'id': {'convert': 'string', 'field': 'format_id'},
            'height': {'convert': 'float_none'},
            'width': {'convert': 'float_none'},
            'fps': {'convert': 'float_none'},
            'tbr': {'convert': 'float_none'},
            'vbr': {'convert': 'float_none'},
            'abr': {'convert': 'float_none'},
            'asr': {'convert': 'float_none'},
            'source': {'convert': 'ignore', 'field': 'source_preference'},

            'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
            'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
            'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
            'ext': {'type': 'combined', 'field': ('vext', 'aext')},
            'res': {'type': 'multiple', 'field': ('height', 'width'),
                    'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

            # Most of these exist only for compatibility reasons
            'dimension': {'type': 'alias', 'field': 'res'},
            'resolution': {'type': 'alias', 'field': 'res'},
            'extension': {'type': 'alias', 'field': 'ext'},
            'bitrate': {'type': 'alias', 'field': 'br'},
            'total_bitrate': {'type': 'alias', 'field': 'tbr'},
            'video_bitrate': {'type': 'alias', 'field': 'vbr'},
            'audio_bitrate': {'type': 'alias', 'field': 'abr'},
            'framerate': {'type': 'alias', 'field': 'fps'},
            'language_preference': {'type': 'alias', 'field': 'lang'},  # not named as 'language' because such a field exists
            'protocol': {'type': 'alias', 'field': 'proto'},
            'source_preference': {'type': 'alias', 'field': 'source'},
            'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
            'filesize_estimate': {'type': 'alias', 'field': 'size'},
            'samplerate': {'type': 'alias', 'field': 'asr'},
            'video_ext': {'type': 'alias', 'field': 'vext'},
            'audio_ext': {'type': 'alias', 'field': 'aext'},
            'video_codec': {'type': 'alias', 'field': 'vcodec'},
            'audio_codec': {'type': 'alias', 'field': 'acodec'},
            'video': {'type': 'alias', 'field': 'hasvid'},
            'has_video': {'type': 'alias', 'field': 'hasvid'},
            'audio': {'type': 'alias', 'field': 'hasaud'},
            'has_audio': {'type': 'alias', 'field': 'hasaud'},
            'extractor': {'type': 'alias', 'field': 'ie_pref'},
            'preference': {'type': 'alias', 'field': 'ie_pref'},
            'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
            'format_id': {'type': 'alias', 'field': 'id'},
        }

        _order = []

        def _get_field_setting(self, field, key):
            if field not in self.settings:
                self.settings[field] = {}
            propObj = self.settings[field]
            if key not in propObj:
                type = propObj.get('type')
                if key == 'field':
                    default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
                elif key == 'convert':
                    default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
                else:
                    default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
                propObj[key] = default
            return propObj[key]

        def _resolve_field_value(self, field, value, convertNone=False):
            if value is None:
                if not convertNone:
                    return None
            else:
                value = value.lower()
            conversion = self._get_field_setting(field, 'convert')
            if conversion == 'ignore':
                return None
            if conversion == 'string':
                return value
            elif conversion == 'float_none':
                return float_or_none(value)
            elif conversion == 'bytes':
                return FileDownloader.parse_bytes(value)
            elif conversion == 'order':
                order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
                use_regex = self._get_field_setting(field, 'regex')
                list_length = len(order_list)
                empty_pos = order_list.index('') if '' in order_list else list_length + 1
                if use_regex and value is not None:
                    for i, regex in enumerate(order_list):
                        if regex and re.match(regex, value):
                            return list_length - i
                    return list_length - empty_pos  # not in list
                else:  # not regex or value = None
                    return list_length - (order_list.index(value) if value in order_list else empty_pos)
            else:
                if value.isnumeric():
                    return float(value)
                else:
                    self.settings[field]['convert'] = 'string'
                    return value

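        # Illustrative conversions (assumed example values, for reading aid):
        #   _resolve_field_value('height', '720')         -> 720.0      ('float_none')
        #   _resolve_field_value('filesize', '10M')       -> 10485760   ('bytes')
        #   _resolve_field_value('vcodec', 'avc1.4d401e') -> rank of the
        #       '[hx]264|avc' entry in the 'order' list   ('order' + regex)
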
        def evaluate_params(self, params, sort_extractor):
            self._use_free_order = params.get('prefer_free_formats', False)
            self._sort_user = params.get('format_sort', [])
            self._sort_extractor = sort_extractor

            def add_item(field, reverse, closest, limit_text):
                field = field.lower()
                if field in self._order:
                    return
                self._order.append(field)
                limit = self._resolve_field_value(field, limit_text)
                data = {
                    'reverse': reverse,
                    'closest': False if limit is None else closest,
                    'limit_text': limit_text,
                    'limit': limit}
                if field in self.settings:
                    self.settings[field].update(data)
                else:
                    self.settings[field] = data

            sort_list = (
                tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
                + (tuple() if params.get('format_sort_force', False)
                   else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
                + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

            for item in sort_list:
                match = re.match(self.regex, item)
                if match is None:
                    raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
                field = match.group('field')
                if field is None:
                    continue
                if self._get_field_setting(field, 'type') == 'alias':
                    field = self._get_field_setting(field, 'field')
                reverse = match.group('reverse') is not None
                closest = match.group('separator') == '~'
                limit_text = match.group('limit')

                has_limit = limit_text is not None
                has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
                has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

                fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
                limits = limit_text.split(":") if has_multiple_limits else (limit_text,) if has_limit else tuple()
                limit_count = len(limits)
                for (i, f) in enumerate(fields):
                    add_item(f, reverse, closest,
                             limits[i] if i < limit_count
                             else limits[0] if has_limit and not has_multiple_limits
                             else None)

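        # Illustrative sketch (assumed call): with default params,
        #   evaluate_params({'format_sort': ['res:720']}, ['br'])
        # seeds the order with the forced/priority defaults ('hidden',
        # 'aud_or_vid', 'hasvid', 'ie_pref', 'lang'), then the user's 'res'
        # (limited to 720), then the extractor's 'br', then the remaining
        # defaults; add_item() skips fields that are already in the order.
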
        def print_verbose_info(self, write_debug):
            if self._sort_user:
                write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
            if self._sort_extractor:
                write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
            write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
                '+' if self._get_field_setting(field, 'reverse') else '', field,
                '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
                              self._get_field_setting(field, 'limit_text'),
                              self._get_field_setting(field, 'limit'))
                if self._get_field_setting(field, 'limit_text') is not None else '')
                for field in self._order if self._get_field_setting(field, 'visible')]))

        def _calculate_field_preference_from_value(self, format, field, type, value):
            reverse = self._get_field_setting(field, 'reverse')
            closest = self._get_field_setting(field, 'closest')
            limit = self._get_field_setting(field, 'limit')

            if type == 'extractor':
                maximum = self._get_field_setting(field, 'max')
                if value is None or (maximum is not None and value >= maximum):
                    value = -1
            elif type == 'boolean':
                in_list = self._get_field_setting(field, 'in_list')
                not_in_list = self._get_field_setting(field, 'not_in_list')
                value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
            elif type == 'ordered':
                value = self._resolve_field_value(field, value, True)

            # try to convert to number
            val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
            is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
            if is_num:
                value = val_num

            return ((-10, 0) if value is None
                    else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
                    else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
                    else (0, value, 0) if not reverse and (limit is None or value <= limit)
                    else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
                    else (-1, value, 0))

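        # Illustrative sort keys produced above (assumed numeric field values):
        #   value=None                      -> (-10, 0)       missing fields sort last
        #   value='abc' (non-numeric)       -> (1, 'abc', 0)  strings sort above numbers
        #   value=1080, no limit            -> (0, 1080, 0)
        #   value=1080, limit=720           -> (0, -1080, 0)  exceeding the limit demotes
        #   value=2900, limit=3000, closest -> (0, -100, 100) smallest distance wins
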
        def _calculate_field_preference(self, format, field):
            type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
            get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
            if type == 'multiple':
                type = 'field'  # Only 'field' is allowed in multiple for now
                actual_fields = self._get_field_setting(field, 'field')

                value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
            else:
                value = get_value(field)
            return self._calculate_field_preference_from_value(format, field, type, value)

        def calculate_preference(self, format):
            # Determine missing protocol
            if not format.get('protocol'):
                format['protocol'] = determine_protocol(format)

            # Determine missing ext
            if not format.get('ext') and 'url' in format:
                format['ext'] = determine_ext(format['url'])
            if format.get('vcodec') == 'none':
                format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
                format['video_ext'] = 'none'
            else:
                format['video_ext'] = format['ext']
                format['audio_ext'] = 'none'
            # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
            #     format['preference'] = -1000

            # Determine missing bitrates
            if format.get('tbr') is None:
                if format.get('vbr') is not None and format.get('abr') is not None:
                    format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
            else:
                if format.get('vcodec') != 'none' and format.get('vbr') is None:
                    format['vbr'] = format.get('tbr') - format.get('abr', 0)
                if format.get('acodec') != 'none' and format.get('abr') is None:
                    format['abr'] = format.get('tbr') - format.get('vbr', 0)

            return tuple(self._calculate_field_preference(format, field) for field in self._order)

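        # Illustrative note: the return value is a tuple of per-field sort keys,
        # one per entry in self._order, so a plain ascending
        #   formats.sort(key=format_sort.calculate_preference)
        # leaves the list ordered worst to best, matching the worst-to-best
        # convention for the formats list.
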
    def _sort_formats(self, formats, field_preference=[]):
        if not formats:
            if self.get_param('ignore_no_formats_error'):
                return
            raise ExtractorError('No video formats found')
        format_sort = self.FormatSort()  # params and to_screen are taken from the downloader
        format_sort.evaluate_params(self._downloader.params, field_preference)
        if self.get_param('verbose', False):
            format_sort.print_verbose_info(self._downloader.write_debug)
        formats.sort(key=lambda f: format_sort.calculate_preference(f))

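    # Illustrative usage sketch (assumed values; as called from an extractor's
    # _real_extract just before returning the info dict):
    #
    #   formats = [
    #       {'url': 'https://example.com/v-360.mp4', 'height': 360, 'tbr': 700},
    #       {'url': 'https://example.com/v-720.mp4', 'height': 720, 'tbr': 2500},
    #   ]
    #   self._sort_formats(formats)            # default order
    #   self._sort_formats(formats, ['+res'])  # extractor hint: prefer smaller resolution
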
    def _check_formats(self, formats, video_id):
        if formats:
            formats[:] = filter(
                lambda f: self._is_valid_url(
                    f['url'], video_id,
                    item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
                formats)

    @staticmethod
    def _remove_duplicate_formats(formats):
        format_urls = set()
        unique_formats = []
        for f in formats:
            if f['url'] not in format_urls:
                format_urls.add(f['url'])
                unique_formats.append(f)
        formats[:] = unique_formats

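    # Illustrative example (assumed input): duplicates are detected by exact URL
    # and the first occurrence wins, in place:
    #   formats = [{'url': 'https://e.example/a.mp4'},
    #              {'url': 'https://e.example/a.mp4', 'tbr': 1000}]
    #   InfoExtractor._remove_duplicate_formats(formats)  # only the first remains
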
    def _is_valid_url(self, url, video_id, item='video', headers={}):
        url = self._proto_relative_url(url, scheme='http:')
        # For now, assume non-HTTP(S) URLs are always valid
        if not (url.startswith('http://') or url.startswith('https://')):
            return True
        try:
            self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
            return True
        except ExtractorError as e:
            self.to_screen(
                '%s: %s URL is invalid, skipping: %s'
                % (video_id, item, error_to_compat_str(e.cause)))
            return False

    def http_scheme(self):
        """ Either "http:" or "https:", depending on the user's preferences """
        return (
            'http:'
            if self.get_param('prefer_insecure', False)
            else 'https:')

    def _proto_relative_url(self, url, scheme=None):
        if url is None:
            return url
        if url.startswith('//'):
            if scheme is None:
                scheme = self.http_scheme()
            return scheme + url
        else:
            return url

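    # Illustrative examples (assumed URLs; the first assumes --prefer-insecure
    # is not set, so http_scheme() returns 'https:'):
    #   self._proto_relative_url('//cdn.example.com/v.mp4')           -> 'https://cdn.example.com/v.mp4'
    #   self._proto_relative_url('//cdn.example.com/v.mp4', 'http:')  -> 'http://cdn.example.com/v.mp4'
    #   self._proto_relative_url('https://example.com/v.mp4')         -> unchanged
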
    def _sleep(self, timeout, video_id, msg_template=None):
        if msg_template is None:
            msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
        msg = msg_template % {'video_id': video_id, 'timeout': timeout}
        self.to_screen(msg)
        time.sleep(timeout)

    def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
                             transform_source=lambda s: fix_xml_ampersands(s).strip(),
                             fatal=True, m3u8_id=None, data=None, headers={}, query={}):
        manifest = self._download_xml(
            manifest_url, video_id, 'Downloading f4m manifest',
            'Unable to download f4m manifest',
            # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
            # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
            transform_source=transform_source,
            fatal=fatal, data=data, headers=headers, query=query)

        if manifest is False:
            return []

        return self._parse_f4m_formats(
            manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
            transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)

    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
                           fatal=True, m3u8_id=None):
        if not isinstance(manifest, compat_etree_Element) and not fatal:
            return []

        # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
        if akamai_pv is not None and ';' in akamai_pv.text:
            playerVerificationChallenge = akamai_pv.text.split(';')[0]
            if playerVerificationChallenge.strip() != '':
                return []

        formats = []
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        if not media_nodes:
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        # Remove unsupported DRM protected media from final formats
        # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
        media_nodes = remove_encrypted_media(media_nodes)
        if not media_nodes:
            return formats

        manifest_base_url = get_base_url(manifest)

        bootstrap_info = xpath_element(
            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
            'bootstrap info', default=None)

        vcodec = None
        mime_type = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
            'mime type', default=None)
        if mime_type and mime_type.startswith('audio/'):
            vcodec = 'none'

        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            width = int_or_none(media_el.attrib.get('width'))
            height = int_or_none(media_el.attrib.get('height'))
            format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
            # If <bootstrapInfo> is present, the specified f4m is a
            # stream-level manifest, and only set-level manifests may refer to
            # external resources. See section 11.4 and section 4 of F4M spec
            if bootstrap_info is None:
                media_url = None
                # @href is introduced in 2.0, see section 11.6 of F4M spec
                if manifest_version == '2.0':
                    media_url = media_el.attrib.get('href')
                if media_url is None:
                    media_url = media_el.attrib.get('url')
                if not media_url:
                    continue
                manifest_url = (
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest, do the recursive extraction,
                # since bitrates in the parent manifest (this one) and in the media_url
                # manifest may differ, leading to an inability to resolve the format
                # by the requested bitrate in the f4m downloader
                ext = determine_ext(manifest_url)
                if ext == 'f4m':
                    f4m_formats = self._extract_f4m_formats(
                        manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
                        transform_source=transform_source, fatal=fatal)
                    # Sometimes a stream-level manifest contains a single media entry that
                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                    # At the same time the parent's media entry in the set-level manifest may
                    # contain it. We will copy it from the parent in such cases.
                    if len(f4m_formats) == 1:
                        f = f4m_formats[0]
                        f.update({
                            'tbr': f.get('tbr') or tbr,
                            'width': f.get('width') or width,
                            'height': f.get('height') or height,
                            'format_id': f.get('format_id') if not tbr else format_id,
                            'vcodec': vcodec,
                        })
                    formats.extend(f4m_formats)
                    continue
                elif ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        manifest_url, video_id, 'mp4', preference=preference,
                        quality=quality, m3u8_id=m3u8_id, fatal=fatal))
                    continue
            formats.append({
                'format_id': format_id,
                'url': manifest_url,
                'manifest_url': manifest_url,
                'ext': 'flv' if bootstrap_info is not None else None,
                'protocol': 'f4m',
                'tbr': tbr,
                'width': width,
                'height': height,
                'vcodec': vcodec,
                'preference': preference,
                'quality': quality,
            })
        return formats

    def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
        return {
            'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
            'url': m3u8_url,
            'ext': ext,
            'protocol': 'm3u8',
            'preference': preference - 100 if preference else -100,
            'quality': quality,
            'resolution': 'multiple',
            'format_note': 'Quality selection URL',
        }

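    # Illustrative result (assumed arguments): _m3u8_meta_format(url, m3u8_id='hls')
    # yields a low-preference stub pointing at the master playlist itself, e.g.
    #   {'format_id': 'hls-meta', 'url': url, 'protocol': 'm3u8',
    #    'preference': -100, 'resolution': 'multiple',
    #    'format_note': 'Quality selection URL', ...}
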
    def _extract_m3u8_formats(self, *args, **kwargs):
        fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
        if subs:
            self.report_warning(bug_reports_message(
                "Ignoring subtitle tracks found in the HLS manifest; "
                "if any subtitle tracks are missing,"
            ), only_once=True)
        return fmts

    def _extract_m3u8_formats_and_subtitles(
            self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
            preference=None, quality=None, m3u8_id=None, note=None,
            errnote=None, fatal=True, live=False, data=None, headers={},
            query={}):

        res = self._download_webpage_handle(
            m3u8_url, video_id,
            note='Downloading m3u8 information' if note is None else note,
            errnote='Failed to download m3u8 information' if errnote is None else errnote,
            fatal=fatal, data=data, headers=headers, query=query)

        if res is False:
            return [], {}

        m3u8_doc, urlh = res
        m3u8_url = urlh.geturl()

        return self._parse_m3u8_formats_and_subtitles(
            m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
            preference=preference, quality=quality, m3u8_id=m3u8_id,
            note=note, errnote=errnote, fatal=fatal, live=live, data=data,
            headers=headers, query=query, video_id=video_id)

    def _parse_m3u8_formats_and_subtitles(
            self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8_native',
            preference=None, quality=None, m3u8_id=None, live=False, note=None,
            errnote=None, fatal=True, data=None, headers={}, query={},
            video_id=None):
        formats, subtitles = [], {}

        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
            return formats, subtitles

        if (not self.get_param('allow_unplayable_formats')
                and re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc)):  # Apple FairPlay
            return formats, subtitles

        def format_url(url):
            return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)

        if self.get_param('hls_split_discontinuity', False):
            def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
                if not m3u8_doc:
                    if not manifest_url:
                        return []
                    m3u8_doc = self._download_webpage(
                        manifest_url, video_id, fatal=fatal, data=data, headers=headers,
                        note=False, errnote='Failed to download m3u8 playlist information')
                    if m3u8_doc is False:
                        return []
                return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))

        else:
            def _extract_m3u8_playlist_indices(*args, **kwargs):
                return [None]

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
        # 3. https://github.com/ytdl-org/youtube-dl/issues/18923

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand,
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without quality renditions.
        # Fortunately, a master playlist can be easily distinguished from a media
        # playlist based on the availability of particular tags. As per [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playlist and vice versa.
        # As per [1, 4.3.3.1] the #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in a master playlist, thus we can
        # clearly detect a media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            formats = [{
                'format_id': '-'.join(map(str, filter(None, [m3u8_id, idx]))),
                'format_index': idx,
                'url': m3u8_url,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
                'quality': quality,
            } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]

            return formats, subtitles

        groups = {}
        last_stream_inf = {}

        def extract_media(x_media_line):
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
            if media_type == 'SUBTITLES':
                # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
                # EXT-X-MEDIA tag if the media type is SUBTITLES.
                # However, lack of URI has been spotted in the wild.
                # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
                if not media.get('URI'):
                    return
                url = format_url(media['URI'])
                sub_info = {
                    'url': url,
                    'ext': determine_ext(url),
                }
                if sub_info['ext'] == 'm3u8':
                    # Per RFC 8216 §3.1, the only possible subtitle format m3u8
                    # files may contain is WebVTT:
                    # <https://tools.ietf.org/html/rfc8216#section-3.1>
                    sub_info['ext'] = 'vtt'
                    sub_info['protocol'] = 'm3u8_native'
                lang = media.get('LANGUAGE') or 'und'
                subtitles.setdefault(lang, []).append(sub_info)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                manifest_url = format_url(media_url)
                formats.extend({
                    'format_id': '-'.join(map(str, filter(None, (m3u8_id, group_id, name, idx)))),
                    'format_note': name,
                    'format_index': idx,
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                    'quality': quality,
                    'vcodec': 'none' if media_type == 'AUDIO' else None,
                } for idx in _extract_m3u8_playlist_indices(manifest_url))

        def build_stream_name():
            # Although the specification does not mention a NAME attribute for
            # the EXT-X-STREAM-INF tag, it still sometimes may be present (see [1]
            # or the vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from the corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        # Parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
        # chance to detect video-only formats when EXT-X-STREAM-INF tags
        # precede EXT-X-MEDIA tags in the HLS manifest, such as in [3].
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH')
                    or last_stream_inf.get('BANDWIDTH'), scale=1000)
                manifest_url = format_url(line.strip())

                for idx in _extract_m3u8_playlist_indices(manifest_url):
                    format_id = [m3u8_id, None, idx]
                    # Bandwidth of live streams may differ over time, thus making
                    # format_id unpredictable, so it's better to keep a provided
                    # format_id intact.
                    if not live:
                        stream_name = build_stream_name()
                        format_id[1] = stream_name if stream_name else '%d' % (tbr if tbr else len(formats))
                    f = {
                        'format_id': '-'.join(map(str, filter(None, format_id))),
                        'format_index': idx,
                        'url': manifest_url,
                        'manifest_url': m3u8_url,
                        'tbr': tbr,
                        'ext': ext,
                        'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                        'protocol': entry_protocol,
                        'preference': preference,
                        'quality': quality,
                    }
                    resolution = last_stream_inf.get('RESOLUTION')
                    if resolution:
                        mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                        if mobj:
                            f['width'] = int(mobj.group('width'))
                            f['height'] = int(mobj.group('height'))
                    # Unified Streaming Platform
                    mobj = re.search(
                        r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                    if mobj:
                        abr, vbr = mobj.groups()
                        abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                        f.update({
                            'vbr': vbr,
                            'abr': abr,
                        })
                    codecs = parse_codecs(last_stream_inf.get('CODECS'))
                    f.update(codecs)
                    audio_group_id = last_stream_inf.get('AUDIO')
                    # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                    # references a rendition group MUST have a CODECS attribute.
                    # However, this is not always respected; for example, [2]
                    # contains an EXT-X-STREAM-INF tag which references an AUDIO
                    # rendition group but does not have CODECS, and despite
                    # referencing an audio group it represents a complete
                    # (with audio and video) format. So, for such cases we will
                    # ignore references to rendition groups and treat them
                    # as complete formats.
                    if audio_group_id and codecs and f.get('vcodec') != 'none':
                        audio_group = groups.get(audio_group_id)
                        if audio_group and audio_group[0].get('URI'):
                            # TODO: update acodec for audio only formats with
                            # the same GROUP-ID
                            f['acodec'] = 'none'
                    if not f.get('ext'):
                        f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
                    formats.append(f)

                    # for DailyMotion
                    progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
                    if progressive_uri:
                        http_f = f.copy()
                        del http_f['manifest_url']
                        http_f.update({
                            'format_id': f['format_id'].replace('hls-', 'http-'),
                            'protocol': 'http',
                            'url': progressive_uri,
                        })
                        formats.append(http_f)

                last_stream_inf = {}
        return formats, subtitles

    @staticmethod
    def _xpath_ns(path, namespace=None):
        if not namespace:
            return path
        out = []
        for c in path.split('/'):
            if not c or c == '.':
                out.append(c)
            else:
                out.append('{%s}%s' % (namespace, c))
        return '/'.join(out)

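    # Illustrative example (assumed namespace URI):
    #   InfoExtractor._xpath_ns('./head/meta', 'http://www.w3.org/ns/SMIL')
    #     -> './{http://www.w3.org/ns/SMIL}head/{http://www.w3.org/ns/SMIL}meta'
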
    def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
        smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)

        if smil is False:
            assert not fatal
            return [], {}  # always return a (formats, subtitles) pair so callers can unpack

        namespace = self._parse_smil_namespace(smil)

        fmts = self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
        subs = self._parse_smil_subtitles(
            smil, namespace=namespace)

        return fmts, subs

    def _extract_smil_formats(self, *args, **kwargs):
        fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
        if subs:
            self.report_warning(bug_reports_message(
                "Ignoring subtitle tracks found in the SMIL manifest; "
                "if any subtitle tracks are missing,"
            ), only_once=True)
        return fmts

    def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
        smil = self._download_smil(smil_url, video_id, fatal=fatal)
        if smil is False:
            return {}
        return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)

    def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
        return self._download_xml(
            smil_url, video_id, 'Downloading SMIL file',
            'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)

    def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
        namespace = self._parse_smil_namespace(smil)

        formats = self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
        subtitles = self._parse_smil_subtitles(smil, namespace=namespace)

        video_id = os.path.splitext(url_basename(smil_url))[0]
        title = None
        description = None
        upload_date = None
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            name = meta.attrib.get('name')
            content = meta.attrib.get('content')
            if not name or not content:
                continue
            if not title and name == 'title':
                title = content
            elif not description and name in ('description', 'abstract'):
                description = content
            elif not upload_date and name == 'date':
                upload_date = unified_strdate(content)

        thumbnails = [{
            'id': image.get('type'),
            'url': image.get('src'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]

        return {
            'id': video_id,
            'title': title or video_id,
            'description': description,
            'upload_date': upload_date,
            'thumbnails': thumbnails,
            'formats': formats,
            'subtitles': subtitles,
        }

    def _parse_smil_namespace(self, smil):
        return self._search_regex(
            r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)

    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        base = smil_url
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
            if b:
                base = b
                break

        formats = []
        rtmp_count = 0
        http_count = 0
        m3u8_count = 0

        srcs = []
        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
        for medium in media:
            src = medium.get('src')
            if not src or src in srcs:
                continue
            srcs.append(src)

            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
            width = int_or_none(medium.get('width'))
            height = int_or_none(medium.get('height'))
            proto = medium.get('proto')
            ext = medium.get('ext')
            src_ext = determine_ext(src)
            streamer = medium.get('streamer') or base

            if proto == 'rtmp' or streamer.startswith('rtmp'):
                rtmp_count += 1
                formats.append({
                    'url': streamer,
                    'play_path': src,
                    'ext': 'flv',
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                if transform_rtmp_url:
                    streamer, src = transform_rtmp_url(streamer, src)
                    formats[-1].update({
                        'url': streamer,
                        'play_path': src,
                    })
                continue

            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
            src_url = src_url.strip()

            if proto == 'm3u8' or src_ext == 'm3u8':
                m3u8_formats = self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
                if len(m3u8_formats) == 1:
                    m3u8_count += 1
                    m3u8_formats[0].update({
                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                        'tbr': bitrate,
                        'width': width,
                        'height': height,
                    })
                formats.extend(m3u8_formats)
            elif src_ext == 'f4m':
                f4m_url = src_url
                if not f4m_params:
                    f4m_params = {
                        'hdcore': '3.2.0',
                        'plugin': 'flowplayer-3.2.0.1',
                    }
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse_urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
            elif src_ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    src_url, video_id, mpd_id='dash', fatal=False))
            elif re.search(r'\.ism/[Mm]anifest', src_url):
                formats.extend(self._extract_ism_formats(
                    src_url, video_id, ism_id='mss', fatal=False))
            elif src_url.startswith('http') and self._is_valid_url(src, video_id):
                http_count += 1
                formats.append({
                    'url': src_url,
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })

        return formats

    def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
        urls = []
        subtitles = {}
        for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
            src = textstream.get('src')
            if not src or src in urls:
                continue
            urls.append(src)
            ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
            lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
            subtitles.setdefault(lang, []).append({
                'url': src,
                'ext': ext,
            })
        return subtitles

    def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
        xspf = self._download_xml(
            xspf_url, playlist_id, 'Downloading xspf playlist',
            'Unable to download xspf manifest', fatal=fatal)
        if xspf is False:
            return []
        return self._parse_xspf(
            xspf, playlist_id, xspf_url=xspf_url,
            xspf_base_url=base_url(xspf_url))

    def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
        NS_MAP = {
            'xspf': 'http://xspf.org/ns/0/',
            's1': 'http://static.streamone.nl/player/ns/0',
        }

        entries = []
        for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
            title = xpath_text(
                track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
            description = xpath_text(
                track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
            thumbnail = xpath_text(
                track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
            duration = float_or_none(
                xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)

            formats = []
            for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
                format_url = urljoin(xspf_base_url, location.text)
                if not format_url:
                    continue
                formats.append({
                    'url': format_url,
                    'manifest_url': xspf_url,
                    'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
                    'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
                    'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
                })
            self._sort_formats(formats)

            entries.append({
                'id': playlist_id,
                'title': title,
                'description': description,
                'thumbnail': thumbnail,
                'duration': duration,
                'formats': formats,
            })
        return entries

    def _extract_mpd_formats(self, *args, **kwargs):
        fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
        if subs:
            self.report_warning(bug_reports_message(
                "Ignoring subtitle tracks found in the DASH manifest; "
                "if any subtitle tracks are missing,"
            ), only_once=True)
        return fmts

    def _extract_mpd_formats_and_subtitles(
            self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
            fatal=True, data=None, headers={}, query={}):
        res = self._download_xml_handle(
            mpd_url, video_id,
            note='Downloading MPD manifest' if note is None else note,
            errnote='Failed to download MPD manifest' if errnote is None else errnote,
            fatal=fatal, data=data, headers=headers, query=query)
        if res is False:
            return [], {}
        mpd_doc, urlh = res
        if mpd_doc is None:
            return [], {}
        mpd_base_url = base_url(urlh.geturl())

        return self._parse_mpd_formats_and_subtitles(
            mpd_doc, mpd_id, mpd_base_url, mpd_url)

    def _parse_mpd_formats(self, *args, **kwargs):
        fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
        if subs:
            self.report_warning(bug_reports_message(
                "Ignoring subtitle tracks found in the DASH manifest; "
                "if any subtitle tracks are missing,"
            ), only_once=True)
        return fmts

    def _parse_mpd_formats_and_subtitles(
            self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
        """
        Parse formats from MPD manifest.
        References:
         1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
            http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
         2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
        """
        if not self.get_param('dynamic_mpd', True):
            if mpd_doc.get('type') == 'dynamic':
                return [], {}

        namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)

        def _add_ns(path):
            return self._xpath_ns(path, namespace)

        def is_drm_protected(element):
            return element.find(_add_ns('ContentProtection')) is not None

        def extract_multisegment_info(element, ms_parent_info):
            ms_info = ms_parent_info.copy()

            # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
            # common attributes and elements. We will only extract what is
            # relevant for us.
            def extract_common(source):
                segment_timeline = source.find(_add_ns('SegmentTimeline'))
                if segment_timeline is not None:
                    s_e = segment_timeline.findall(_add_ns('S'))
                    if s_e:
                        ms_info['total_number'] = 0
                        ms_info['s'] = []
                        for s in s_e:
                            r = int(s.get('r', 0))
                            ms_info['total_number'] += 1 + r
                            ms_info['s'].append({
                                't': int(s.get('t', 0)),
                                # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
                                'd': int(s.attrib['d']),
                                'r': r,
                            })
                start_number = source.get('startNumber')
                if start_number:
                    ms_info['start_number'] = int(start_number)
                timescale = source.get('timescale')
                if timescale:
                    ms_info['timescale'] = int(timescale)
                segment_duration = source.get('duration')
                if segment_duration:
                    ms_info['segment_duration'] = float(segment_duration)

            def extract_Initialization(source):
                initialization = source.find(_add_ns('Initialization'))
                if initialization is not None:
                    ms_info['initialization_url'] = initialization.attrib['sourceURL']

            segment_list = element.find(_add_ns('SegmentList'))
            if segment_list is not None:
                extract_common(segment_list)
                extract_Initialization(segment_list)
                segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
                if segment_urls_e:
                    ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
            else:
                segment_template = element.find(_add_ns('SegmentTemplate'))
                if segment_template is not None:
                    extract_common(segment_template)
                    media = segment_template.get('media')
                    if media:
                        ms_info['media'] = media
                    initialization = segment_template.get('initialization')
                    if initialization:
                        ms_info['initialization'] = initialization
                    else:
                        extract_Initialization(segment_template)
            return ms_info

2575
2576 skip_unplayable = not self.get_param('allow_unplayable_formats')
2577
2578 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2579 formats = []
2580 subtitles = {}
2581 for period in mpd_doc.findall(_add_ns('Period')):
2582 period_duration = parse_duration(period.get('duration')) or mpd_duration
2583 period_ms_info = extract_multisegment_info(period, {
2584 'start_number': 1,
2585 'timescale': 1,
2586 })
2587 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2588 if skip_unplayable and is_drm_protected(adaptation_set):
2589 continue
2590 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2591 for representation in adaptation_set.findall(_add_ns('Representation')):
2592 if skip_unplayable and is_drm_protected(representation):
2593 continue
2594 representation_attrib = adaptation_set.attrib.copy()
2595 representation_attrib.update(representation.attrib)
2596 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2597 mime_type = representation_attrib['mimeType']
2598 content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2599
2600 codecs = representation_attrib.get('codecs', '')
2601 if content_type not in ('video', 'audio', 'text'):
2602 if mime_type == 'image/jpeg':
2603 content_type = mime_type
2604 elif codecs.split('.')[0] == 'stpp':
2605 content_type = 'text'
2606 else:
2607 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2608 continue
2609
2610 base_url = ''
2611 for element in (representation, adaptation_set, period, mpd_doc):
2612 base_url_e = element.find(_add_ns('BaseURL'))
2613 if base_url_e is not None:
2614 base_url = base_url_e.text + base_url
2615 if re.match(r'^https?://', base_url):
2616 break
2617 if mpd_base_url and not re.match(r'^https?://', base_url):
2618 if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
2619 mpd_base_url += '/'
2620 base_url = mpd_base_url + base_url
2621 representation_id = representation_attrib.get('id')
2622 lang = representation_attrib.get('lang')
2623 url_el = representation.find(_add_ns('BaseURL'))
2624 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2625 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2626 if representation_id is not None:
2627 format_id = representation_id
2628 else:
2629 format_id = content_type
2630 if mpd_id:
2631 format_id = mpd_id + '-' + format_id
2632 if content_type in ('video', 'audio'):
2633 f = {
2634 'format_id': format_id,
2635 'manifest_url': mpd_url,
2636 'ext': mimetype2ext(mime_type),
2637 'width': int_or_none(representation_attrib.get('width')),
2638 'height': int_or_none(representation_attrib.get('height')),
2639 'tbr': float_or_none(bandwidth, 1000),
2640 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2641 'fps': int_or_none(representation_attrib.get('frameRate')),
2642 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2643 'format_note': 'DASH %s' % content_type,
2644 'filesize': filesize,
2645 'container': mimetype2ext(mime_type) + '_dash',
2646 }
2647 f.update(parse_codecs(codecs))
2648 elif content_type == 'text':
2649 f = {
2650 'ext': mimetype2ext(mime_type),
2651 'manifest_url': mpd_url,
2652 'filesize': filesize,
2653 }
2654 elif content_type == 'image/jpeg':
2655 # See test case in VikiIE
2656 # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2657 f = {
2658 'format_id': format_id,
2659 'ext': 'mhtml',
2660 'manifest_url': mpd_url,
2661 'format_note': 'DASH storyboards (jpeg)',
2662 'acodec': 'none',
2663 'vcodec': 'none',
2664 }
2665 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)

                    def prepare_template(template_name, identifiers):
                        tmpl = representation_ms_info[template_name]
                        # First off, % characters outside $...$ templates
                        # must be escaped by doubling for proper processing
                        # by the % operator string formatting used further on (see
                        # https://github.com/ytdl-org/youtube-dl/issues/16867).
                        t = ''
                        in_template = False
                        for c in tmpl:
                            t += c
                            if c == '$':
                                in_template = not in_template
                            elif c == '%' and not in_template:
                                t += c
                        # Next, $...$ templates are translated to their
                        # %(...) counterparts to be used with the % operator
                        if representation_id is not None:
                            t = t.replace('$RepresentationID$', representation_id)
                        t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
                        t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
                        # str.replace returns a new string, so the result must be kept
                        t = t.replace('$$', '$')
                        return t
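
                    # Illustrative example (assumed template and id): with
                    # identifiers ('Number', 'Bandwidth', 'Time') and
                    # representation_id 'video1', prepare_template turns
                    #   'seg-$RepresentationID$-$Number%05d$.m4s'
                    # into 'seg-video1-%(Number)05d.m4s', ready for the
                    # % operator used below.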

                    # @initialization is a regular template, just like @media,
                    # so it should be handled in the same way (see
                    # https://github.com/ytdl-org/youtube-dl/issues/11605)
                    if 'initialization' in representation_ms_info:
                        initialization_template = prepare_template(
                            'initialization',
                            # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
                            # $Time$ shall not be included for @initialization thus
                            # only $Bandwidth$ remains
                            ('Bandwidth', ))
                        representation_ms_info['initialization_url'] = initialization_template % {
                            'Bandwidth': bandwidth,
                        }

                    def location_key(location):
                        return 'url' if re.match(r'^https?://', location) else 'path'

                    if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:

                        media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
                        media_location_key = location_key(media_template)

                        # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
                        # can't be used at the same time
                        if '%(Number' in media_template and 's' not in representation_ms_info:
                            segment_duration = None
                            if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
                                segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
                                representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
                            representation_ms_info['fragments'] = [{
                                media_location_key: media_template % {
                                    'Number': segment_number,
                                    'Bandwidth': bandwidth,
                                },
                                'duration': segment_duration,
                            } for segment_number in range(
                                representation_ms_info['start_number'],
                                representation_ms_info['total_number'] + representation_ms_info['start_number'])]
                        else:
                            # $Number*$ or $Time$ in media template with S list available
                            # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
                            # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
                            representation_ms_info['fragments'] = []
                            segment_time = 0
                            segment_d = None
                            segment_number = representation_ms_info['start_number']

                            def add_segment_url():
                                segment_url = media_template % {
                                    'Time': segment_time,
                                    'Bandwidth': bandwidth,
                                    'Number': segment_number,
                                }
                                representation_ms_info['fragments'].append({
                                    media_location_key: segment_url,
                                    'duration': float_or_none(segment_d, representation_ms_info['timescale']),
                                })

                            for num, s in enumerate(representation_ms_info['s']):
                                segment_time = s.get('t') or segment_time
                                segment_d = s['d']
                                add_segment_url()
                                segment_number += 1
                                for r in range(s.get('r', 0)):
                                    segment_time += segment_d
                                    add_segment_url()
                                    segment_number += 1
                                segment_time += segment_d
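                                # Illustrative expansion (assumed S element): with
                                # timescale 90000, an S node with t=0, d=90000, r=2
                                # emits three one-second fragments at times
                                # 0, 90000 and 180000.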
                    elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
                        # No media template
                        # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
                        # or any YouTube dashsegments video
                        fragments = []
                        segment_index = 0
                        timescale = representation_ms_info['timescale']
                        for s in representation_ms_info['s']:
                            duration = float_or_none(s['d'], timescale)
                            for r in range(s.get('r', 0) + 1):
                                segment_uri = representation_ms_info['segment_urls'][segment_index]
                                fragments.append({
                                    location_key(segment_uri): segment_uri,
                                    'duration': duration,
                                })
                                segment_index += 1
                        representation_ms_info['fragments'] = fragments
                    elif 'segment_urls' in representation_ms_info:
                        # Segment URLs with no SegmentTimeline
                        # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
                        # https://github.com/ytdl-org/youtube-dl/pull/14844
                        fragments = []
                        segment_duration = float_or_none(
                            representation_ms_info['segment_duration'],
                            representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
                        for segment_url in representation_ms_info['segment_urls']:
                            fragment = {
                                location_key(segment_url): segment_url,
                            }
                            if segment_duration:
                                fragment['duration'] = segment_duration
                            fragments.append(fragment)
                        representation_ms_info['fragments'] = fragments
                    # If there is a fragments key available then we correctly recognized fragmented media.
                    # Otherwise we will assume unfragmented media with direct access. Technically, such
                    # an assumption is not necessarily correct since we may simply have no support for
                    # some forms of fragmented media renditions yet, but for now we'll use this fallback.
                    if 'fragments' in representation_ms_info:
                        f.update({
                            # NB: mpd_url may be empty when MPD manifest is parsed from a string
                            'url': mpd_url or base_url,
                            'fragment_base_url': base_url,
                            'fragments': [],
                            'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
                        })
                        if 'initialization_url' in representation_ms_info:
                            initialization_url = representation_ms_info['initialization_url']
                            if not f.get('url'):
                                f['url'] = initialization_url
                            f['fragments'].append({location_key(initialization_url): initialization_url})
                        f['fragments'].extend(representation_ms_info['fragments'])
                    else:
                        # Assuming direct URL to unfragmented media.
                        f['url'] = base_url
                    if content_type in ('video', 'audio') or mime_type == 'image/jpeg':
                        formats.append(f)
                    elif content_type == 'text':
                        subtitles.setdefault(lang or 'und', []).append(f)

        return formats, subtitles

    def _extract_ism_formats(self, *args, **kwargs):
        fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
        if subs:
            self.report_warning(bug_reports_message(
                "Ignoring subtitle tracks found in the ISM manifest; "
                "if any subtitle tracks are missing,"
            ))
        return fmts

    def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
        res = self._download_xml_handle(
            ism_url, video_id,
            note='Downloading ISM manifest' if note is None else note,
            errnote='Failed to download ISM manifest' if errnote is None else errnote,
            fatal=fatal, data=data, headers=headers, query=query)
        if res is False:
            return [], {}
        ism_doc, urlh = res
        if ism_doc is None:
            return [], {}

        return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)

2842 def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
2843 """
2844 Parse formats from ISM manifest.
2845 References:
2846 1. [MS-SSTR]: Smooth Streaming Protocol,
2847 https://msdn.microsoft.com/en-us/library/ff469518.aspx
2848 """
2849 if ism_doc.get('IsLive') == 'TRUE':
2850 return [], {}
2851 if (not self.get_param('allow_unplayable_formats')
2852 and ism_doc.find('Protection') is not None):
2853 return [], {}
2854
2855 duration = int(ism_doc.attrib['Duration'])
2856 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2857
2858 formats = []
2859 subtitles = {}
2860 for stream in ism_doc.findall('StreamIndex'):
2861 stream_type = stream.get('Type')
2862 if stream_type not in ('video', 'audio', 'text'):
2863 continue
2864 url_pattern = stream.attrib['Url']
2865 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2866 stream_name = stream.get('Name')
2867 stream_language = stream.get('Language', 'und')
2868 for track in stream.findall('QualityLevel'):
2869 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
2870 # TODO: add support for WVC1 and WMAP
2871 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
2872 self.report_warning('%s is not a supported codec' % fourcc)
2873 continue
2874 tbr = int(track.attrib['Bitrate']) // 1000
2875 # [1] does not mention Width and Height attributes. However,
2876 # they're often present while MaxWidth and MaxHeight are
2877 # missing, so should be used as fallbacks
2878 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2879 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2880 sampling_rate = int_or_none(track.get('SamplingRate'))
2881
2882 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2883 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2884
2885 fragments = []
2886 fragment_ctx = {
2887 'time': 0,
2888 }
2889 stream_fragments = stream.findall('c')
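# Each <c> element describes a run of fragments: 't' is the start time,
# 'd' the duration and 'r' a repeat count, all in timescale units.
# Illustratively, at the default timescale of 10000000 (100ns units),
# <c t="0" d="20000000" r="3"/> expands to three 2-second fragments below.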
2890 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2891 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2892 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2893 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2894 if not fragment_ctx['duration']:
2895 try:
2896 next_fragment_time = int(stream_fragments[stream_fragment_index + 1].attrib['t'])
2897 except IndexError:
2898 next_fragment_time = duration
2899 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2900 for _ in range(fragment_repeat):
2901 fragments.append({
2902 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2903 'duration': fragment_ctx['duration'] / stream_timescale,
2904 })
2905 fragment_ctx['time'] += fragment_ctx['duration']
2906
2907 format_id = []
2908 if ism_id:
2909 format_id.append(ism_id)
2910 if stream_name:
2911 format_id.append(stream_name)
2912 format_id.append(compat_str(tbr))
2913
2914 if stream_type == 'text':
2915 subtitles.setdefault(stream_language, []).append({
2916 'ext': 'ismt',
2917 'protocol': 'ism',
2918 'url': ism_url,
2919 'manifest_url': ism_url,
2920 'fragments': fragments,
2921 '_download_params': {
2922 'stream_type': stream_type,
2923 'duration': duration,
2924 'timescale': stream_timescale,
2925 'fourcc': fourcc,
2926 'language': stream_language,
2927 'codec_private_data': track.get('CodecPrivateData'),
2928 }
2929 })
2930 elif stream_type in ('video', 'audio'):
2931 formats.append({
2932 'format_id': '-'.join(format_id),
2933 'url': ism_url,
2934 'manifest_url': ism_url,
2935 'ext': 'ismv' if stream_type == 'video' else 'isma',
2936 'width': width,
2937 'height': height,
2938 'tbr': tbr,
2939 'asr': sampling_rate,
2940 'vcodec': 'none' if stream_type == 'audio' else fourcc,
2941 'acodec': 'none' if stream_type == 'video' else fourcc,
2942 'protocol': 'ism',
2943 'fragments': fragments,
2944 '_download_params': {
2945 'stream_type': stream_type,
2946 'duration': duration,
2947 'timescale': stream_timescale,
2948 'width': width or 0,
2949 'height': height or 0,
2950 'fourcc': fourcc,
2951 'language': stream_language,
2952 'codec_private_data': track.get('CodecPrivateData'),
2953 'sampling_rate': sampling_rate,
2954 'channels': int_or_none(track.get('Channels', 2)),
2955 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2956 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2957 },
2958 })
2959 return formats, subtitles
2960
2961 def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
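# An illustrative sketch of the markup handled here (hypothetical URLs):
#   <video poster="/poster.jpg">
#     <source src="/video-720.mp4" type='video/mp4; codecs="avc1.64001f"' label="720p"/>
#     <track kind="subtitles" srclang="en" src="/subs.vtt"/>
#   </video>
# yields one entry with a single mp4 format (height 720 is parsed from
# the label), an absolute thumbnail URL and an English subtitle track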
2962 def absolute_url(item_url):
2963 return urljoin(base_url, item_url)
2964
2965 def parse_content_type(content_type):
2966 if not content_type:
2967 return {}
2968 ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2969 if ctr:
2970 mimetype, codecs = ctr.groups()
2971 f = parse_codecs(codecs)
2972 f['ext'] = mimetype2ext(mimetype)
2973 return f
2974 return {}
2975
2976 def _media_formats(src, cur_media_type, type_info={}):
2977 full_url = absolute_url(src)
2978 ext = type_info.get('ext') or determine_ext(full_url)
2979 if ext == 'm3u8':
2980 is_plain_url = False
2981 formats = self._extract_m3u8_formats(
2982 full_url, video_id, ext='mp4',
2983 entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2984 preference=preference, quality=quality, fatal=False)
2985 elif ext == 'mpd':
2986 is_plain_url = False
2987 formats = self._extract_mpd_formats(
2988 full_url, video_id, mpd_id=mpd_id, fatal=False)
2989 else:
2990 is_plain_url = True
2991 formats = [{
2992 'url': full_url,
2993 'vcodec': 'none' if cur_media_type == 'audio' else None,
2994 }]
2995 return is_plain_url, formats
2996
2997 entries = []
2998 # amp-video and amp-audio are very similar to their HTML5 counterparts
2999 # so we will include them right here (see
3000 # https://www.ampproject.org/docs/reference/components/amp-video)
3001 # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3002 _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
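# e.g. matches <video>, <audio>, <amp-video>, <dl8-video> and <dl8-live-video> tags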
3003 media_tags = [(media_tag, media_tag_name, media_type, '')
3004 for media_tag, media_tag_name, media_type
3005 in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3006 media_tags.extend(re.findall(
3007 # We only allow video|audio followed by whitespace or '>'.
3008 # Allowing more characters may end up in significant slow down (see
3009 # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
3010 # http://www.porntrex.com/maps/videositemap.xml).
3011 r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3012 for media_tag, _, media_type, media_content in media_tags:
3013 media_info = {
3014 'formats': [],
3015 'subtitles': {},
3016 }
3017 media_attributes = extract_attributes(media_tag)
3018 src = strip_or_none(media_attributes.get('src'))
3019 if src:
3020 _, formats = _media_formats(src, media_type)
3021 media_info['formats'].extend(formats)
3022 media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3023 if media_content:
3024 for source_tag in re.findall(r'<source[^>]+>', media_content):
3025 s_attr = extract_attributes(source_tag)
3026 # data-video-src and data-src are non-standard but seen
3027 # several times in the wild
3028 src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
3029 if not src:
3030 continue
3031 f = parse_content_type(s_attr.get('type'))
3032 is_plain_url, formats = _media_formats(src, media_type, f)
3033 if is_plain_url:
3034 # width, height, res, label and title attributes are
3035 # all non-standard but seen several times in the wild
3036 labels = [
3037 s_attr.get(lbl)
3038 for lbl in ('label', 'title')
3039 if str_or_none(s_attr.get(lbl))
3040 ]
3041 width = int_or_none(s_attr.get('width'))
3042 height = (int_or_none(s_attr.get('height'))
3043 or int_or_none(s_attr.get('res')))
3044 if not width or not height:
3045 for lbl in labels:
3046 resolution = parse_resolution(lbl)
3047 if not resolution:
3048 continue
3049 width = width or resolution.get('width')
3050 height = height or resolution.get('height')
3051 for lbl in labels:
3052 tbr = parse_bitrate(lbl)
3053 if tbr:
3054 break
3055 else:
3056 tbr = None
3057 f.update({
3058 'width': width,
3059 'height': height,
3060 'tbr': tbr,
3061 'format_id': s_attr.get('label') or s_attr.get('title'),
3062 })
3063 f.update(formats[0])
3064 media_info['formats'].append(f)
3065 else:
3066 media_info['formats'].extend(formats)
3067 for track_tag in re.findall(r'<track[^>]+>', media_content):
3068 track_attributes = extract_attributes(track_tag)
3069 kind = track_attributes.get('kind')
3070 if not kind or kind in ('subtitles', 'captions'):
3071 src = strip_or_none(track_attributes.get('src'))
3072 if not src:
3073 continue
3074 lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3075 media_info['subtitles'].setdefault(lang, []).append({
3076 'url': absolute_url(src),
3077 })
3078 for f in media_info['formats']:
3079 f.setdefault('http_headers', {})['Referer'] = base_url
3080 if media_info['formats'] or media_info['subtitles']:
3081 entries.append(media_info)
3082 return entries
3083
3084 def _extract_akamai_formats(self, *args, **kwargs):
3085 fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3086 if subs:
3087 self.report_warning(bug_reports_message(
3088 "Ignoring subtitle tracks found in the manifests; "
3089 "if any subtitle tracks are missing,"
3090 ))
3091 return fmts
3092
3093 def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3094 signed = 'hdnea=' in manifest_url
3095 if not signed:
3096 # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3097 manifest_url = re.sub(
3098 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3099 '', manifest_url).strip('?')
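# e.g. a hypothetical 'https://host/i/video/master.m3u8?b=100-1000&__b__=450'
# becomes 'https://host/i/video/master.m3u8'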
3100
3101 formats = []
3102 subtitles = {}
3103
3104 hdcore_sign = 'hdcore=3.7.0'
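# the HDS manifest URL mirrors the HLS one, e.g. (hypothetical path)
# 'https://host/i/video/master.m3u8' -> 'https://host/z/video/manifest.f4m'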
3105 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3106 hds_host = hosts.get('hds')
3107 if hds_host:
3108 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3109 if 'hdcore=' not in f4m_url:
3110 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3111 f4m_formats = self._extract_f4m_formats(
3112 f4m_url, video_id, f4m_id='hds', fatal=False)
3113 for entry in f4m_formats:
3114 entry.update({'extra_param_to_segment_url': hdcore_sign})
3115 formats.extend(f4m_formats)
3116
3117 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3118 hls_host = hosts.get('hls')
3119 if hls_host:
3120 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3121 m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3122 m3u8_url, video_id, 'mp4', 'm3u8_native',
3123 m3u8_id='hls', fatal=False)
3124 formats.extend(m3u8_formats)
3125 subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3126
3127 http_host = hosts.get('http')
3128 if http_host and m3u8_formats and not signed:
3129 REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3130 qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3131 qualities_length = len(qualities)
3132 if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3133 i = 0
3134 for f in m3u8_formats:
3135 if f['vcodec'] != 'none':
3136 for protocol in ('http', 'https'):
3137 http_f = f.copy()
3138 del http_f['manifest_url']
3139 http_url = re.sub(
3140 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
3141 http_f.update({
3142 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3143 'url': http_url,
3144 'protocol': protocol,
3145 })
3146 formats.append(http_f)
3147 i += 1
3148
3149 return formats, subtitles
3150
3151 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3152 query = compat_urlparse.urlparse(url).query
3153 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
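# e.g. for a hypothetical 'http://example.com/vod/mp4:clip.mp4/playlist.m3u8?token=x',
# the manifest suffix is stripped, '//example.com/vod/mp4:clip.mp4' is captured
# as url_base and manifest_url() below rebuilds per-protocol manifest URLs,
# re-appending the original query string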
3154 mobj = re.search(
3155 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3156 url_base = mobj.group('url')
3157 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3158 formats = []
3159
3160 def manifest_url(manifest):
3161 m_url = '%s/%s' % (http_base_url, manifest)
3162 if query:
3163 m_url += '?%s' % query
3164 return m_url
3165
3166 if 'm3u8' not in skip_protocols:
3167 formats.extend(self._extract_m3u8_formats(
3168 manifest_url('playlist.m3u8'), video_id, 'mp4',
3169 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3170 if 'f4m' not in skip_protocols:
3171 formats.extend(self._extract_f4m_formats(
3172 manifest_url('manifest.f4m'),
3173 video_id, f4m_id='hds', fatal=False))
3174 if 'dash' not in skip_protocols:
3175 formats.extend(self._extract_mpd_formats(
3176 manifest_url('manifest.mpd'),
3177 video_id, mpd_id='dash', fatal=False))
3178 if re.search(r'(?:/smil:|\.smil)', url_base):
3179 if 'smil' not in skip_protocols:
3180 rtmp_formats = self._extract_smil_formats(
3181 manifest_url('jwplayer.smil'),
3182 video_id, fatal=False)
3183 for rtmp_format in rtmp_formats:
3184 rtsp_format = rtmp_format.copy()
3185 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3186 del rtsp_format['play_path']
3187 del rtsp_format['ext']
3188 rtsp_format.update({
3189 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3190 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3191 'protocol': 'rtsp',
3192 })
3193 formats.extend([rtmp_format, rtsp_format])
3194 else:
3195 for protocol in ('rtmp', 'rtsp'):
3196 if protocol not in skip_protocols:
3197 formats.append({
3198 'url': '%s:%s' % (protocol, url_base),
3199 'format_id': protocol,
3200 'protocol': protocol,
3201 })
3202 return formats
3203
3204 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
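# a hypothetical page snippet this regex matches:
#   jwplayer("player_div").setup({"file": "http://example.com/video.mp4"})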
3205 mobj = re.search(
3206 r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3207 webpage)
3208 if mobj:
3209 try:
3210 jwplayer_data = self._parse_json(mobj.group('options'),
3211 video_id=video_id,
3212 transform_source=transform_source)
3213 except ExtractorError:
3214 pass
3215 else:
3216 if isinstance(jwplayer_data, dict):
3217 return jwplayer_data
3218
3219 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3220 jwplayer_data = self._find_jwplayer_data(
3221 webpage, video_id, transform_source=js_to_json)
3222 return self._parse_jwplayer_data(
3223 jwplayer_data, video_id, *args, **kwargs)
3224
3225 def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3226 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3227 # JWPlayer backward compatibility: flattened playlists
3228 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3229 if 'playlist' not in jwplayer_data:
3230 jwplayer_data = {'playlist': [jwplayer_data]}
3231
3232 entries = []
3233
3234 # JWPlayer backward compatibility: single playlist item
3235 # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3236 if not isinstance(jwplayer_data['playlist'], list):
3237 jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3238
3239 for video_data in jwplayer_data['playlist']:
3240 # JWPlayer backward compatibility: flattened sources
3241 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3242 if 'sources' not in video_data:
3243 video_data['sources'] = [video_data]
3244
3245 this_video_id = video_id or video_data['mediaid']
3246
3247 formats = self._parse_jwplayer_formats(
3248 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3249 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3250
3251 subtitles = {}
3252 tracks = video_data.get('tracks')
3253 if tracks and isinstance(tracks, list):
3254 for track in tracks:
3255 if not isinstance(track, dict):
3256 continue
3257 track_kind = track.get('kind')
3258 if not track_kind or not isinstance(track_kind, compat_str):
3259 continue
3260 if track_kind.lower() not in ('captions', 'subtitles'):
3261 continue
3262 track_url = urljoin(base_url, track.get('file'))
3263 if not track_url:
3264 continue
3265 subtitles.setdefault(track.get('label') or 'en', []).append({
3266 'url': self._proto_relative_url(track_url)
3267 })
3268
3269 entry = {
3270 'id': this_video_id,
3271 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3272 'description': clean_html(video_data.get('description')),
3273 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3274 'timestamp': int_or_none(video_data.get('pubdate')),
3275 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3276 'subtitles': subtitles,
3277 }
3278 # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3279 if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3280 entry.update({
3281 '_type': 'url_transparent',
3282 'url': formats[0]['url'],
3283 })
3284 else:
3285 self._sort_formats(formats)
3286 entry['formats'] = formats
3287 entries.append(entry)
3288 if len(entries) == 1:
3289 return entries[0]
3290 else:
3291 return self.playlist_result(entries)
3292
3293 def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3294 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3295 urls = []
3296 formats = []
3297 for source in jwplayer_sources_data:
3298 if not isinstance(source, dict):
3299 continue
3300 source_url = urljoin(
3301 base_url, self._proto_relative_url(source.get('file')))
3302 if not source_url or source_url in urls:
3303 continue
3304 urls.append(source_url)
3305 source_type = source.get('type') or ''
3306 ext = mimetype2ext(source_type) or determine_ext(source_url)
3307 if source_type == 'hls' or ext == 'm3u8':
3308 formats.extend(self._extract_m3u8_formats(
3309 source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3310 m3u8_id=m3u8_id, fatal=False))
3311 elif source_type == 'dash' or ext == 'mpd':
3312 formats.extend(self._extract_mpd_formats(
3313 source_url, video_id, mpd_id=mpd_id, fatal=False))
3314 elif ext == 'smil':
3315 formats.extend(self._extract_smil_formats(
3316 source_url, video_id, fatal=False))
3317 # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3318 elif source_type.startswith('audio') or ext in (
3319 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3320 formats.append({
3321 'url': source_url,
3322 'vcodec': 'none',
3323 'ext': ext,
3324 })
3325 else:
3326 height = int_or_none(source.get('height'))
3327 if height is None:
3328 # Often no height is provided but there is a label in
3329 # a format like "1080p", "720p SD", or 1080.
3330 height = int_or_none(self._search_regex(
3331 r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3332 'height', default=None))
3333 a_format = {
3334 'url': source_url,
3335 'width': int_or_none(source.get('width')),
3336 'height': height,
3337 'tbr': int_or_none(source.get('bitrate')),
3338 'ext': ext,
3339 }
3340 if source_url.startswith('rtmp'):
3341 a_format['ext'] = 'flv'
3342 # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3343 # of jwplayer.flash.swf
3344 rtmp_url_parts = re.split(
3345 r'((?:mp4|mp3|flv):)', source_url, 1)
3346 if len(rtmp_url_parts) == 3:
3347 rtmp_url, prefix, play_path = rtmp_url_parts
3348 a_format.update({
3349 'url': rtmp_url,
3350 'play_path': prefix + play_path,
3351 })
3352 if rtmp_params:
3353 a_format.update(rtmp_params)
3354 formats.append(a_format)
3355 return formats
3356
3357 def _live_title(self, name):
3358 """ Generate the title for a live video """
3359 now = datetime.datetime.now()
3360 now_str = now.strftime('%Y-%m-%d %H:%M')
3361 return name + ' ' + now_str
3362
3363 def _int(self, v, name, fatal=False, **kwargs):
3364 res = int_or_none(v, **kwargs)
3367 if res is None:
3368 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3369 if fatal:
3370 raise ExtractorError(msg)
3371 else:
3372 self.report_warning(msg)
3373 return res
3374
3375 def _float(self, v, name, fatal=False, **kwargs):
3376 res = float_or_none(v, **kwargs)
3377 if res is None:
3378 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3379 if fatal:
3380 raise ExtractorError(msg)
3381 else:
3382 self.report_warning(msg)
3383 return res
3384
3385 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3386 path='/', secure=False, discard=False, rest={}, **kwargs):
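# the positional arguments follow compat_cookiejar_Cookie
# (http.cookiejar.Cookie): version, name, value, port, port_specified,
# domain, domain_specified, domain_initial_dot, path, path_specified,
# secure, expires, discard, comment, comment_url, rest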
3387 cookie = compat_cookiejar_Cookie(
3388 0, name, value, port, port is not None, domain, True,
3389 domain.startswith('.'), path, True, secure, expire_time,
3390 discard, None, None, rest)
3391 self._downloader.cookiejar.set_cookie(cookie)
3392
3393 def _get_cookies(self, url):
3394 """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3395 req = sanitized_Request(url)
3396 self._downloader.cookiejar.add_cookie_header(req)
3397 return compat_cookies_SimpleCookie(req.get_header('Cookie'))
3398
3399 def _apply_first_set_cookie_header(self, url_handle, cookie):
3400 """
3401 Apply first Set-Cookie header instead of the last. Experimental.
3402
3403 Some sites (e.g. [1-3]) may serve two cookies under the same name
3404 in the Set-Cookie header and expect the first (old) one to be set
3405 rather than the second (new). However, per RFC 6265, the newer
3406 cookie should be the one stored, which is what actually happens.
3407 We work around this issue by manually resetting the cookie to
3408 the first one.
3409 1. https://new.vk.com/
3410 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3411 3. https://learning.oreilly.com/
3412 """
3413 for header, cookies in url_handle.headers.items():
3414 if header.lower() != 'set-cookie':
3415 continue
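# on Python 3, http.client decodes header values as latin-1, so
# re-encode and decode as UTF-8 to recover non-ASCII cookie values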
3416 if sys.version_info[0] >= 3:
3417 cookies = cookies.encode('iso-8859-1')
3418 cookies = cookies.decode('utf-8')
3419 cookie_value = re.search(
3420 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3421 if cookie_value:
3422 value, domain = cookie_value.groups()
3423 self._set_cookie(domain, cookie, value)
3424 break
3425
3426 def get_testcases(self, include_onlymatching=False):
3427 t = getattr(self, '_TEST', None)
3428 if t:
3429 assert not hasattr(self, '_TESTS'), \
3430 '%s has _TEST and _TESTS' % type(self).__name__
3431 tests = [t]
3432 else:
3433 tests = getattr(self, '_TESTS', [])
3434 for t in tests:
3435 if not include_onlymatching and t.get('only_matching', False):
3436 continue
3437 t['name'] = type(self).__name__[:-len('IE')]
3438 yield t
3439
3440 def is_suitable(self, age_limit):
3441 """ Test whether the extractor is generally suitable for the given
3442 age limit (i.e. pornographic sites are not, all others usually are) """
3443
3444 any_restricted = False
3445 for tc in self.get_testcases(include_onlymatching=False):
3446 if tc.get('playlist', []):
3447 tc = tc['playlist'][0]
3448 is_restricted = age_restricted(
3449 tc.get('info_dict', {}).get('age_limit'), age_limit)
3450 if not is_restricted:
3451 return True
3452 any_restricted = any_restricted or is_restricted
3453 return not any_restricted
3454
3455 def extract_subtitles(self, *args, **kwargs):
3456 if (self.get_param('writesubtitles', False)
3457 or self.get_param('listsubtitles')):
3458 return self._get_subtitles(*args, **kwargs)
3459 return {}
3460
3461 def _get_subtitles(self, *args, **kwargs):
3462 raise NotImplementedError('This method must be implemented by subclasses')
3463
3464 @staticmethod
3465 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3466 """ Merge subtitle items for one language. Items with duplicated URLs
3467 will be dropped. """
3468 list1_urls = {item['url'] for item in subtitle_list1}
3469 ret = list(subtitle_list1)
3470 ret.extend(item for item in subtitle_list2 if item['url'] not in list1_urls)
3471 return ret
3472
3473 @classmethod
3474 def _merge_subtitles(cls, *dicts, target=None):
3475 """ Merge subtitle dictionaries, language by language. """
3476 if target is None:
3477 target = {}
3478 for d in dicts:
3479 for lang, subs in d.items():
3480 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3481 return target
3482
3483 def extract_automatic_captions(self, *args, **kwargs):
3484 if (self.get_param('writeautomaticsub', False)
3485 or self.get_param('listsubtitles')):
3486 return self._get_automatic_captions(*args, **kwargs)
3487 return {}
3488
3489 def _get_automatic_captions(self, *args, **kwargs):
3490 raise NotImplementedError('This method must be implemented by subclasses')
3491
3492 def mark_watched(self, *args, **kwargs):
3493 if (self.get_param('mark_watched', False)
3494 and (self._get_login_info()[0] is not None
3495 or self.get_param('cookiefile') is not None)):
3496 self._mark_watched(*args, **kwargs)
3497
3498 def _mark_watched(self, *args, **kwargs):
3499 raise NotImplementedError('This method must be implemented by subclasses')
3500
3501 def geo_verification_headers(self):
3502 headers = {}
3503 geo_verification_proxy = self.get_param('geo_verification_proxy')
3504 if geo_verification_proxy:
3505 headers['Ytdl-request-proxy'] = geo_verification_proxy
3506 return headers
3507
3508 def _generic_id(self, url):
3509 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3510
3511 def _generic_title(self, url):
3512 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3513
3514 @staticmethod
3515 def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3516 all_known = all(map(
3517 lambda x: x is not None,
3518 (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3519 return (
3520 'private' if is_private
3521 else 'premium_only' if needs_premium
3522 else 'subscriber_only' if needs_subscription
3523 else 'needs_auth' if needs_auth
3524 else 'unlisted' if is_unlisted
3525 else 'public' if all_known
3526 else None)
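# A worked example (illustrative):
#   _availability(is_private=False, needs_premium=False, needs_subscription=False,
#                 needs_auth=False, is_unlisted=False) -> 'public'
#   _availability(is_unlisted=True) -> 'unlisted'
#   _availability() -> None, since not every flag is known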
3527
3528 def _configuration_arg(self, key, default=NO_DEFAULT, casesense=False):
3529 '''
3530 @returns A list of values for the extractor argument given by "key"
3531 or "default" if no such key is present
3532 @param default The default value to return when the key is not present (default: [])
3533 @param casesense When false, the values are converted to lower case
3534 '''
3535 val = traverse_obj(
3536 self._downloader.params, ('extractor_args', self.ie_key().lower(), key))
3537 if val is None:
3538 return [] if default is NO_DEFAULT else default
3539 return list(val) if casesense else [x.lower() for x in val]
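# Usage sketch (illustrative values): with
#   --extractor-args "youtube:player_client=android"
# an extractor whose ie_key() is 'Youtube' would get
#   self._configuration_arg('player_client') == ['android']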
3540
3541
3542 class SearchInfoExtractor(InfoExtractor):
3543 """
3544 Base class for paged search queries extractors.
3545 They accept URLs in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
3546 Instances should define _SEARCH_KEY and _MAX_RESULTS.
3547 """
3548
3549 @classmethod
3550 def _make_valid_url(cls):
3551 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3552
3553 @classmethod
3554 def suitable(cls, url):
3555 return re.match(cls._make_valid_url(), url) is not None
3556
3557 def _real_extract(self, query):
3558 mobj = re.match(self._make_valid_url(), query)
3559 if mobj is None:
3560 raise ExtractorError('Invalid search query "%s"' % query)
3561
3562 prefix = mobj.group('prefix')
3563 query = mobj.group('query')
3564 if prefix == '':
3565 return self._get_n_results(query, 1)
3566 elif prefix == 'all':
3567 return self._get_n_results(query, self._MAX_RESULTS)
3568 else:
3569 n = int(prefix)
3570 if n <= 0:
3571 raise ExtractorError('Invalid download number %s for query "%s"' % (n, query))
3572 elif n > self._MAX_RESULTS:
3573 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3574 n = self._MAX_RESULTS
3575 return self._get_n_results(query, n)
3576
3577 def _get_n_results(self, query, n):
3578 """Get a specified number of results for a query"""
3579 raise NotImplementedError('This method must be implemented by subclasses')
3580
3581 @property
3582 def SEARCH_KEY(self):
3583 return self._SEARCH_KEY