yt_dlp/extractor/common.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import base64
   5 import datetime
   6 import hashlib
   7 import json
   8 import netrc
   9 import os
  10 import random
  11 import re
  12 import sys
  13 import time
  14 import math
  15
  16 from ..compat import (
  17     compat_cookiejar_Cookie,
  18     compat_cookies_SimpleCookie,
  19     compat_etree_Element,
  20     compat_etree_fromstring,
  21     compat_getpass,
  22     compat_http_client,
  23     compat_os_name,
  24     compat_str,
  25     compat_urllib_error,
  26     compat_urllib_parse_unquote,
  27     compat_urllib_parse_urlencode,
  28     compat_urllib_request,
  29     compat_urlparse,
  30     compat_xml_parse_error,
  31 )
  32 from ..downloader import FileDownloader
  33 from ..downloader.f4m import (
  34     get_base_url,
  35     remove_encrypted_media,
  36 )
  37 from ..utils import (
  38     NO_DEFAULT,
  39     age_restricted,
  40     base_url,
  41     bug_reports_message,
  42     clean_html,
  43     compiled_regex_type,
  44     determine_ext,
  45     determine_protocol,
  46     dict_get,
  47     error_to_compat_str,
  48     ExtractorError,
  49     extract_attributes,
  50     fix_xml_ampersands,
  51     float_or_none,
  52     GeoRestrictedError,
  53     GeoUtils,
  54     int_or_none,
  55     js_to_json,
  56     JSON_LD_RE,
  57     mimetype2ext,
  58     network_exceptions,
  59     orderedSet,
  60     parse_bitrate,
  61     parse_codecs,
  62     parse_duration,
  63     parse_iso8601,
  64     parse_m3u8_attributes,
  65     parse_resolution,
  66     RegexNotFoundError,
  67     sanitized_Request,
  68     sanitize_filename,
  69     str_or_none,
  70     str_to_int,
  71     strip_or_none,
  72     traverse_obj,
  73     unescapeHTML,
  74     unified_strdate,
  75     unified_timestamp,
  76     update_Request,
  77     update_url_query,
  78     urljoin,
  79     url_basename,
  80     url_or_none,
  81     variadic,
  82     xpath_element,
  83     xpath_text,
  84     xpath_with_ns,
  85 )
  86
  87
  88 class InfoExtractor(object):
  89     """Information Extractor class.
  90
  91     Information extractors are the classes that, given a URL, extract
  92     information about the video (or videos) the URL refers to. This
  93     information includes the real video URL, the video title, author and
  94     others. The information is stored in a dictionary which is then
  95     passed to the YoutubeDL. The YoutubeDL processes this
  96     information possibly downloading the video to the file system, among
  97     other possible outcomes.
  98
  99     The type field determines the type of the result.
 100     By far the most common value (and the default if _type is missing) is
 101     "video", which indicates a single video.
 102
 103     For a video, the dictionaries must include the following fields:
 104
 105     id:             Video identifier.
 106     title:          Video title, unescaped.
 107
 108     Additionally, it must contain either a formats entry or a url one:
 109
 110     formats:        A list of dictionaries for each format available, ordered
 111                     from worst to best quality.
 112
 113                     Potential fields:
 114                     * url        The mandatory URL representing the media:
 115                                    for plain file media - HTTP URL of this file,
 116                                    for RTMP - RTMP URL,
 117                                    for HLS - URL of the M3U8 media playlist,
 118                                    for HDS - URL of the F4M manifest,
 119                                    for DASH
 120                                      - HTTP URL to plain file media (in case of
 121                                        unfragmented media)
 122                                      - URL of the MPD manifest or base URL
 123                                        representing the media if MPD manifest
 124                                        is parsed from a string (in case of
 125                                        fragmented media)
 126                                    for MSS - URL of the ISM manifest.
 127                     * manifest_url
 128                                  The URL of the manifest file in case of
 129                                  fragmented media:
 130                                    for HLS - URL of the M3U8 master playlist,
 131                                    for HDS - URL of the F4M manifest,
 132                                    for DASH - URL of the MPD manifest,
 133                                    for MSS - URL of the ISM manifest.
 134                     * ext        Will be calculated from URL if missing
 135                     * format     A human-readable description of the format
 136                                  ("mp4 container with h264/opus").
  137                                  Calculated from the format_id, width, height,
 138                                  and format_note fields if missing.
 139                     * format_id  A short description of the format
 140                                  ("mp4_h264_opus" or "19").
 141                                 Technically optional, but strongly recommended.
 142                     * format_note Additional info about the format
 143                                  ("3D" or "DASH video")
 144                     * width      Width of the video, if known
 145                     * height     Height of the video, if known
 146                     * resolution Textual description of width and height
 147                     * tbr        Average bitrate of audio and video in KBit/s
 148                     * abr        Average audio bitrate in KBit/s
 149                     * acodec     Name of the audio codec in use
 150                     * asr        Audio sampling rate in Hertz
 151                     * vbr        Average video bitrate in KBit/s
 152                     * fps        Frame rate
 153                     * vcodec     Name of the video codec in use
 154                     * container  Name of the container format
 155                     * filesize   The number of bytes, if known in advance
 156                     * filesize_approx  An estimate for the number of bytes
 157                     * player_url SWF Player URL (used for rtmpdump).
 158                     * protocol   The protocol that will be used for the actual
 159                                  download, lower-case.
 160                                  "http", "https", "rtsp", "rtmp", "rtmp_ffmpeg", "rtmpe",
 161                                  "m3u8", "m3u8_native" or "http_dash_segments".
 162                     * fragment_base_url
 163                                  Base URL for fragments. Each fragment's path
 164                                  value (if present) will be relative to
 165                                  this URL.
 166                     * fragments  A list of fragments of a fragmented media.
 167                                  Each fragment entry must contain either an url
 168                                  or a path. If an url is present it should be
 169                                  considered by a client. Otherwise both path and
 170                                  fragment_base_url must be present. Here is
 171                                  the list of all potential fields:
 172                                  * "url" - fragment's URL
 173                                  * "path" - fragment's path relative to
 174                                             fragment_base_url
 175                                  * "duration" (optional, int or float)
 176                                  * "filesize" (optional, int)
 177                     * preference Order number of this format. If this field is
 178                                  present and not None, the formats get sorted
 179                                  by this field, regardless of all other values.
 180                                  -1 for default (order by other properties),
 181                                  -2 or smaller for less than default.
 182                                  < -1000 to hide the format (if there is
 183                                     another one which is strictly better)
 184                     * language   Language code, e.g. "de" or "en-US".
 185                     * language_preference  Is this in the language mentioned in
 186                                  the URL?
 187                                  10 if it's what the URL is about,
 188                                  -1 for default (don't know),
 189                                  -10 otherwise, other values reserved for now.
 190                     * quality    Order number of the video quality of this
 191                                  format, irrespective of the file format.
 192                                  -1 for default (order by other properties),
 193                                  -2 or smaller for less than default.
 194                     * source_preference  Order number for this video source
 195                                   (quality takes higher priority)
 196                                  -1 for default (order by other properties),
 197                                  -2 or smaller for less than default.
 198                     * http_headers  A dictionary of additional HTTP headers
 199                                  to add to the request.
 200                     * stretched_ratio  If given and not 1, indicates that the
 201                                  video's pixels are not square.
 202                                  width : height ratio as float.
 203                     * no_resume  The server does not support resuming the
 204                                  (HTTP or RTMP) download. Boolean.
 205                     * downloader_options  A dictionary of downloader options as
 206                                  described in FileDownloader
 207                     RTMP formats can also have the additional fields: page_url,
 208                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 209                     rtmp_protocol, rtmp_real_time
 210
 211     url:            Final video URL.
 212     ext:            Video filename extension.
 213     format:         The video format, defaults to ext (used for --get-format)
 214     player_url:     SWF Player URL (used for rtmpdump).
 215
 216     The following fields are optional:
 217
 218     alt_title:      A secondary title of the video.
 219     display_id      An alternative identifier for the video, not necessarily
 220                     unique, but available before title. Typically, id is
 221                     something like "4234987", title "Dancing naked mole rats",
 222                     and display_id "dancing-naked-mole-rats"
 223     thumbnails:     A list of dictionaries, with the following entries:
 224                         * "id" (optional, string) - Thumbnail format ID
 225                         * "url"
 226                         * "preference" (optional, int) - quality of the image
 227                         * "width" (optional, int)
 228                         * "height" (optional, int)
 229                         * "resolution" (optional, string "{width}x{height}",
 230                                         deprecated)
 231                         * "filesize" (optional, int)
 232                         * "_test_url" (optional, bool) - If true, test the URL
 233     thumbnail:      Full URL to a video thumbnail image.
 234     description:    Full video description.
 235     uploader:       Full name of the video uploader.
 236     license:        License name the video is licensed under.
 237     creator:        The creator of the video.
 238     release_timestamp: UNIX timestamp of the moment the video was released.
 239     release_date:   The date (YYYYMMDD) when the video was released.
 240     timestamp:      UNIX timestamp of the moment the video was uploaded
 241     upload_date:    Video upload date (YYYYMMDD).
 242                     If not explicitly set, calculated from timestamp.
 243     uploader_id:    Nickname or id of the video uploader.
 244     uploader_url:   Full URL to a personal webpage of the video uploader.
 245     channel:        Full name of the channel the video is uploaded on.
 246                     Note that channel fields may or may not repeat uploader
 247                     fields. This depends on a particular extractor.
 248     channel_id:     Id of the channel.
 249     channel_url:    Full URL to a channel webpage.
 250     location:       Physical location where the video was filmed.
 251     subtitles:      The available subtitles as a dictionary in the format
 252                     {tag: subformats}. "tag" is usually a language code, and
 253                     "subformats" is a list sorted from lower to higher
 254                     preference, each element is a dictionary with the "ext"
 255                     entry and one of:
 256                         * "data": The subtitles file contents
 257                         * "url": A URL pointing to the subtitles file
 258                     It can optionally also have:
 259                         * "name": Name or description of the subtitles
 260                     "ext" will be calculated from URL if missing
 261     automatic_captions: Like 'subtitles'; contains automatically generated
 262                     captions instead of normal subtitles
 263     duration:       Length of the video in seconds, as an integer or float.
 264     view_count:     How many users have watched the video on the platform.
 265     like_count:     Number of positive ratings of the video
 266     dislike_count:  Number of negative ratings of the video
 267     repost_count:   Number of reposts of the video
  268     average_rating: Average rating given by users, the scale used depends on the webpage
 269     comment_count:  Number of comments on the video
 270     comments:       A list of comments, each with one or more of the following
 271                     properties (all but one of text or html optional):
 272                         * "author" - human-readable name of the comment author
 273                         * "author_id" - user ID of the comment author
 274                         * "author_thumbnail" - The thumbnail of the comment author
 275                         * "id" - Comment ID
 276                         * "html" - Comment as HTML
 277                         * "text" - Plain text of the comment
 278                         * "timestamp" - UNIX timestamp of comment
 279                         * "parent" - ID of the comment this one is replying to.
 280                                      Set to "root" to indicate that this is a
 281                                      comment to the original video.
 282                         * "like_count" - Number of positive ratings of the comment
 283                         * "dislike_count" - Number of negative ratings of the comment
 284                         * "is_favorited" - Whether the comment is marked as
 285                                            favorite by the video uploader
 286                         * "author_is_uploader" - Whether the comment is made by
 287                                                  the video uploader
 288     age_limit:      Age restriction for the video, as an integer (years)
 289     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 290                     should allow to get the same result again. (It will be set
 291                     by YoutubeDL if it's missing)
 292     categories:     A list of categories that the video falls in, for example
 293                     ["Sports", "Berlin"]
 294     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 295     cast:           A list of the video cast
 296     is_live:        True, False, or None (=unknown). Whether this video is a
 297                     live stream that goes on instead of a fixed-length video.
 298     was_live:       True, False, or None (=unknown). Whether this video was
 299                     originally a live stream.
 300     live_status:    'is_live', 'upcoming', 'was_live', 'not_live' or None (=unknown)
 301                     If absent, automatically set from is_live, was_live
 302     start_time:     Time in seconds where the reproduction should start, as
 303                     specified in the URL.
 304     end_time:       Time in seconds where the reproduction should end, as
 305                     specified in the URL.
 306     chapters:       A list of dictionaries, with the following entries:
 307                         * "start_time" - The start time of the chapter in seconds
 308                         * "end_time" - The end time of the chapter in seconds
 309                         * "title" (optional, string)
 310     playable_in_embed: Whether this video is allowed to play in embedded
 311                     players on other sites. Can be True (=always allowed),
 312                     False (=never allowed), None (=unknown), or a string
 313                     specifying the criteria for embedability (Eg: 'whitelist')
 314     availability:   Under what condition the video is available. One of
 315                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 316                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 317                     to set it
 318     __post_extractor: A function to be called just before the metadata is
 319                     written to either disk, logger or console. The function
 320                     must return a dict which will be added to the info_dict.
  321                     This is useful for additional information that is
 322                     time-consuming to extract. Note that the fields thus
 323                     extracted will not be available to output template and
 324                     match_filter. So, only "comments" and "comment_count" are
 325                     currently allowed to be extracted via this method.
 326
 327     The following fields should only be used when the video belongs to some logical
 328     chapter or section:
 329
 330     chapter:        Name or title of the chapter the video belongs to.
 331     chapter_number: Number of the chapter the video belongs to, as an integer.
 332     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 333
 334     The following fields should only be used when the video is an episode of some
 335     series, programme or podcast:
 336
 337     series:         Title of the series or programme the video episode belongs to.
 338     season:         Title of the season the video episode belongs to.
 339     season_number:  Number of the season the video episode belongs to, as an integer.
 340     season_id:      Id of the season the video episode belongs to, as a unicode string.
 341     episode:        Title of the video episode. Unlike mandatory video title field,
 342                     this field should denote the exact title of the video episode
 343                     without any kind of decoration.
 344     episode_number: Number of the video episode within a season, as an integer.
 345     episode_id:     Id of the video episode, as a unicode string.
 346
 347     The following fields should only be used when the media is a track or a part of
 348     a music album:
 349
 350     track:          Title of the track.
 351     track_number:   Number of the track within an album or a disc, as an integer.
 352     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 353                     as a unicode string.
 354     artist:         Artist(s) of the track.
 355     genre:          Genre(s) of the track.
 356     album:          Title of the album the track belongs to.
 357     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
  358     album_artist:   List of all artists that appeared on the album (e.g.
 359                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 360                     and compilations).
 361     disc_number:    Number of the disc or other physical medium the track belongs to,
 362                     as an integer.
 363     release_year:   Year (YYYY) when the album was released.
 364
 365     Unless mentioned otherwise, the fields should be Unicode strings.
 366
 367     Unless mentioned otherwise, None is equivalent to absence of information.
 368
 369
 370     _type "playlist" indicates multiple videos.
 371     There must be a key "entries", which is a list, an iterable, or a PagedList
 372     object, each element of which is a valid dictionary by this specification.
 373
  374     Additionally, playlists can have "id", "title", and any other relevant
 375     attributes with the same semantics as videos (see above).
 376
 377
 378     _type "multi_video" indicates that there are multiple videos that
  379     form a single show, for example multiple acts of an opera or TV episode.
 380     It must have an entries key like a playlist and contain all the keys
 381     required for a video at the same time.
 382
 383
 384     _type "url" indicates that the video must be extracted from another
 385     location, possibly by a different extractor. Its only required key is:
 386     "url" - the next URL to extract.
 387     The key "ie_key" can be set to the class name (minus the trailing "IE",
 388     e.g. "Youtube") if the extractor class is known in advance.
 389     Additionally, the dictionary may have any properties of the resolved entity
 390     known in advance, for example "title" if the title of the referred video is
 391     known ahead of time.
 392
 393
 394     _type "url_transparent" entities have the same specification as "url", but
 395     indicate that the given additional information is more precise than the one
 396     associated with the resolved URL.
 397     This is useful when a site employs a video service that hosts the video and
 398     its technical metadata, but that video service does not embed a useful
 399     title, description etc.
 400
 401
 402     Subclasses of this one should re-define the _real_initialize() and
 403     _real_extract() methods and define a _VALID_URL regexp.
 404     Probably, they should also be added to the list of extractors.
 405
 406     _GEO_BYPASS attribute may be set to False in order to disable
 407     geo restriction bypass mechanisms for a particular extractor.
 408     Though it won't disable explicit geo restriction bypass based on
 409     country code provided with geo_bypass_country.
 410
 411     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 412     countries for this extractor. One of these countries will be used by
 413     geo restriction bypass mechanism right away in order to bypass
 414     geo restriction, of course, if the mechanism is not disabled.
 415
 416     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 417     IP blocks in CIDR notation for this extractor. One of these IP blocks
 418     will be used by geo restriction bypass mechanism similarly
 419     to _GEO_COUNTRIES.
 420
 421     Finally, the _WORKING attribute should be set to False for broken IEs
 422     in order to warn the users and skip the tests.
 423     """
 424
    # Class-level defaults for per-instance state: __init__ resets _ready and
    # _x_forwarded_for_ip on every instance, and set_downloader() assigns
    # _downloader.
    _ready = False
    _downloader = None
    _x_forwarded_for_ip = None
    # Geo restriction bypass configuration; subclasses may override these
    # (see the class docstring for their meaning).
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    # Returned by working(); set to False in subclasses for broken extractors
    # (see the class docstring).
    _WORKING = True

    # Hint messages telling the user how to provide account credentials,
    # keyed by the kind of credentials accepted ('any', 'cookies', 'password').
    _LOGIN_HINTS = {
        'any': 'Use --cookies, --username and --password or --netrc to provide account credentials',
        'cookies': (
            'Use --cookies for the authentication. '
            'See  https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl  for how to pass cookies'),
        'password': 'Use --username and --password or --netrc to provide account credentials',
    }
 440
 441     def __init__(self, downloader=None):
 442         """Constructor. Receives an optional downloader."""
 443         self._ready = False
 444         self._x_forwarded_for_ip = None
 445         self.set_downloader(downloader)
 446
 447     @classmethod
 448     def suitable(cls, url):
 449         """Receives a URL and returns True if suitable for this IE."""
 450
 451         # This does not use has/getattr intentionally - we want to know whether
 452         # we have cached the regexp for *this* class, whereas getattr would also
 453         # match the superclass
 454         if '_VALID_URL_RE' not in cls.__dict__:
 455             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 456         return cls._VALID_URL_RE.match(url) is not None
 457
 458     @classmethod
 459     def _match_id(cls, url):
 460         if '_VALID_URL_RE' not in cls.__dict__:
 461             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 462         m = cls._VALID_URL_RE.match(url)
 463         assert m
 464         return compat_str(m.group('id'))
 465
 466     @classmethod
 467     def working(cls):
 468         """Getter method for _WORKING."""
 469         return cls._WORKING
 470
 471     def initialize(self):
 472         """Initializes an instance (authentication, etc)."""
 473         self._initialize_geo_bypass({
 474             'countries': self._GEO_COUNTRIES,
 475             'ip_blocks': self._GEO_IP_BLOCKS,
 476         })
 477         if not self._ready:
 478             self._real_initialize()
 479             self._ready = True
 480
 481     def _initialize_geo_bypass(self, geo_bypass_context):
 482         """
 483         Initialize geo restriction bypass mechanism.
 484
 485         This method is used to initialize geo bypass mechanism based on faking
 486         X-Forwarded-For HTTP header. A random country from provided country list
 487         is selected and a random IP belonging to this country is generated. This
 488         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 489         HTTP requests.
 490
 491         This method will be used for initial geo bypass mechanism initialization
 492         during the instance initialization with _GEO_COUNTRIES and
 493         _GEO_IP_BLOCKS.
 494
 495         You may also manually call it from extractor's code if geo bypass
 496         information is not available beforehand (e.g. obtained during
 497         extraction) or due to some other reason. In this case you should pass
 498         this information in geo bypass context passed as first argument. It may
 499         contain following fields:
 500
 501         countries:  List of geo unrestricted countries (similar
 502                     to _GEO_COUNTRIES)
 503         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 504                     (similar to _GEO_IP_BLOCKS)
 505
 506         """
 507         if not self._x_forwarded_for_ip:
 508
 509             # Geo bypass mechanism is explicitly disabled by user
 510             if not self.get_param('geo_bypass', True):
 511                 return
 512
 513             if not geo_bypass_context:
 514                 geo_bypass_context = {}
 515
 516             # Backward compatibility: previously _initialize_geo_bypass
 517             # expected a list of countries, some 3rd party code may still use
 518             # it this way
 519             if isinstance(geo_bypass_context, (list, tuple)):
 520                 geo_bypass_context = {
 521                     'countries': geo_bypass_context,
 522                 }
 523
 524             # The whole point of geo bypass mechanism is to fake IP
 525             # as X-Forwarded-For HTTP header based on some IP block or
 526             # country code.
 527
 528             # Path 1: bypassing based on IP block in CIDR notation
 529
 530             # Explicit IP block specified by user, use it right away
 531             # regardless of whether extractor is geo bypassable or not
 532             ip_block = self.get_param('geo_bypass_ip_block', None)
 533
 534             # Otherwise use random IP block from geo bypass context but only
 535             # if extractor is known as geo bypassable
 536             if not ip_block:
 537                 ip_blocks = geo_bypass_context.get('ip_blocks')
 538                 if self._GEO_BYPASS and ip_blocks:
 539                     ip_block = random.choice(ip_blocks)
 540
 541             if ip_block:
 542                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 543                 self._downloader.write_debug(
 544                     '[debug] Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
 545                 return
 546
 547             # Path 2: bypassing based on country code
 548
 549             # Explicit country code specified by user, use it right away
 550             # regardless of whether extractor is geo bypassable or not
 551             country = self.get_param('geo_bypass_country', None)
 552
 553             # Otherwise use random country code from geo bypass context but
 554             # only if extractor is known as geo bypassable
 555             if not country:
 556                 countries = geo_bypass_context.get('countries')
 557                 if self._GEO_BYPASS and countries:
 558                     country = random.choice(countries)
 559
 560             if country:
 561                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 562                 self._downloader.write_debug(
 563                     'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))
 564
 565     def extract(self, url):
 566         """Extracts URL information and returns it in list of dicts."""
 567         try:
 568             for _ in range(2):
 569                 try:
 570                     self.initialize()
 571                     self.write_debug('Extracting URL: %s' % url)
 572                     ie_result = self._real_extract(url)
 573                     if ie_result is None:
 574                         return None
 575                     if self._x_forwarded_for_ip:
 576                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 577                     subtitles = ie_result.get('subtitles')
 578                     if (subtitles and 'live_chat' in subtitles
 579                             and 'no-live-chat' in self.get_param('compat_opts', [])):
 580                         del subtitles['live_chat']
 581                     return ie_result
 582                 except GeoRestrictedError as e:
 583                     if self.__maybe_fake_ip_and_retry(e.countries):
 584                         continue
 585                     raise
 586         except ExtractorError:
 587             raise
 588         except compat_http_client.IncompleteRead as e:
 589             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 590         except (KeyError, StopIteration) as e:
 591             raise ExtractorError('An extractor error has occurred.', cause=e)
 592
 593     def __maybe_fake_ip_and_retry(self, countries):
 594         if (not self.get_param('geo_bypass_country', None)
 595                 and self._GEO_BYPASS
 596                 and self.get_param('geo_bypass', True)
 597                 and not self._x_forwarded_for_ip
 598                 and countries):
 599             country_code = random.choice(countries)
 600             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 601             if self._x_forwarded_for_ip:
 602                 self.report_warning(
 603                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 604                     % (self._x_forwarded_for_ip, country_code.upper()))
 605                 return True
 606         return False
 607
 608     def set_downloader(self, downloader):
 609         """Sets the downloader for this IE."""
 610         self._downloader = downloader
 611
 612     def _real_initialize(self):
 613         """Real initialization process. Redefine in subclasses."""
 614         pass
 615
 616     def _real_extract(self, url):
 617         """Real extraction process. Redefine in subclasses."""
 618         pass
 619
 620     @classmethod
 621     def ie_key(cls):
 622         """A string for getting the InfoExtractor with get_info_extractor"""
 623         return compat_str(cls.__name__[:-2])
 624
 625     @property
 626     def IE_NAME(self):
 627         return compat_str(type(self).__name__[:-2])
 628
 629     @staticmethod
 630     def __can_accept_status_code(err, expected_status):
 631         assert isinstance(err, compat_urllib_error.HTTPError)
 632         if expected_status is None:
 633             return False
 634         elif callable(expected_status):
 635             return expected_status(err.code) is True
 636         else:
 637             return err.code in variadic(expected_status)
 638
 639     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
 640         """
 641         Return the response handle.
 642
 643         See _download_webpage docstring for arguments specification.
 644         """
 645         if not self._downloader._first_webpage_request:
 646             sleep_interval = float_or_none(self.get_param('sleep_interval_requests')) or 0
 647             if sleep_interval > 0:
 648                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 649                 time.sleep(sleep_interval)
 650         else:
 651             self._downloader._first_webpage_request = False
 652
 653         if note is None:
 654             self.report_download_webpage(video_id)
 655         elif note is not False:
 656             if video_id is None:
 657                 self.to_screen('%s' % (note,))
 658             else:
 659                 self.to_screen('%s: %s' % (video_id, note))
 660
 661         # Some sites check X-Forwarded-For HTTP header in order to figure out
 662         # the origin of the client behind proxy. This allows bypassing geo
 663         # restriction by faking this header's value to IP that belongs to some
 664         # geo unrestricted country. We will do so once we encounter any
 665         # geo restriction error.
 666         if self._x_forwarded_for_ip:
 667             if 'X-Forwarded-For' not in headers:
 668                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 669
 670         if isinstance(url_or_request, compat_urllib_request.Request):
 671             url_or_request = update_Request(
 672                 url_or_request, data=data, headers=headers, query=query)
 673         else:
 674             if query:
 675                 url_or_request = update_url_query(url_or_request, query)
 676             if data is not None or headers:
 677                 url_or_request = sanitized_Request(url_or_request, data, headers)
 678         try:
 679             return self._downloader.urlopen(url_or_request)
 680         except network_exceptions as err:
 681             if isinstance(err, compat_urllib_error.HTTPError):
 682                 if self.__can_accept_status_code(err, expected_status):
 683                     # Retain reference to error to prevent file object from
 684                     # being closed before it can be read. Works around the
 685                     # effects of <https://bugs.python.org/issue15002>
 686                     # introduced in Python 3.4.1.
 687                     err.fp._error = err
 688                     return err.fp
 689
 690             if errnote is False:
 691                 return False
 692             if errnote is None:
 693                 errnote = 'Unable to download webpage'
 694
 695             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 696             if fatal:
 697                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 698             else:
 699                 self.report_warning(errmsg)
 700                 return False
 701
 702     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
 703         """
 704         Return a tuple (page content as string, URL handle).
 705
 706         See _download_webpage docstring for arguments specification.
 707         """
 708         # Strip hashes from the URL (#1038)
 709         if isinstance(url_or_request, (compat_str, str)):
 710             url_or_request = url_or_request.partition('#')[0]
 711
 712         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 713         if urlh is False:
 714             assert not fatal
 715             return False
 716         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 717         return (content, urlh)
 718
 719     @staticmethod
 720     def _guess_encoding_from_content(content_type, webpage_bytes):
 721         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 722         if m:
 723             encoding = m.group(1)
 724         else:
 725             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 726                           webpage_bytes[:1024])
 727             if m:
 728                 encoding = m.group(1).decode('ascii')
 729             elif webpage_bytes.startswith(b'\xff\xfe'):
 730                 encoding = 'utf-16'
 731             else:
 732                 encoding = 'utf-8'
 733
 734         return encoding
 735
 736     def __check_blocked(self, content):
 737         first_block = content[:512]
 738         if ('<title>Access to this site is blocked</title>' in content
 739                 and 'Websense' in first_block):
 740             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 741             blocked_iframe = self._html_search_regex(
 742                 r'<iframe src="([^"]+)"', content,
 743                 'Websense information URL', default=None)
 744             if blocked_iframe:
 745                 msg += ' Visit %s for more details' % blocked_iframe
 746             raise ExtractorError(msg, expected=True)
 747         if '<title>The URL you requested has been blocked</title>' in first_block:
 748             msg = (
 749                 'Access to this webpage has been blocked by Indian censorship. '
 750                 'Use a VPN or proxy server (with --proxy) to route around it.')
 751             block_msg = self._html_search_regex(
 752                 r'</h1><p>(.*?)</p>',
 753                 content, 'block message', default=None)
 754             if block_msg:
 755                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 756             raise ExtractorError(msg, expected=True)
 757         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 758                 and 'blocklist.rkn.gov.ru' in content):
 759             raise ExtractorError(
 760                 'Access to this webpage has been blocked by decision of the Russian government. '
 761                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 762                 expected=True)
 763
 764     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 765         content_type = urlh.headers.get('Content-Type', '')
 766         webpage_bytes = urlh.read()
 767         if prefix is not None:
 768             webpage_bytes = prefix + webpage_bytes
 769         if not encoding:
 770             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 771         if self.get_param('dump_intermediate_pages', False):
 772             self.to_screen('Dumping request to ' + urlh.geturl())
 773             dump = base64.b64encode(webpage_bytes).decode('ascii')
 774             self._downloader.to_screen(dump)
 775         if self.get_param('write_pages', False):
 776             basen = '%s_%s' % (video_id, urlh.geturl())
 777             if len(basen) > 240:
 778                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 779                 basen = basen[:240 - len(h)] + h
 780             raw_filename = basen + '.dump'
 781             filename = sanitize_filename(raw_filename, restricted=True)
 782             self.to_screen('Saving request to ' + filename)
 783             # Working around MAX_PATH limitation on Windows (see
 784             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 785             if compat_os_name == 'nt':
 786                 absfilepath = os.path.abspath(filename)
 787                 if len(absfilepath) > 259:
 788                     filename = '\\\\?\\' + absfilepath
 789             with open(filename, 'wb') as outf:
 790                 outf.write(webpage_bytes)
 791
 792         try:
 793             content = webpage_bytes.decode(encoding, 'replace')
 794         except LookupError:
 795             content = webpage_bytes.decode('utf-8', 'replace')
 796
 797         self.__check_blocked(content)
 798
 799         return content
 800
 801     def _download_webpage(
 802             self, url_or_request, video_id, note=None, errnote=None,
 803             fatal=True, tries=1, timeout=5, encoding=None, data=None,
 804             headers={}, query={}, expected_status=None):
 805         """
 806         Return the data of the page as a string.
 807
 808         Arguments:
 809         url_or_request -- plain text URL as a string or
 810             a compat_urllib_request.Requestobject
 811         video_id -- Video/playlist/item identifier (string)
 812
 813         Keyword arguments:
 814         note -- note printed before downloading (string)
 815         errnote -- note printed in case of an error (string)
 816         fatal -- flag denoting whether error should be considered fatal,
 817             i.e. whether it should cause ExtractionError to be raised,
 818             otherwise a warning will be reported and extraction continued
 819         tries -- number of tries
 820         timeout -- sleep interval between tries
 821         encoding -- encoding for a page content decoding, guessed automatically
 822             when not explicitly specified
 823         data -- POST data (bytes)
 824         headers -- HTTP headers (dict)
 825         query -- URL query (dict)
 826         expected_status -- allows to accept failed HTTP requests (non 2xx
 827             status code) by explicitly specifying a set of accepted status
 828             codes. Can be any of the following entities:
 829                 - an integer type specifying an exact failed status code to
 830                   accept
 831                 - a list or a tuple of integer types specifying a list of
 832                   failed status codes to accept
 833                 - a callable accepting an actual failed status code and
 834                   returning True if it should be accepted
 835             Note that this argument does not affect success status codes (2xx)
 836             which are always accepted.
 837         """
 838
 839         success = False
 840         try_count = 0
 841         while success is False:
 842             try:
 843                 res = self._download_webpage_handle(
 844                     url_or_request, video_id, note, errnote, fatal,
 845                     encoding=encoding, data=data, headers=headers, query=query,
 846                     expected_status=expected_status)
 847                 success = True
 848             except compat_http_client.IncompleteRead as e:
 849                 try_count += 1
 850                 if try_count >= tries:
 851                     raise e
 852                 self._sleep(timeout, video_id)
 853         if res is False:
 854             return res
 855         else:
 856             content, _ = res
 857             return content
 858
 859     def _download_xml_handle(
 860             self, url_or_request, video_id, note='Downloading XML',
 861             errnote='Unable to download XML', transform_source=None,
 862             fatal=True, encoding=None, data=None, headers={}, query={},
 863             expected_status=None):
 864         """
 865         Return a tuple (xml as an compat_etree_Element, URL handle).
 866
 867         See _download_webpage docstring for arguments specification.
 868         """
 869         res = self._download_webpage_handle(
 870             url_or_request, video_id, note, errnote, fatal=fatal,
 871             encoding=encoding, data=data, headers=headers, query=query,
 872             expected_status=expected_status)
 873         if res is False:
 874             return res
 875         xml_string, urlh = res
 876         return self._parse_xml(
 877             xml_string, video_id, transform_source=transform_source,
 878             fatal=fatal), urlh
 879
 880     def _download_xml(
 881             self, url_or_request, video_id,
 882             note='Downloading XML', errnote='Unable to download XML',
 883             transform_source=None, fatal=True, encoding=None,
 884             data=None, headers={}, query={}, expected_status=None):
 885         """
 886         Return the xml as an compat_etree_Element.
 887
 888         See _download_webpage docstring for arguments specification.
 889         """
 890         res = self._download_xml_handle(
 891             url_or_request, video_id, note=note, errnote=errnote,
 892             transform_source=transform_source, fatal=fatal, encoding=encoding,
 893             data=data, headers=headers, query=query,
 894             expected_status=expected_status)
 895         return res if res is False else res[0]
 896
 897     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 898         if transform_source:
 899             xml_string = transform_source(xml_string)
 900         try:
 901             return compat_etree_fromstring(xml_string.encode('utf-8'))
 902         except compat_xml_parse_error as ve:
 903             errmsg = '%s: Failed to parse XML ' % video_id
 904             if fatal:
 905                 raise ExtractorError(errmsg, cause=ve)
 906             else:
 907                 self.report_warning(errmsg + str(ve))
 908
 909     def _download_json_handle(
 910             self, url_or_request, video_id, note='Downloading JSON metadata',
 911             errnote='Unable to download JSON metadata', transform_source=None,
 912             fatal=True, encoding=None, data=None, headers={}, query={},
 913             expected_status=None):
 914         """
 915         Return a tuple (JSON object, URL handle).
 916
 917         See _download_webpage docstring for arguments specification.
 918         """
 919         res = self._download_webpage_handle(
 920             url_or_request, video_id, note, errnote, fatal=fatal,
 921             encoding=encoding, data=data, headers=headers, query=query,
 922             expected_status=expected_status)
 923         if res is False:
 924             return res
 925         json_string, urlh = res
 926         return self._parse_json(
 927             json_string, video_id, transform_source=transform_source,
 928             fatal=fatal), urlh
 929
 930     def _download_json(
 931             self, url_or_request, video_id, note='Downloading JSON metadata',
 932             errnote='Unable to download JSON metadata', transform_source=None,
 933             fatal=True, encoding=None, data=None, headers={}, query={},
 934             expected_status=None):
 935         """
 936         Return the JSON object as a dict.
 937
 938         See _download_webpage docstring for arguments specification.
 939         """
 940         res = self._download_json_handle(
 941             url_or_request, video_id, note=note, errnote=errnote,
 942             transform_source=transform_source, fatal=fatal, encoding=encoding,
 943             data=data, headers=headers, query=query,
 944             expected_status=expected_status)
 945         return res if res is False else res[0]
 946
 947     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 948         if transform_source:
 949             json_string = transform_source(json_string)
 950         try:
 951             return json.loads(json_string)
 952         except ValueError as ve:
 953             errmsg = '%s: Failed to parse JSON ' % video_id
 954             if fatal:
 955                 raise ExtractorError(errmsg, cause=ve)
 956             else:
 957                 self.report_warning(errmsg + str(ve))
 958
 959     def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
 960         return self._parse_json(
 961             data[data.find('{'):data.rfind('}') + 1],
 962             video_id, transform_source, fatal)
 963
 964     def _download_socket_json_handle(
 965             self, url_or_request, video_id, note='Polling socket',
 966             errnote='Unable to poll socket', transform_source=None,
 967             fatal=True, encoding=None, data=None, headers={}, query={},
 968             expected_status=None):
 969         """
 970         Return a tuple (JSON object, URL handle).
 971
 972         See _download_webpage docstring for arguments specification.
 973         """
 974         res = self._download_webpage_handle(
 975             url_or_request, video_id, note, errnote, fatal=fatal,
 976             encoding=encoding, data=data, headers=headers, query=query,
 977             expected_status=expected_status)
 978         if res is False:
 979             return res
 980         webpage, urlh = res
 981         return self._parse_socket_response_as_json(
 982             webpage, video_id, transform_source=transform_source,
 983             fatal=fatal), urlh
 984
 985     def _download_socket_json(
 986             self, url_or_request, video_id, note='Polling socket',
 987             errnote='Unable to poll socket', transform_source=None,
 988             fatal=True, encoding=None, data=None, headers={}, query={},
 989             expected_status=None):
 990         """
 991         Return the JSON object as a dict.
 992
 993         See _download_webpage docstring for arguments specification.
 994         """
 995         res = self._download_socket_json_handle(
 996             url_or_request, video_id, note=note, errnote=errnote,
 997             transform_source=transform_source, fatal=fatal, encoding=encoding,
 998             data=data, headers=headers, query=query,
 999             expected_status=expected_status)
1000         return res if res is False else res[0]
1001
1002     def report_warning(self, msg, video_id=None, *args, **kwargs):
1003         idstr = '' if video_id is None else '%s: ' % video_id
1004         self._downloader.report_warning(
1005             '[%s] %s%s' % (self.IE_NAME, idstr, msg), *args, **kwargs)
1006
1007     def to_screen(self, msg, *args, **kwargs):
1008         """Print msg to screen, prefixing it with '[ie_name]'"""
1009         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1010
1011     def write_debug(self, msg, *args, **kwargs):
1012         self._downloader.write_debug('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1013
1014     def get_param(self, name, default=None, *args, **kwargs):
1015         if self._downloader:
1016             return self._downloader.params.get(name, default, *args, **kwargs)
1017         return default
1018
1019     def report_extraction(self, id_or_name):
1020         """Report information extraction."""
1021         self.to_screen('%s: Extracting information' % id_or_name)
1022
1023     def report_download_webpage(self, video_id):
1024         """Report webpage download."""
1025         self.to_screen('%s: Downloading webpage' % video_id)
1026
1027     def report_age_confirmation(self):
1028         """Report attempt to confirm age."""
1029         self.to_screen('Confirming age')
1030
1031     def report_login(self):
1032         """Report attempt to log in."""
1033         self.to_screen('Logging in')
1034
1035     def raise_login_required(
1036             self, msg='This video is only available for registered users',
1037             metadata_available=False, method='any'):
1038         if metadata_available and self.get_param('ignore_no_formats_error'):
1039             self.report_warning(msg)
1040         if method is not None:
1041             msg = '%s. %s' % (msg, self._LOGIN_HINTS[method])
1042         raise ExtractorError(msg, expected=True)
1043
1044     def raise_geo_restricted(
1045             self, msg='This video is not available from your location due to geo restriction',
1046             countries=None, metadata_available=False):
1047         if metadata_available and self.get_param('ignore_no_formats_error'):
1048             self.report_warning(msg)
1049         else:
1050             raise GeoRestrictedError(msg, countries=countries)
1051
1052     def raise_no_formats(self, msg, expected=False, video_id=None):
1053         if expected and self.get_param('ignore_no_formats_error'):
1054             self.report_warning(msg, video_id)
1055         else:
1056             raise ExtractorError(msg, expected=expected, video_id=video_id)
1057
1058     # Methods for following #608
1059     @staticmethod
1060     def url_result(url, ie=None, video_id=None, video_title=None):
1061         """Returns a URL that points to a page that should be processed"""
1062         # TODO: ie should be the class used for getting the info
1063         video_info = {'_type': 'url',
1064                       'url': url,
1065                       'ie_key': ie}
1066         if video_id is not None:
1067             video_info['id'] = video_id
1068         if video_title is not None:
1069             video_info['title'] = video_title
1070         return video_info
1071
1072     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
1073         urls = orderedSet(
1074             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
1075             for m in matches)
1076         return self.playlist_result(
1077             urls, playlist_id=playlist_id, playlist_title=playlist_title)
1078
1079     @staticmethod
1080     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
1081         """Returns a playlist"""
1082         video_info = {'_type': 'playlist',
1083                       'entries': entries}
1084         video_info.update(kwargs)
1085         if playlist_id:
1086             video_info['id'] = playlist_id
1087         if playlist_title:
1088             video_info['title'] = playlist_title
1089         if playlist_description is not None:
1090             video_info['description'] = playlist_description
1091         return video_info
1092
1093     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1094         """
1095         Perform a regex search on the given string, using a single or a list of
1096         patterns returning the first matching group.
1097         In case of failure return a default value or raise a WARNING or a
1098         RegexNotFoundError, depending on fatal, specifying the field name.
1099         """
1100         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
1101             mobj = re.search(pattern, string, flags)
1102         else:
1103             for p in pattern:
1104                 mobj = re.search(p, string, flags)
1105                 if mobj:
1106                     break
1107
1108         if not self.get_param('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
1109             _name = '\033[0;34m%s\033[0m' % name
1110         else:
1111             _name = name
1112
1113         if mobj:
1114             if group is None:
1115                 # return the first matching group
1116                 return next(g for g in mobj.groups() if g is not None)
1117             elif isinstance(group, (list, tuple)):
1118                 return tuple(mobj.group(g) for g in group)
1119             else:
1120                 return mobj.group(group)
1121         elif default is not NO_DEFAULT:
1122             return default
1123         elif fatal:
1124             raise RegexNotFoundError('Unable to extract %s' % _name)
1125         else:
1126             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1127             return None
1128
1129     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1130         """
1131         Like _search_regex, but strips HTML tags and unescapes entities.
1132         """
1133         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1134         if res:
1135             return clean_html(res).strip()
1136         else:
1137             return res
1138
1139     def _get_netrc_login_info(self, netrc_machine=None):
1140         username = None
1141         password = None
1142         netrc_machine = netrc_machine or self._NETRC_MACHINE
1143
1144         if self.get_param('usenetrc', False):
1145             try:
1146                 info = netrc.netrc().authenticators(netrc_machine)
1147                 if info is not None:
1148                     username = info[0]
1149                     password = info[2]
1150                 else:
1151                     raise netrc.NetrcParseError(
1152                         'No authenticators for %s' % netrc_machine)
1153             except (IOError, netrc.NetrcParseError) as err:
1154                 self.report_warning(
1155                     'parsing .netrc: %s' % error_to_compat_str(err))
1156
1157         return username, password
1158
1159     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1160         """
1161         Get the login info as (username, password)
1162         First look for the manually specified credentials using username_option
1163         and password_option as keys in params dictionary. If no such credentials
1164         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1165         value.
1166         If there's no info available, return (None, None)
1167         """
1168
1169         # Attempt to use provided username and password or .netrc data
1170         username = self.get_param(username_option)
1171         if username is not None:
1172             password = self.get_param(password_option)
1173         else:
1174             username, password = self._get_netrc_login_info(netrc_machine)
1175
1176         return username, password
1177
1178     def _get_tfa_info(self, note='two-factor verification code'):
1179         """
1180         Get the two-factor authentication info
1181         TODO - asking the user will be required for sms/phone verify
1182         currently just uses the command line option
1183         If there's no info available, return None
1184         """
1185
1186         tfa = self.get_param('twofactor')
1187         if tfa is not None:
1188             return tfa
1189
1190         return compat_getpass('Type %s and press [Return]: ' % note)
1191
1192     # Helper functions for extracting OpenGraph info
1193     @staticmethod
1194     def _og_regexes(prop):
1195         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1196         property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
1197                        % {'prop': re.escape(prop)})
1198         template = r'<meta[^>]+?%s[^>]+?%s'
1199         return [
1200             template % (property_re, content_re),
1201             template % (content_re, property_re),
1202         ]
1203
1204     @staticmethod
1205     def _meta_regex(prop):
1206         return r'''(?isx)<meta
1207                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1208                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1209
1210     def _og_search_property(self, prop, html, name=None, **kargs):
1211         prop = variadic(prop)
1212         if name is None:
1213             name = 'OpenGraph %s' % prop[0]
1214         og_regexes = []
1215         for p in prop:
1216             og_regexes.extend(self._og_regexes(p))
1217         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1218         if escaped is None:
1219             return None
1220         return unescapeHTML(escaped)
1221
1222     def _og_search_thumbnail(self, html, **kargs):
1223         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1224
1225     def _og_search_description(self, html, **kargs):
1226         return self._og_search_property('description', html, fatal=False, **kargs)
1227
1228     def _og_search_title(self, html, **kargs):
1229         return self._og_search_property('title', html, **kargs)
1230
1231     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1232         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1233         if secure:
1234             regexes = self._og_regexes('video:secure_url') + regexes
1235         return self._html_search_regex(regexes, html, name, **kargs)
1236
1237     def _og_search_url(self, html, **kargs):
1238         return self._og_search_property('url', html, **kargs)
1239
1240     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1241         name = variadic(name)
1242         if display_name is None:
1243             display_name = name[0]
1244         return self._html_search_regex(
1245             [self._meta_regex(n) for n in name],
1246             html, display_name, fatal=fatal, group='content', **kwargs)
1247
1248     def _dc_search_uploader(self, html):
1249         return self._html_search_meta('dc.creator', html, 'uploader')
1250
1251     def _rta_search(self, html):
1252         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1253         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1254                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1255                      html):
1256             return 18
1257         return 0
1258
1259     def _media_rating_search(self, html):
1260         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1261         rating = self._html_search_meta('rating', html)
1262
1263         if not rating:
1264             return None
1265
1266         RATING_TABLE = {
1267             'safe for kids': 0,
1268             'general': 8,
1269             '14 years': 14,
1270             'mature': 17,
1271             'restricted': 19,
1272         }
1273         return RATING_TABLE.get(rating.lower())
1274
1275     def _family_friendly_search(self, html):
1276         # See http://schema.org/VideoObject
1277         family_friendly = self._html_search_meta(
1278             'isFamilyFriendly', html, default=None)
1279
1280         if not family_friendly:
1281             return None
1282
1283         RATING_TABLE = {
1284             '1': 0,
1285             'true': 0,
1286             '0': 18,
1287             'false': 18,
1288         }
1289         return RATING_TABLE.get(family_friendly.lower())
1290
1291     def _twitter_search_player(self, html):
1292         return self._html_search_meta('twitter:player', html,
1293                                       'twitter card player')
1294
1295     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1296         json_ld_list = list(re.finditer(JSON_LD_RE, html))
1297         default = kwargs.get('default', NO_DEFAULT)
1298         # JSON-LD may be malformed and thus `fatal` should be respected.
1299         # At the same time `default` may be passed that assumes `fatal=False`
1300         # for _search_regex. Let's simulate the same behavior here as well.
1301         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
1302         json_ld = []
1303         for mobj in json_ld_list:
1304             json_ld_item = self._parse_json(
1305                 mobj.group('json_ld'), video_id, fatal=fatal)
1306             if not json_ld_item:
1307                 continue
1308             if isinstance(json_ld_item, dict):
1309                 json_ld.append(json_ld_item)
1310             elif isinstance(json_ld_item, (list, tuple)):
1311                 json_ld.extend(json_ld_item)
1312         if json_ld:
1313             json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1314         if json_ld:
1315             return json_ld
1316         if default is not NO_DEFAULT:
1317             return default
1318         elif fatal:
1319             raise RegexNotFoundError('Unable to extract JSON-LD')
1320         else:
1321             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1322             return {}
1323
1324     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1325         if isinstance(json_ld, compat_str):
1326             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1327         if not json_ld:
1328             return {}
1329         info = {}
1330         if not isinstance(json_ld, (list, tuple, dict)):
1331             return info
1332         if isinstance(json_ld, dict):
1333             json_ld = [json_ld]
1334
1335         INTERACTION_TYPE_MAP = {
1336             'CommentAction': 'comment',
1337             'AgreeAction': 'like',
1338             'DisagreeAction': 'dislike',
1339             'LikeAction': 'like',
1340             'DislikeAction': 'dislike',
1341             'ListenAction': 'view',
1342             'WatchAction': 'view',
1343             'ViewAction': 'view',
1344         }
1345
1346         def extract_interaction_type(e):
1347             interaction_type = e.get('interactionType')
1348             if isinstance(interaction_type, dict):
1349                 interaction_type = interaction_type.get('@type')
1350             return str_or_none(interaction_type)
1351
1352         def extract_interaction_statistic(e):
1353             interaction_statistic = e.get('interactionStatistic')
1354             if isinstance(interaction_statistic, dict):
1355                 interaction_statistic = [interaction_statistic]
1356             if not isinstance(interaction_statistic, list):
1357                 return
1358             for is_e in interaction_statistic:
1359                 if not isinstance(is_e, dict):
1360                     continue
1361                 if is_e.get('@type') != 'InteractionCounter':
1362                     continue
1363                 interaction_type = extract_interaction_type(is_e)
1364                 if not interaction_type:
1365                     continue
1366                 # For interaction count some sites provide string instead of
1367                 # an integer (as per spec) with non digit characters (e.g. ",")
1368                 # so extracting count with more relaxed str_to_int
1369                 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1370                 if interaction_count is None:
1371                     continue
1372                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1373                 if not count_kind:
1374                     continue
1375                 count_key = '%s_count' % count_kind
1376                 if info.get(count_key) is not None:
1377                     continue
1378                 info[count_key] = interaction_count
1379
1380         def extract_video_object(e):
1381             assert e['@type'] == 'VideoObject'
1382             author = e.get('author')
1383             info.update({
1384                 'url': url_or_none(e.get('contentUrl')),
1385                 'title': unescapeHTML(e.get('name')),
1386                 'description': unescapeHTML(e.get('description')),
1387                 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
1388                 'duration': parse_duration(e.get('duration')),
1389                 'timestamp': unified_timestamp(e.get('uploadDate')),
1390                 # author can be an instance of 'Organization' or 'Person' types.
1391                 # both types can have 'name' property(inherited from 'Thing' type). [1]
1392                 # however some websites are using 'Text' type instead.
1393                 # 1. https://schema.org/VideoObject
1394                 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
1395                 'filesize': float_or_none(e.get('contentSize')),
1396                 'tbr': int_or_none(e.get('bitrate')),
1397                 'width': int_or_none(e.get('width')),
1398                 'height': int_or_none(e.get('height')),
1399                 'view_count': int_or_none(e.get('interactionCount')),
1400             })
1401             extract_interaction_statistic(e)
1402
1403         for e in json_ld:
1404             if '@context' in e:
1405                 item_type = e.get('@type')
1406                 if expected_type is not None and expected_type != item_type:
1407                     continue
1408                 if item_type in ('TVEpisode', 'Episode'):
1409                     episode_name = unescapeHTML(e.get('name'))
1410                     info.update({
1411                         'episode': episode_name,
1412                         'episode_number': int_or_none(e.get('episodeNumber')),
1413                         'description': unescapeHTML(e.get('description')),
1414                     })
1415                     if not info.get('title') and episode_name:
1416                         info['title'] = episode_name
1417                     part_of_season = e.get('partOfSeason')
1418                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1419                         info.update({
1420                             'season': unescapeHTML(part_of_season.get('name')),
1421                             'season_number': int_or_none(part_of_season.get('seasonNumber')),
1422                         })
1423                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1424                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1425                         info['series'] = unescapeHTML(part_of_series.get('name'))
1426                 elif item_type == 'Movie':
1427                     info.update({
1428                         'title': unescapeHTML(e.get('name')),
1429                         'description': unescapeHTML(e.get('description')),
1430                         'duration': parse_duration(e.get('duration')),
1431                         'timestamp': unified_timestamp(e.get('dateCreated')),
1432                     })
1433                 elif item_type in ('Article', 'NewsArticle'):
1434                     info.update({
1435                         'timestamp': parse_iso8601(e.get('datePublished')),
1436                         'title': unescapeHTML(e.get('headline')),
1437                         'description': unescapeHTML(e.get('articleBody')),
1438                     })
1439                 elif item_type == 'VideoObject':
1440                     extract_video_object(e)
1441                     if expected_type is None:
1442                         continue
1443                     else:
1444                         break
1445                 video = e.get('video')
1446                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1447                     extract_video_object(video)
1448                 if expected_type is None:
1449                     continue
1450                 else:
1451                     break
1452         return dict((k, v) for k, v in info.items() if v is not None)
1453
1454     @staticmethod
1455     def _hidden_inputs(html):
1456         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1457         hidden_inputs = {}
1458         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1459             attrs = extract_attributes(input)
1460             if not input:
1461                 continue
1462             if attrs.get('type') not in ('hidden', 'submit'):
1463                 continue
1464             name = attrs.get('name') or attrs.get('id')
1465             value = attrs.get('value')
1466             if name and value is not None:
1467                 hidden_inputs[name] = value
1468         return hidden_inputs
1469
1470     def _form_hidden_inputs(self, form_id, html):
1471         form = self._search_regex(
1472             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1473             html, '%s form' % form_id, group='form')
1474         return self._hidden_inputs(form)
1475
1476     class FormatSort:
1477         regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
1478
1479         default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
1480                    'res', 'fps', 'codec:vp9.2', 'size', 'br', 'asr',
1481                    'proto', 'ext', 'hasaud', 'source', 'format_id')  # These must not be aliases
1482         ytdl_default = ('hasaud', 'quality', 'tbr', 'filesize', 'vbr',
1483                         'height', 'width', 'proto', 'vext', 'abr', 'aext',
1484                         'fps', 'fs_approx', 'source', 'format_id')
1485
1486         settings = {
1487             'vcodec': {'type': 'ordered', 'regex': True,
1488                        'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1489             'acodec': {'type': 'ordered', 'regex': True,
1490                        'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']},
1491             'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1492                       'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', '.*dash', 'ws|websocket', '', 'mms|rtsp', 'none', 'f4']},
1493             'vext': {'type': 'ordered', 'field': 'video_ext',
1494                      'order': ('mp4', 'webm', 'flv', '', 'none'),
1495                      'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1496             'aext': {'type': 'ordered', 'field': 'audio_ext',
1497                      'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1498                      'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1499             'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1500             'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple', 'default': 1,
1501                            'field': ('vcodec', 'acodec'),
1502                            'function': lambda it: int(any(v != 'none' for v in it))},
1503             'ie_pref': {'priority': True, 'type': 'extractor'},
1504             'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1505             'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1506             'lang': {'priority': True, 'convert': 'ignore', 'field': 'language_preference'},
1507             'quality': {'convert': 'float_none', 'default': -1},
1508             'filesize': {'convert': 'bytes'},
1509             'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1510             'id': {'convert': 'string', 'field': 'format_id'},
1511             'height': {'convert': 'float_none'},
1512             'width': {'convert': 'float_none'},
1513             'fps': {'convert': 'float_none'},
1514             'tbr': {'convert': 'float_none'},
1515             'vbr': {'convert': 'float_none'},
1516             'abr': {'convert': 'float_none'},
1517             'asr': {'convert': 'float_none'},
1518             'source': {'convert': 'ignore', 'field': 'source_preference'},
1519
1520             'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1521             'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1522             'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1523             'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1524             'res': {'type': 'multiple', 'field': ('height', 'width'), 'function': min},
1525
1526             # Most of these exist only for compatibility reasons
1527             'dimension': {'type': 'alias', 'field': 'res'},
1528             'resolution': {'type': 'alias', 'field': 'res'},
1529             'extension': {'type': 'alias', 'field': 'ext'},
1530             'bitrate': {'type': 'alias', 'field': 'br'},
1531             'total_bitrate': {'type': 'alias', 'field': 'tbr'},
1532             'video_bitrate': {'type': 'alias', 'field': 'vbr'},
1533             'audio_bitrate': {'type': 'alias', 'field': 'abr'},
1534             'framerate': {'type': 'alias', 'field': 'fps'},
1535             'language_preference': {'type': 'alias', 'field': 'lang'},  # not named as 'language' because such a field exists
1536             'protocol': {'type': 'alias', 'field': 'proto'},
1537             'source_preference': {'type': 'alias', 'field': 'source'},
1538             'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1539             'filesize_estimate': {'type': 'alias', 'field': 'size'},
1540             'samplerate': {'type': 'alias', 'field': 'asr'},
1541             'video_ext': {'type': 'alias', 'field': 'vext'},
1542             'audio_ext': {'type': 'alias', 'field': 'aext'},
1543             'video_codec': {'type': 'alias', 'field': 'vcodec'},
1544             'audio_codec': {'type': 'alias', 'field': 'acodec'},
1545             'video': {'type': 'alias', 'field': 'hasvid'},
1546             'has_video': {'type': 'alias', 'field': 'hasvid'},
1547             'audio': {'type': 'alias', 'field': 'hasaud'},
1548             'has_audio': {'type': 'alias', 'field': 'hasaud'},
1549             'extractor': {'type': 'alias', 'field': 'ie_pref'},
1550             'preference': {'type': 'alias', 'field': 'ie_pref'},
1551             'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
1552             'format_id': {'type': 'alias', 'field': 'id'},
1553         }
1554
1555         _order = []
1556
1557         def _get_field_setting(self, field, key):
1558             if field not in self.settings:
1559                 self.settings[field] = {}
1560             propObj = self.settings[field]
1561             if key not in propObj:
1562                 type = propObj.get('type')
1563                 if key == 'field':
1564                     default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1565                 elif key == 'convert':
1566                     default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1567                 else:
1568                     default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,), 'function': max}.get(key, None)
1569                 propObj[key] = default
1570             return propObj[key]
1571
1572         def _resolve_field_value(self, field, value, convertNone=False):
1573             if value is None:
1574                 if not convertNone:
1575                     return None
1576             else:
1577                 value = value.lower()
1578             conversion = self._get_field_setting(field, 'convert')
1579             if conversion == 'ignore':
1580                 return None
1581             if conversion == 'string':
1582                 return value
1583             elif conversion == 'float_none':
1584                 return float_or_none(value)
1585             elif conversion == 'bytes':
1586                 return FileDownloader.parse_bytes(value)
1587             elif conversion == 'order':
1588                 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1589                 use_regex = self._get_field_setting(field, 'regex')
1590                 list_length = len(order_list)
1591                 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1592                 if use_regex and value is not None:
1593                     for i, regex in enumerate(order_list):
1594                         if regex and re.match(regex, value):
1595                             return list_length - i
1596                     return list_length - empty_pos  # not in list
1597                 else:  # not regex or  value = None
1598                     return list_length - (order_list.index(value) if value in order_list else empty_pos)
1599             else:
1600                 if value.isnumeric():
1601                     return float(value)
1602                 else:
1603                     self.settings[field]['convert'] = 'string'
1604                     return value
1605
1606         def evaluate_params(self, params, sort_extractor):
1607             self._use_free_order = params.get('prefer_free_formats', False)
1608             self._sort_user = params.get('format_sort', [])
1609             self._sort_extractor = sort_extractor
1610
1611             def add_item(field, reverse, closest, limit_text):
1612                 field = field.lower()
1613                 if field in self._order:
1614                     return
1615                 self._order.append(field)
1616                 limit = self._resolve_field_value(field, limit_text)
1617                 data = {
1618                     'reverse': reverse,
1619                     'closest': False if limit is None else closest,
1620                     'limit_text': limit_text,
1621                     'limit': limit}
1622                 if field in self.settings:
1623                     self.settings[field].update(data)
1624                 else:
1625                     self.settings[field] = data
1626
1627             sort_list = (
1628                 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1629                 + (tuple() if params.get('format_sort_force', False)
1630                    else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1631                 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1632
1633             for item in sort_list:
1634                 match = re.match(self.regex, item)
1635                 if match is None:
1636                     raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1637                 field = match.group('field')
1638                 if field is None:
1639                     continue
1640                 if self._get_field_setting(field, 'type') == 'alias':
1641                     field = self._get_field_setting(field, 'field')
1642                 reverse = match.group('reverse') is not None
1643                 closest = match.group('separator') == '~'
1644                 limit_text = match.group('limit')
1645
1646                 has_limit = limit_text is not None
1647                 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1648                 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1649
1650                 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1651                 limits = limit_text.split(":") if has_multiple_limits else (limit_text,) if has_limit else tuple()
1652                 limit_count = len(limits)
1653                 for (i, f) in enumerate(fields):
1654                     add_item(f, reverse, closest,
1655                              limits[i] if i < limit_count
1656                              else limits[0] if has_limit and not has_multiple_limits
1657                              else None)
1658
1659         def print_verbose_info(self, write_debug):
1660             if self._sort_user:
1661                 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1662             if self._sort_extractor:
1663                 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1664             write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1665                 '+' if self._get_field_setting(field, 'reverse') else '', field,
1666                 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1667                               self._get_field_setting(field, 'limit_text'),
1668                               self._get_field_setting(field, 'limit'))
1669                 if self._get_field_setting(field, 'limit_text') is not None else '')
1670                 for field in self._order if self._get_field_setting(field, 'visible')]))
1671
1672         def _calculate_field_preference_from_value(self, format, field, type, value):
1673             reverse = self._get_field_setting(field, 'reverse')
1674             closest = self._get_field_setting(field, 'closest')
1675             limit = self._get_field_setting(field, 'limit')
1676
1677             if type == 'extractor':
1678                 maximum = self._get_field_setting(field, 'max')
1679                 if value is None or (maximum is not None and value >= maximum):
1680                     value = -1
1681             elif type == 'boolean':
1682                 in_list = self._get_field_setting(field, 'in_list')
1683                 not_in_list = self._get_field_setting(field, 'not_in_list')
1684                 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1685             elif type == 'ordered':
1686                 value = self._resolve_field_value(field, value, True)
1687
1688             # try to convert to number
1689             val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1690             is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1691             if is_num:
1692                 value = val_num
1693
1694             return ((-10, 0) if value is None
1695                     else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
1696                     else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1697                     else (0, value, 0) if not reverse and (limit is None or value <= limit)
1698                     else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1699                     else (-1, value, 0))
1700
1701         def _calculate_field_preference(self, format, field):
1702             type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
1703             get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1704             if type == 'multiple':
1705                 type = 'field'  # Only 'field' is allowed in multiple for now
1706                 actual_fields = self._get_field_setting(field, 'field')
1707
1708                 def wrapped_function(values):
1709                     values = tuple(filter(lambda x: x is not None, values))
1710                     return self._get_field_setting(field, 'function')(values) if values else None
1711
1712                 value = wrapped_function((get_value(f) for f in actual_fields))
1713             else:
1714                 value = get_value(field)
1715             return self._calculate_field_preference_from_value(format, field, type, value)
1716
1717         def calculate_preference(self, format):
1718             # Determine missing protocol
1719             if not format.get('protocol'):
1720                 format['protocol'] = determine_protocol(format)
1721
1722             # Determine missing ext
1723             if not format.get('ext') and 'url' in format:
1724                 format['ext'] = determine_ext(format['url'])
1725             if format.get('vcodec') == 'none':
1726                 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1727                 format['video_ext'] = 'none'
1728             else:
1729                 format['video_ext'] = format['ext']
1730                 format['audio_ext'] = 'none'
1731             # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
1732             #    format['preference'] = -1000
1733
1734             # Determine missing bitrates
1735             if format.get('tbr') is None:
1736                 if format.get('vbr') is not None and format.get('abr') is not None:
1737                     format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1738             else:
1739                 if format.get('vcodec') != "none" and format.get('vbr') is None:
1740                     format['vbr'] = format.get('tbr') - format.get('abr', 0)
1741                 if format.get('acodec') != "none" and format.get('abr') is None:
1742                     format['abr'] = format.get('tbr') - format.get('vbr', 0)
1743
1744             return tuple(self._calculate_field_preference(format, field) for field in self._order)
1745
1746     def _sort_formats(self, formats, field_preference=[]):
1747         if not formats:
1748             if self.get_param('ignore_no_formats_error'):
1749                 return
1750             raise ExtractorError('No video formats found')
1751         format_sort = self.FormatSort()  # params and to_screen are taken from the downloader
1752         format_sort.evaluate_params(self._downloader.params, field_preference)
1753         if self.get_param('verbose', False):
1754             format_sort.print_verbose_info(self._downloader.write_debug)
1755         formats.sort(key=lambda f: format_sort.calculate_preference(f))
1756
1757     def _check_formats(self, formats, video_id):
1758         if formats:
1759             formats[:] = filter(
1760                 lambda f: self._is_valid_url(
1761                     f['url'], video_id,
1762                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1763                 formats)
1764
1765     @staticmethod
1766     def _remove_duplicate_formats(formats):
1767         format_urls = set()
1768         unique_formats = []
1769         for f in formats:
1770             if f['url'] not in format_urls:
1771                 format_urls.add(f['url'])
1772                 unique_formats.append(f)
1773         formats[:] = unique_formats
1774
1775     def _is_valid_url(self, url, video_id, item='video', headers={}):
1776         url = self._proto_relative_url(url, scheme='http:')
1777         # For now assume non HTTP(S) URLs always valid
1778         if not (url.startswith('http://') or url.startswith('https://')):
1779             return True
1780         try:
1781             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1782             return True
1783         except ExtractorError as e:
1784             self.to_screen(
1785                 '%s: %s URL is invalid, skipping: %s'
1786                 % (video_id, item, error_to_compat_str(e.cause)))
1787             return False
1788
1789     def http_scheme(self):
1790         """ Either "http:" or "https:", depending on the user's preferences """
1791         return (
1792             'http:'
1793             if self.get_param('prefer_insecure', False)
1794             else 'https:')
1795
1796     def _proto_relative_url(self, url, scheme=None):
1797         if url is None:
1798             return url
1799         if url.startswith('//'):
1800             if scheme is None:
1801                 scheme = self.http_scheme()
1802             return scheme + url
1803         else:
1804             return url
1805
1806     def _sleep(self, timeout, video_id, msg_template=None):
1807         if msg_template is None:
1808             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1809         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1810         self.to_screen(msg)
1811         time.sleep(timeout)
1812
1813     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1814                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1815                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1816         manifest = self._download_xml(
1817             manifest_url, video_id, 'Downloading f4m manifest',
1818             'Unable to download f4m manifest',
1819             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1820             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1821             transform_source=transform_source,
1822             fatal=fatal, data=data, headers=headers, query=query)
1823
1824         if manifest is False:
1825             return []
1826
1827         return self._parse_f4m_formats(
1828             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1829             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1830
1831     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1832                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1833                            fatal=True, m3u8_id=None):
1834         if not isinstance(manifest, compat_etree_Element) and not fatal:
1835             return []
1836
1837         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1838         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1839         if akamai_pv is not None and ';' in akamai_pv.text:
1840             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1841             if playerVerificationChallenge.strip() != '':
1842                 return []
1843
1844         formats = []
1845         manifest_version = '1.0'
1846         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1847         if not media_nodes:
1848             manifest_version = '2.0'
1849             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1850         # Remove unsupported DRM protected media from final formats
1851         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1852         media_nodes = remove_encrypted_media(media_nodes)
1853         if not media_nodes:
1854             return formats
1855
1856         manifest_base_url = get_base_url(manifest)
1857
1858         bootstrap_info = xpath_element(
1859             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1860             'bootstrap info', default=None)
1861
1862         vcodec = None
1863         mime_type = xpath_text(
1864             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1865             'base URL', default=None)
1866         if mime_type and mime_type.startswith('audio/'):
1867             vcodec = 'none'
1868
1869         for i, media_el in enumerate(media_nodes):
1870             tbr = int_or_none(media_el.attrib.get('bitrate'))
1871             width = int_or_none(media_el.attrib.get('width'))
1872             height = int_or_none(media_el.attrib.get('height'))
1873             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1874             # If <bootstrapInfo> is present, the specified f4m is a
1875             # stream-level manifest, and only set-level manifests may refer to
1876             # external resources.  See section 11.4 and section 4 of F4M spec
1877             if bootstrap_info is None:
1878                 media_url = None
1879                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1880                 if manifest_version == '2.0':
1881                     media_url = media_el.attrib.get('href')
1882                 if media_url is None:
1883                     media_url = media_el.attrib.get('url')
1884                 if not media_url:
1885                     continue
1886                 manifest_url = (
1887                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1888                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1889                 # If media_url is itself a f4m manifest do the recursive extraction
1890                 # since bitrates in parent manifest (this one) and media_url manifest
1891                 # may differ leading to inability to resolve the format by requested
1892                 # bitrate in f4m downloader
1893                 ext = determine_ext(manifest_url)
1894                 if ext == 'f4m':
1895                     f4m_formats = self._extract_f4m_formats(
1896                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1897                         transform_source=transform_source, fatal=fatal)
1898                     # Sometimes stream-level manifest contains single media entry that
1899                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1900                     # At the same time parent's media entry in set-level manifest may
1901                     # contain it. We will copy it from parent in such cases.
1902                     if len(f4m_formats) == 1:
1903                         f = f4m_formats[0]
1904                         f.update({
1905                             'tbr': f.get('tbr') or tbr,
1906                             'width': f.get('width') or width,
1907                             'height': f.get('height') or height,
1908                             'format_id': f.get('format_id') if not tbr else format_id,
1909                             'vcodec': vcodec,
1910                         })
1911                     formats.extend(f4m_formats)
1912                     continue
1913                 elif ext == 'm3u8':
1914                     formats.extend(self._extract_m3u8_formats(
1915                         manifest_url, video_id, 'mp4', preference=preference,
1916                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1917                     continue
1918             formats.append({
1919                 'format_id': format_id,
1920                 'url': manifest_url,
1921                 'manifest_url': manifest_url,
1922                 'ext': 'flv' if bootstrap_info is not None else None,
1923                 'protocol': 'f4m',
1924                 'tbr': tbr,
1925                 'width': width,
1926                 'height': height,
1927                 'vcodec': vcodec,
1928                 'preference': preference,
1929                 'quality': quality,
1930             })
1931         return formats
1932
1933     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1934         return {
1935             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1936             'url': m3u8_url,
1937             'ext': ext,
1938             'protocol': 'm3u8',
1939             'preference': preference - 100 if preference else -100,
1940             'quality': quality,
1941             'resolution': 'multiple',
1942             'format_note': 'Quality selection URL',
1943         }
1944
1945     def _extract_m3u8_formats(self, *args, **kwargs):
1946         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
1947         if subs:
1948             self.report_warning(bug_reports_message(
1949                 "Ignoring subtitle tracks found in the HLS manifest; "
1950                 "if any subtitle tracks are missing,"
1951             ))
1952         return fmts
1953
1954     def _extract_m3u8_formats_and_subtitles(
1955             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
1956             preference=None, quality=None, m3u8_id=None, note=None,
1957             errnote=None, fatal=True, live=False, data=None, headers={},
1958             query={}):
1959
1960         res = self._download_webpage_handle(
1961             m3u8_url, video_id,
1962             note='Downloading m3u8 information' if note is None else note,
1963             errnote='Failed to download m3u8 information' if errnote is None else errnote,
1964             fatal=fatal, data=data, headers=headers, query=query)
1965
1966         if res is False:
1967             return [], {}
1968
1969         m3u8_doc, urlh = res
1970         m3u8_url = urlh.geturl()
1971
1972         return self._parse_m3u8_formats_and_subtitles(
1973             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1974             preference=preference, quality=quality, m3u8_id=m3u8_id,
1975             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
1976             headers=headers, query=query, video_id=video_id)
1977
    def _parse_m3u8_formats_and_subtitles(
            self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8_native',
            preference=None, quality=None, m3u8_id=None, live=False, note=None,
            errnote=None, fatal=True, data=None, headers={}, query={},
            video_id=None):
        """
        Parse the text of an m3u8 playlist into formats and subtitles.

        m3u8_doc is the playlist text and m3u8_url the URL used to resolve
        relative URIs found in it. ext, entry_protocol, preference and quality
        are copied into every produced format; m3u8_id prefixes the format ids.
        When live is true the format id is not derived from the stream name or
        bitrate. video_id, fatal, data and headers are only used when the
        'hls_split_discontinuity' param requires downloading variant playlists.
        note, errnote and query are accepted but not used in this method.

        Returns a (formats, subtitles) pair: a list of format dicts and a dict
        mapping language to a list of subtitle dicts. Both are empty when the
        playlist carries the Adobe Flash Access marker, or an Apple FairPlay
        session key while the 'allow_unplayable_formats' param is not set.
        """
        formats, subtitles = [], {}

        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
            return formats, subtitles

        if (not self.get_param('allow_unplayable_formats')
                and re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc)):  # Apple FairPlay
            return formats, subtitles

        # Resolve a URI from the playlist against the playlist URL
        def format_url(url):
            return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)

        # _extract_m3u8_playlist_indices yields one index per format to emit
        # for a playlist: with 'hls_split_discontinuity' one index for every
        # part delimited by #EXT-X-DISCONTINUITY lines (downloading the
        # playlist if its text is not given), otherwise the single index None
        if self.get_param('hls_split_discontinuity', False):
            def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
                if not m3u8_doc:
                    if not manifest_url:
                        return []
                    m3u8_doc = self._download_webpage(
                        manifest_url, video_id, fatal=fatal, data=data, headers=headers,
                        note=False, errnote='Failed to download m3u8 playlist information')
                    if m3u8_doc is False:
                        return []
                return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))

        else:
            def _extract_m3u8_playlist_indices(*args, **kwargs):
                return [None]

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
        # 3. https://github.com/ytdl-org/youtube-dl/issues/18923

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            formats = [{
                'format_id': '-'.join(map(str, filter(None, [m3u8_id, idx]))),
                'format_index': idx,
                'url': m3u8_url,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
                'quality': quality,
            } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]

            return formats, subtitles

        # groups maps GROUP-ID to the list of parsed EXT-X-MEDIA attribute dicts;
        # last_stream_inf holds the attributes of the most recent
        # EXT-X-STREAM-INF line and is read by build_stream_name() below
        groups = {}
        last_stream_inf = {}

        # Handle a single EXT-X-MEDIA line: register it in its rendition group
        # and add a subtitle entry or audio/video formats where applicable
        def extract_media(x_media_line):
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
            if media_type == 'SUBTITLES':
                # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
                # EXT-X-MEDIA tag if the media type is SUBTITLES.
                # However, lack of URI has been spotted in the wild.
                # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
                if not media.get('URI'):
                    return
                url = format_url(media['URI'])
                sub_info = {
                    'url': url,
                    'ext': determine_ext(url),
                }
                if sub_info['ext'] == 'm3u8':
                    # Per RFC 8216 §3.1, the only possible subtitle format m3u8
                    # files may contain is WebVTT:
                    # <https://tools.ietf.org/html/rfc8216#section-3.1>
                    sub_info['ext'] = 'vtt'
                    sub_info['protocol'] = 'm3u8_native'
                lang = media.get('LANGUAGE') or 'und'
                subtitles.setdefault(lang, []).append(sub_info)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            # Renditions without a URI produce no format of their own
            media_url = media.get('URI')
            if media_url:
                manifest_url = format_url(media_url)
                formats.extend({
                    'format_id': '-'.join(map(str, filter(None, (m3u8_id, group_id, name, idx)))),
                    'format_note': name,
                    'format_index': idx,
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                    'quality': quality,
                    'vcodec': 'none' if media_type == 'AUDIO' else None,
                } for idx in _extract_m3u8_playlist_indices(manifest_url))

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
        # chance to detect video only formats when EXT-X-STREAM-INF tags
        # precede EXT-X-MEDIA tags in HLS manifest such as [3].
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)

        # Second pass: every non-comment, non-blank line is a variant URI
        # described by the EXT-X-STREAM-INF attributes seen before it
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH')
                    or last_stream_inf.get('BANDWIDTH'), scale=1000)
                manifest_url = format_url(line.strip())

                for idx in _extract_m3u8_playlist_indices(manifest_url):
                    format_id = [m3u8_id, None, idx]
                    # Bandwidth of live streams may differ over time thus making
                    # format_id unpredictable. So it's better to keep provided
                    # format_id intact.
                    if not live:
                        stream_name = build_stream_name()
                        format_id[1] = stream_name if stream_name else '%d' % (tbr if tbr else len(formats))
                    f = {
                        'format_id': '-'.join(map(str, filter(None, format_id))),
                        'format_index': idx,
                        'url': manifest_url,
                        'manifest_url': m3u8_url,
                        'tbr': tbr,
                        'ext': ext,
                        'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                        'protocol': entry_protocol,
                        'preference': preference,
                        'quality': quality,
                    }
                    resolution = last_stream_inf.get('RESOLUTION')
                    if resolution:
                        mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                        if mobj:
                            f['width'] = int(mobj.group('width'))
                            f['height'] = int(mobj.group('height'))
                    # Unified Streaming Platform
                    mobj = re.search(
                        r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                    if mobj:
                        abr, vbr = mobj.groups()
                        abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                        f.update({
                            'vbr': vbr,
                            'abr': abr,
                        })
                    codecs = parse_codecs(last_stream_inf.get('CODECS'))
                    f.update(codecs)
                    audio_group_id = last_stream_inf.get('AUDIO')
                    # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                    # references a rendition group MUST have a CODECS attribute.
                    # However, this is not always respected, for example, [2]
                    # contains EXT-X-STREAM-INF tag which references AUDIO
                    # rendition group but does not have CODECS and despite
                    # referencing an audio group it represents a complete
                    # (with audio and video) format. So, for such cases we will
                    # ignore references to rendition groups and treat them
                    # as complete formats.
                    if audio_group_id and codecs and f.get('vcodec') != 'none':
                        audio_group = groups.get(audio_group_id)
                        if audio_group and audio_group[0].get('URI'):
                            # TODO: update acodec for audio only formats with
                            # the same GROUP-ID
                            f['acodec'] = 'none'
                    # Fall back to an extension based on whether the format is audio only
                    if not f.get('ext'):
                        f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
                    formats.append(f)

                    # for DailyMotion
                    progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
                    if progressive_uri:
                        http_f = f.copy()
                        del http_f['manifest_url']
                        http_f.update({
                            'format_id': f['format_id'].replace('hls-', 'http-'),
                            'protocol': 'http',
                            'url': progressive_uri,
                        })
                        formats.append(http_f)

                # The attributes apply to a single URI line only
                last_stream_inf = {}
        return formats, subtitles
2199
2200     @staticmethod
2201     def _xpath_ns(path, namespace=None):
2202         if not namespace:
2203             return path
2204         out = []
2205         for c in path.split('/'):
2206             if not c or c == '.':
2207                 out.append(c)
2208             else:
2209                 out.append('{%s}%s' % (namespace, c))
2210         return '/'.join(out)
2211
2212     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2213         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2214
2215         if smil is False:
2216             assert not fatal
2217             return []
2218
2219         namespace = self._parse_smil_namespace(smil)
2220
2221         fmts = self._parse_smil_formats(
2222             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2223         subs = self._parse_smil_subtitles(
2224             smil, namespace=namespace)
2225
2226         return fmts, subs
2227
2228     def _extract_smil_formats(self, *args, **kwargs):
2229         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2230         if subs:
2231             self.report_warning(bug_reports_message(
2232                 "Ignoring subtitle tracks found in the SMIL manifest; "
2233                 "if any subtitle tracks are missing,"
2234             ))
2235         return fmts
2236
2237     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2238         smil = self._download_smil(smil_url, video_id, fatal=fatal)
2239         if smil is False:
2240             return {}
2241         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2242
2243     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2244         return self._download_xml(
2245             smil_url, video_id, 'Downloading SMIL file',
2246             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2247
2248     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2249         namespace = self._parse_smil_namespace(smil)
2250
2251         formats = self._parse_smil_formats(
2252             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2253         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2254
2255         video_id = os.path.splitext(url_basename(smil_url))[0]
2256         title = None
2257         description = None
2258         upload_date = None
2259         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2260             name = meta.attrib.get('name')
2261             content = meta.attrib.get('content')
2262             if not name or not content:
2263                 continue
2264             if not title and name == 'title':
2265                 title = content
2266             elif not description and name in ('description', 'abstract'):
2267                 description = content
2268             elif not upload_date and name == 'date':
2269                 upload_date = unified_strdate(content)
2270
2271         thumbnails = [{
2272             'id': image.get('type'),
2273             'url': image.get('src'),
2274             'width': int_or_none(image.get('width')),
2275             'height': int_or_none(image.get('height')),
2276         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2277
2278         return {
2279             'id': video_id,
2280             'title': title or video_id,
2281             'description': description,
2282             'upload_date': upload_date,
2283             'thumbnails': thumbnails,
2284             'formats': formats,
2285             'subtitles': subtitles,
2286         }
2287
2288     def _parse_smil_namespace(self, smil):
2289         return self._search_regex(
2290             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2291
2292     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2293         base = smil_url
2294         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2295             b = meta.get('base') or meta.get('httpBase')
2296             if b:
2297                 base = b
2298                 break
2299
2300         formats = []
2301         rtmp_count = 0
2302         http_count = 0
2303         m3u8_count = 0
2304
2305         srcs = []
2306         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2307         for medium in media:
2308             src = medium.get('src')
2309             if not src or src in srcs:
2310                 continue
2311             srcs.append(src)
2312
2313             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2314             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2315             width = int_or_none(medium.get('width'))
2316             height = int_or_none(medium.get('height'))
2317             proto = medium.get('proto')
2318             ext = medium.get('ext')
2319             src_ext = determine_ext(src)
2320             streamer = medium.get('streamer') or base
2321
2322             if proto == 'rtmp' or streamer.startswith('rtmp'):
2323                 rtmp_count += 1
2324                 formats.append({
2325                     'url': streamer,
2326                     'play_path': src,
2327                     'ext': 'flv',
2328                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2329                     'tbr': bitrate,
2330                     'filesize': filesize,
2331                     'width': width,
2332                     'height': height,
2333                 })
2334                 if transform_rtmp_url:
2335                     streamer, src = transform_rtmp_url(streamer, src)
2336                     formats[-1].update({
2337                         'url': streamer,
2338                         'play_path': src,
2339                     })
2340                 continue
2341
2342             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2343             src_url = src_url.strip()
2344
2345             if proto == 'm3u8' or src_ext == 'm3u8':
2346                 m3u8_formats = self._extract_m3u8_formats(
2347                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2348                 if len(m3u8_formats) == 1:
2349                     m3u8_count += 1
2350                     m3u8_formats[0].update({
2351                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2352                         'tbr': bitrate,
2353                         'width': width,
2354                         'height': height,
2355                     })
2356                 formats.extend(m3u8_formats)
2357             elif src_ext == 'f4m':
2358                 f4m_url = src_url
2359                 if not f4m_params:
2360                     f4m_params = {
2361                         'hdcore': '3.2.0',
2362                         'plugin': 'flowplayer-3.2.0.1',
2363                     }
2364                 f4m_url += '&' if '?' in f4m_url else '?'
2365                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2366                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2367             elif src_ext == 'mpd':
2368                 formats.extend(self._extract_mpd_formats(
2369                     src_url, video_id, mpd_id='dash', fatal=False))
2370             elif re.search(r'\.ism/[Mm]anifest', src_url):
2371                 formats.extend(self._extract_ism_formats(
2372                     src_url, video_id, ism_id='mss', fatal=False))
2373             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2374                 http_count += 1
2375                 formats.append({
2376                     'url': src_url,
2377                     'ext': ext or src_ext or 'flv',
2378                     'format_id': 'http-%d' % (bitrate or http_count),
2379                     'tbr': bitrate,
2380                     'filesize': filesize,
2381                     'width': width,
2382                     'height': height,
2383                 })
2384
2385         return formats
2386
2387     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2388         urls = []
2389         subtitles = {}
2390         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2391             src = textstream.get('src')
2392             if not src or src in urls:
2393                 continue
2394             urls.append(src)
2395             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2396             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2397             subtitles.setdefault(lang, []).append({
2398                 'url': src,
2399                 'ext': ext,
2400             })
2401         return subtitles
2402
2403     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2404         xspf = self._download_xml(
2405             xspf_url, playlist_id, 'Downloading xpsf playlist',
2406             'Unable to download xspf manifest', fatal=fatal)
2407         if xspf is False:
2408             return []
2409         return self._parse_xspf(
2410             xspf, playlist_id, xspf_url=xspf_url,
2411             xspf_base_url=base_url(xspf_url))
2412
2413     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2414         NS_MAP = {
2415             'xspf': 'http://xspf.org/ns/0/',
2416             's1': 'http://static.streamone.nl/player/ns/0',
2417         }
2418
2419         entries = []
2420         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2421             title = xpath_text(
2422                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2423             description = xpath_text(
2424                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2425             thumbnail = xpath_text(
2426                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2427             duration = float_or_none(
2428                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2429
2430             formats = []
2431             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2432                 format_url = urljoin(xspf_base_url, location.text)
2433                 if not format_url:
2434                     continue
2435                 formats.append({
2436                     'url': format_url,
2437                     'manifest_url': xspf_url,
2438                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2439                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2440                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2441                 })
2442             self._sort_formats(formats)
2443
2444             entries.append({
2445                 'id': playlist_id,
2446                 'title': title,
2447                 'description': description,
2448                 'thumbnail': thumbnail,
2449                 'duration': duration,
2450                 'formats': formats,
2451             })
2452         return entries
2453
2454     def _extract_mpd_formats(self, *args, **kwargs):
2455         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2456         if subs:
2457             self.report_warning(bug_reports_message(
2458                 "Ignoring subtitle tracks found in the DASH manifest; "
2459                 "if any subtitle tracks are missing,"
2460             ))
2461         return fmts
2462
2463     def _extract_mpd_formats_and_subtitles(
2464             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2465             fatal=True, data=None, headers={}, query={}):
2466         res = self._download_xml_handle(
2467             mpd_url, video_id,
2468             note='Downloading MPD manifest' if note is None else note,
2469             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2470             fatal=fatal, data=data, headers=headers, query=query)
2471         if res is False:
2472             return [], {}
2473         mpd_doc, urlh = res
2474         if mpd_doc is None:
2475             return [], {}
2476         mpd_base_url = base_url(urlh.geturl())
2477
2478         return self._parse_mpd_formats_and_subtitles(
2479             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2480
2481     def _parse_mpd_formats(self, *args, **kwargs):
2482         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2483         if subs:
2484             self.report_warning(bug_reports_message(
2485                 "Ignoring subtitle tracks found in the DASH manifest; "
2486                 "if any subtitle tracks are missing,"
2487             ))
2488         return fmts
2489
2490     def _parse_mpd_formats_and_subtitles(
2491             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2492         """
2493         Parse formats from MPD manifest.
2494         References:
2495          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2496             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2497          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2498         """
2499         if not self.get_param('dynamic_mpd', True):
2500             if mpd_doc.get('type') == 'dynamic':
2501                 return [], {}
2502
2503         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2504
2505         def _add_ns(path):
2506             return self._xpath_ns(path, namespace)
2507
2508         def is_drm_protected(element):
2509             return element.find(_add_ns('ContentProtection')) is not None
2510
2511         def extract_multisegment_info(element, ms_parent_info):
2512             ms_info = ms_parent_info.copy()
2513
2514             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2515             # common attributes and elements.  We will only extract relevant
2516             # for us.
2517             def extract_common(source):
2518                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2519                 if segment_timeline is not None:
2520                     s_e = segment_timeline.findall(_add_ns('S'))
2521                     if s_e:
2522                         ms_info['total_number'] = 0
2523                         ms_info['s'] = []
2524                         for s in s_e:
2525                             r = int(s.get('r', 0))
2526                             ms_info['total_number'] += 1 + r
2527                             ms_info['s'].append({
2528                                 't': int(s.get('t', 0)),
2529                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2530                                 'd': int(s.attrib['d']),
2531                                 'r': r,
2532                             })
2533                 start_number = source.get('startNumber')
2534                 if start_number:
2535                     ms_info['start_number'] = int(start_number)
2536                 timescale = source.get('timescale')
2537                 if timescale:
2538                     ms_info['timescale'] = int(timescale)
2539                 segment_duration = source.get('duration')
2540                 if segment_duration:
2541                     ms_info['segment_duration'] = float(segment_duration)
2542
2543             def extract_Initialization(source):
2544                 initialization = source.find(_add_ns('Initialization'))
2545                 if initialization is not None:
2546                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2547
2548             segment_list = element.find(_add_ns('SegmentList'))
2549             if segment_list is not None:
2550                 extract_common(segment_list)
2551                 extract_Initialization(segment_list)
2552                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2553                 if segment_urls_e:
2554                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2555             else:
2556                 segment_template = element.find(_add_ns('SegmentTemplate'))
2557                 if segment_template is not None:
2558                     extract_common(segment_template)
2559                     media = segment_template.get('media')
2560                     if media:
2561                         ms_info['media'] = media
2562                     initialization = segment_template.get('initialization')
2563                     if initialization:
2564                         ms_info['initialization'] = initialization
2565                     else:
2566                         extract_Initialization(segment_template)
2567             return ms_info
2568
2569         skip_unplayable = not self.get_param('allow_unplayable_formats')
2570
2571         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2572         formats = []
2573         subtitles = {}
2574         for period in mpd_doc.findall(_add_ns('Period')):
2575             period_duration = parse_duration(period.get('duration')) or mpd_duration
2576             period_ms_info = extract_multisegment_info(period, {
2577                 'start_number': 1,
2578                 'timescale': 1,
2579             })
2580             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2581                 if skip_unplayable and is_drm_protected(adaptation_set):
2582                     continue
2583                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2584                 for representation in adaptation_set.findall(_add_ns('Representation')):
2585                     if skip_unplayable and is_drm_protected(representation):
2586                         continue
2587                     representation_attrib = adaptation_set.attrib.copy()
2588                     representation_attrib.update(representation.attrib)
2589                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2590                     mime_type = representation_attrib['mimeType']
2591                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2592
2593                     if content_type in ('video', 'audio', 'text') or mime_type == 'image/jpeg':
2594                         base_url = ''
2595                         for element in (representation, adaptation_set, period, mpd_doc):
2596                             base_url_e = element.find(_add_ns('BaseURL'))
2597                             if base_url_e is not None:
2598                                 base_url = base_url_e.text + base_url
2599                                 if re.match(r'^https?://', base_url):
2600                                     break
2601                         if mpd_base_url and not re.match(r'^https?://', base_url):
2602                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
2603                                 mpd_base_url += '/'
2604                             base_url = mpd_base_url + base_url
2605                         representation_id = representation_attrib.get('id')
2606                         lang = representation_attrib.get('lang')
2607                         url_el = representation.find(_add_ns('BaseURL'))
2608                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2609                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2610                         if representation_id is not None:
2611                             format_id = representation_id
2612                         else:
2613                             format_id = content_type
2614                         if mpd_id:
2615                             format_id = mpd_id + '-' + format_id
2616                         if content_type in ('video', 'audio'):
2617                             f = {
2618                                 'format_id': format_id,
2619                                 'manifest_url': mpd_url,
2620                                 'ext': mimetype2ext(mime_type),
2621                                 'width': int_or_none(representation_attrib.get('width')),
2622                                 'height': int_or_none(representation_attrib.get('height')),
2623                                 'tbr': float_or_none(bandwidth, 1000),
2624                                 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2625                                 'fps': int_or_none(representation_attrib.get('frameRate')),
2626                                 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2627                                 'format_note': 'DASH %s' % content_type,
2628                                 'filesize': filesize,
2629                                 'container': mimetype2ext(mime_type) + '_dash',
2630                             }
2631                             f.update(parse_codecs(representation_attrib.get('codecs')))
2632                         elif content_type == 'text':
2633                             f = {
2634                                 'ext': mimetype2ext(mime_type),
2635                                 'manifest_url': mpd_url,
2636                                 'filesize': filesize,
2637                             }
2638                         elif mime_type == 'image/jpeg':
2639                             # See test case in VikiIE
2640                             # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2641                             f = {
2642                                 'format_id': format_id,
2643                                 'ext': 'mhtml',
2644                                 'manifest_url': mpd_url,
2645                                 'format_note': 'DASH storyboards (jpeg)',
2646                                 'acodec': 'none',
2647                                 'vcodec': 'none',
2648                             }
2649                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2650
2651                         def prepare_template(template_name, identifiers):
2652                             tmpl = representation_ms_info[template_name]
2653                             # First of, % characters outside $...$ templates
2654                             # must be escaped by doubling for proper processing
2655                             # by % operator string formatting used further (see
2656                             # https://github.com/ytdl-org/youtube-dl/issues/16867).
2657                             t = ''
2658                             in_template = False
2659                             for c in tmpl:
2660                                 t += c
2661                                 if c == '$':
2662                                     in_template = not in_template
2663                                 elif c == '%' and not in_template:
2664                                     t += c
2665                             # Next, $...$ templates are translated to their
2666                             # %(...) counterparts to be used with % operator
2667                             if representation_id is not None:
2668                                 t = t.replace('$RepresentationID$', representation_id)
2669                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2670                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2671                             t.replace('$$', '$')
2672                             return t
2673
2674                         # @initialization is a regular template like @media one
2675                         # so it should be handled just the same way (see
2676                         # https://github.com/ytdl-org/youtube-dl/issues/11605)
2677                         if 'initialization' in representation_ms_info:
2678                             initialization_template = prepare_template(
2679                                 'initialization',
2680                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2681                                 # $Time$ shall not be included for @initialization thus
2682                                 # only $Bandwidth$ remains
2683                                 ('Bandwidth', ))
2684                             representation_ms_info['initialization_url'] = initialization_template % {
2685                                 'Bandwidth': bandwidth,
2686                             }
2687
2688                         def location_key(location):
2689                             return 'url' if re.match(r'^https?://', location) else 'path'
2690
2691                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2692
2693                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2694                             media_location_key = location_key(media_template)
2695
2696                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2697                             # can't be used at the same time
2698                             if '%(Number' in media_template and 's' not in representation_ms_info:
2699                                 segment_duration = None
2700                                 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2701                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2702                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2703                                 representation_ms_info['fragments'] = [{
2704                                     media_location_key: media_template % {
2705                                         'Number': segment_number,
2706                                         'Bandwidth': bandwidth,
2707                                     },
2708                                     'duration': segment_duration,
2709                                 } for segment_number in range(
2710                                     representation_ms_info['start_number'],
2711                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2712                             else:
2713                                 # $Number*$ or $Time$ in media template with S list available
2714                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2715                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2716                                 representation_ms_info['fragments'] = []
2717                                 segment_time = 0
2718                                 segment_d = None
2719                                 segment_number = representation_ms_info['start_number']
2720
2721                                 def add_segment_url():
2722                                     segment_url = media_template % {
2723                                         'Time': segment_time,
2724                                         'Bandwidth': bandwidth,
2725                                         'Number': segment_number,
2726                                     }
2727                                     representation_ms_info['fragments'].append({
2728                                         media_location_key: segment_url,
2729                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2730                                     })
2731
2732                                 for num, s in enumerate(representation_ms_info['s']):
2733                                     segment_time = s.get('t') or segment_time
2734                                     segment_d = s['d']
2735                                     add_segment_url()
2736                                     segment_number += 1
2737                                     for r in range(s.get('r', 0)):
2738                                         segment_time += segment_d
2739                                         add_segment_url()
2740                                         segment_number += 1
2741                                     segment_time += segment_d
2742                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2743                             # No media template
2744                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2745                             # or any YouTube dashsegments video
2746                             fragments = []
2747                             segment_index = 0
2748                             timescale = representation_ms_info['timescale']
2749                             for s in representation_ms_info['s']:
2750                                 duration = float_or_none(s['d'], timescale)
2751                                 for r in range(s.get('r', 0) + 1):
2752                                     segment_uri = representation_ms_info['segment_urls'][segment_index]
2753                                     fragments.append({
2754                                         location_key(segment_uri): segment_uri,
2755                                         'duration': duration,
2756                                     })
2757                                     segment_index += 1
2758                             representation_ms_info['fragments'] = fragments
2759                         elif 'segment_urls' in representation_ms_info:
2760                             # Segment URLs with no SegmentTimeline
2761                             # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2762                             # https://github.com/ytdl-org/youtube-dl/pull/14844
2763                             fragments = []
2764                             segment_duration = float_or_none(
2765                                 representation_ms_info['segment_duration'],
2766                                 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2767                             for segment_url in representation_ms_info['segment_urls']:
2768                                 fragment = {
2769                                     location_key(segment_url): segment_url,
2770                                 }
2771                                 if segment_duration:
2772                                     fragment['duration'] = segment_duration
2773                                 fragments.append(fragment)
2774                             representation_ms_info['fragments'] = fragments
2775                         # If there is a fragments key available then we correctly recognized fragmented media.
2776                         # Otherwise we will assume unfragmented media with direct access. Technically, such
2777                         # assumption is not necessarily correct since we may simply have no support for
2778                         # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2779                         if 'fragments' in representation_ms_info:
2780                             f.update({
2781                                 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2782                                 'url': mpd_url or base_url,
2783                                 'fragment_base_url': base_url,
2784                                 'fragments': [],
2785                                 'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2786                             })
2787                             if 'initialization_url' in representation_ms_info:
2788                                 initialization_url = representation_ms_info['initialization_url']
2789                                 if not f.get('url'):
2790                                     f['url'] = initialization_url
2791                                 f['fragments'].append({location_key(initialization_url): initialization_url})
2792                             f['fragments'].extend(representation_ms_info['fragments'])
2793                         else:
2794                             # Assuming direct URL to unfragmented media.
2795                             f['url'] = base_url
2796                         if content_type in ('video', 'audio') or mime_type == 'image/jpeg':
2797                             formats.append(f)
2798                         elif content_type == 'text':
2799                             subtitles.setdefault(lang or 'und', []).append(f)
2800                     else:
2801                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2802         return formats, subtitles
2803
2804     def _extract_ism_formats(self, *args, **kwargs):
2805         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2806         if subs:
2807             self.report_warning(bug_reports_message(
2808                 "Ignoring subtitle tracks found in the ISM manifest; "
2809                 "if any subtitle tracks are missing,"
2810             ))
2811         return fmts
2812
2813     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2814         res = self._download_xml_handle(
2815             ism_url, video_id,
2816             note='Downloading ISM manifest' if note is None else note,
2817             errnote='Failed to download ISM manifest' if errnote is None else errnote,
2818             fatal=fatal, data=data, headers=headers, query=query)
2819         if res is False:
2820             return [], {}
2821         ism_doc, urlh = res
2822         if ism_doc is None:
2823             return [], {}
2824
2825         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
2826
2827     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
2828         """
2829         Parse formats from ISM manifest.
2830         References:
2831          1. [MS-SSTR]: Smooth Streaming Protocol,
2832             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2833         """
2834         if ism_doc.get('IsLive') == 'TRUE':
2835             return [], {}
2836         if (not self.get_param('allow_unplayable_formats')
2837                 and ism_doc.find('Protection') is not None):
2838             return [], {}
2839
2840         duration = int(ism_doc.attrib['Duration'])
2841         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2842
2843         formats = []
2844         subtitles = {}
2845         for stream in ism_doc.findall('StreamIndex'):
2846             stream_type = stream.get('Type')
2847             if stream_type not in ('video', 'audio', 'text'):
2848                 continue
2849             url_pattern = stream.attrib['Url']
2850             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2851             stream_name = stream.get('Name')
2852             stream_language = stream.get('Language', 'und')
2853             for track in stream.findall('QualityLevel'):
2854                 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
2855                 # TODO: add support for WVC1 and WMAP
2856                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
2857                     self.report_warning('%s is not a supported codec' % fourcc)
2858                     continue
2859                 tbr = int(track.attrib['Bitrate']) // 1000
2860                 # [1] does not mention Width and Height attributes. However,
2861                 # they're often present while MaxWidth and MaxHeight are
2862                 # missing, so should be used as fallbacks
2863                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2864                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2865                 sampling_rate = int_or_none(track.get('SamplingRate'))
2866
2867                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2868                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2869
2870                 fragments = []
2871                 fragment_ctx = {
2872                     'time': 0,
2873                 }
2874                 stream_fragments = stream.findall('c')
2875                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2876                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2877                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2878                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2879                     if not fragment_ctx['duration']:
2880                         try:
2881                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2882                         except IndexError:
2883                             next_fragment_time = duration
2884                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2885                     for _ in range(fragment_repeat):
2886                         fragments.append({
2887                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2888                             'duration': fragment_ctx['duration'] / stream_timescale,
2889                         })
2890                         fragment_ctx['time'] += fragment_ctx['duration']
2891
2892                 format_id = []
2893                 if ism_id:
2894                     format_id.append(ism_id)
2895                 if stream_name:
2896                     format_id.append(stream_name)
2897                 format_id.append(compat_str(tbr))
2898
2899                 if stream_type == 'text':
2900                     subtitles.setdefault(stream_language, []).append({
2901                         'ext': 'ismt',
2902                         'protocol': 'ism',
2903                         'url': ism_url,
2904                         'manifest_url': ism_url,
2905                         'fragments': fragments,
2906                         '_download_params': {
2907                             'stream_type': stream_type,
2908                             'duration': duration,
2909                             'timescale': stream_timescale,
2910                             'fourcc': fourcc,
2911                             'language': stream_language,
2912                             'codec_private_data': track.get('CodecPrivateData'),
2913                         }
2914                     })
2915                 elif stream_type in ('video', 'audio'):
2916                     formats.append({
2917                         'format_id': '-'.join(format_id),
2918                         'url': ism_url,
2919                         'manifest_url': ism_url,
2920                         'ext': 'ismv' if stream_type == 'video' else 'isma',
2921                         'width': width,
2922                         'height': height,
2923                         'tbr': tbr,
2924                         'asr': sampling_rate,
2925                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
2926                         'acodec': 'none' if stream_type == 'video' else fourcc,
2927                         'protocol': 'ism',
2928                         'fragments': fragments,
2929                         '_download_params': {
2930                             'stream_type': stream_type,
2931                             'duration': duration,
2932                             'timescale': stream_timescale,
2933                             'width': width or 0,
2934                             'height': height or 0,
2935                             'fourcc': fourcc,
2936                             'language': stream_language,
2937                             'codec_private_data': track.get('CodecPrivateData'),
2938                             'sampling_rate': sampling_rate,
2939                             'channels': int_or_none(track.get('Channels', 2)),
2940                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2941                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2942                         },
2943                     })
2944         return formats, subtitles
2945
2946     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
2947         def absolute_url(item_url):
2948             return urljoin(base_url, item_url)
2949
2950         def parse_content_type(content_type):
2951             if not content_type:
2952                 return {}
2953             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2954             if ctr:
2955                 mimetype, codecs = ctr.groups()
2956                 f = parse_codecs(codecs)
2957                 f['ext'] = mimetype2ext(mimetype)
2958                 return f
2959             return {}
2960
2961         def _media_formats(src, cur_media_type, type_info={}):
2962             full_url = absolute_url(src)
2963             ext = type_info.get('ext') or determine_ext(full_url)
2964             if ext == 'm3u8':
2965                 is_plain_url = False
2966                 formats = self._extract_m3u8_formats(
2967                     full_url, video_id, ext='mp4',
2968                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2969                     preference=preference, quality=quality, fatal=False)
2970             elif ext == 'mpd':
2971                 is_plain_url = False
2972                 formats = self._extract_mpd_formats(
2973                     full_url, video_id, mpd_id=mpd_id, fatal=False)
2974             else:
2975                 is_plain_url = True
2976                 formats = [{
2977                     'url': full_url,
2978                     'vcodec': 'none' if cur_media_type == 'audio' else None,
2979                 }]
2980             return is_plain_url, formats
2981
2982         entries = []
2983         # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we will include them right here (see
2985         # https://www.ampproject.org/docs/reference/components/amp-video)
2986         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
2987         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
2988         media_tags = [(media_tag, media_tag_name, media_type, '')
2989                       for media_tag, media_tag_name, media_type
2990                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
2991         media_tags.extend(re.findall(
2992             # We only allow video|audio followed by a whitespace or '>'.
2993             # Allowing more characters may end up in significant slow down (see
2994             # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
2995             # http://www.porntrex.com/maps/videositemap.xml).
2996             r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
2997         for media_tag, _, media_type, media_content in media_tags:
2998             media_info = {
2999                 'formats': [],
3000                 'subtitles': {},
3001             }
3002             media_attributes = extract_attributes(media_tag)
3003             src = strip_or_none(media_attributes.get('src'))
3004             if src:
3005                 _, formats = _media_formats(src, media_type)
3006                 media_info['formats'].extend(formats)
3007             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3008             if media_content:
3009                 for source_tag in re.findall(r'<source[^>]+>', media_content):
3010                     s_attr = extract_attributes(source_tag)
3011                     # data-video-src and data-src are non standard but seen
3012                     # several times in the wild
3013                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
3014                     if not src:
3015                         continue
3016                     f = parse_content_type(s_attr.get('type'))
3017                     is_plain_url, formats = _media_formats(src, media_type, f)
3018                     if is_plain_url:
3019                         # width, height, res, label and title attributes are
3020                         # all not standard but seen several times in the wild
3021                         labels = [
3022                             s_attr.get(lbl)
3023                             for lbl in ('label', 'title')
3024                             if str_or_none(s_attr.get(lbl))
3025                         ]
3026                         width = int_or_none(s_attr.get('width'))
3027                         height = (int_or_none(s_attr.get('height'))
3028                                   or int_or_none(s_attr.get('res')))
3029                         if not width or not height:
3030                             for lbl in labels:
3031                                 resolution = parse_resolution(lbl)
3032                                 if not resolution:
3033                                     continue
3034                                 width = width or resolution.get('width')
3035                                 height = height or resolution.get('height')
3036                         for lbl in labels:
3037                             tbr = parse_bitrate(lbl)
3038                             if tbr:
3039                                 break
3040                         else:
3041                             tbr = None
3042                         f.update({
3043                             'width': width,
3044                             'height': height,
3045                             'tbr': tbr,
3046                             'format_id': s_attr.get('label') or s_attr.get('title'),
3047                         })
3048                         f.update(formats[0])
3049                         media_info['formats'].append(f)
3050                     else:
3051                         media_info['formats'].extend(formats)
3052                 for track_tag in re.findall(r'<track[^>]+>', media_content):
3053                     track_attributes = extract_attributes(track_tag)
3054                     kind = track_attributes.get('kind')
3055                     if not kind or kind in ('subtitles', 'captions'):
3056                         src = strip_or_none(track_attributes.get('src'))
3057                         if not src:
3058                             continue
3059                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3060                         media_info['subtitles'].setdefault(lang, []).append({
3061                             'url': absolute_url(src),
3062                         })
3063             for f in media_info['formats']:
3064                 f.setdefault('http_headers', {})['Referer'] = base_url
3065             if media_info['formats'] or media_info['subtitles']:
3066                 entries.append(media_info)
3067         return entries
3068
3069     def _extract_akamai_formats(self, *args, **kwargs):
3070         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3071         if subs:
3072             self.report_warning(bug_reports_message(
3073                 "Ignoring subtitle tracks found in the manifests; "
3074                 "if any subtitle tracks are missing,"
3075             ))
3076         return fmts
3077
3078     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3079         signed = 'hdnea=' in manifest_url
3080         if not signed:
3081             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3082             manifest_url = re.sub(
3083                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3084                 '', manifest_url).strip('?')
3085
3086         formats = []
3087         subtitles = {}
3088
3089         hdcore_sign = 'hdcore=3.7.0'
3090         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3091         hds_host = hosts.get('hds')
3092         if hds_host:
3093             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3094         if 'hdcore=' not in f4m_url:
3095             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3096         f4m_formats = self._extract_f4m_formats(
3097             f4m_url, video_id, f4m_id='hds', fatal=False)
3098         for entry in f4m_formats:
3099             entry.update({'extra_param_to_segment_url': hdcore_sign})
3100         formats.extend(f4m_formats)
3101
3102         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3103         hls_host = hosts.get('hls')
3104         if hls_host:
3105             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3106         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3107             m3u8_url, video_id, 'mp4', 'm3u8_native',
3108             m3u8_id='hls', fatal=False)
3109         formats.extend(m3u8_formats)
3110         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3111
3112         http_host = hosts.get('http')
3113         if http_host and m3u8_formats and not signed:
3114             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3115             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3116             qualities_length = len(qualities)
3117             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3118                 i = 0
3119                 for f in m3u8_formats:
3120                     if f['vcodec'] != 'none':
3121                         for protocol in ('http', 'https'):
3122                             http_f = f.copy()
3123                             del http_f['manifest_url']
3124                             http_url = re.sub(
3125                                 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
3126                             http_f.update({
3127                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3128                                 'url': http_url,
3129                                 'protocol': protocol,
3130                             })
3131                             formats.append(http_f)
3132                         i += 1
3133
3134         return formats, subtitles
3135
3136     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3137         query = compat_urlparse.urlparse(url).query
3138         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3139         mobj = re.search(
3140             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3141         url_base = mobj.group('url')
3142         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3143         formats = []
3144
3145         def manifest_url(manifest):
3146             m_url = '%s/%s' % (http_base_url, manifest)
3147             if query:
3148                 m_url += '?%s' % query
3149             return m_url
3150
3151         if 'm3u8' not in skip_protocols:
3152             formats.extend(self._extract_m3u8_formats(
3153                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3154                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3155         if 'f4m' not in skip_protocols:
3156             formats.extend(self._extract_f4m_formats(
3157                 manifest_url('manifest.f4m'),
3158                 video_id, f4m_id='hds', fatal=False))
3159         if 'dash' not in skip_protocols:
3160             formats.extend(self._extract_mpd_formats(
3161                 manifest_url('manifest.mpd'),
3162                 video_id, mpd_id='dash', fatal=False))
3163         if re.search(r'(?:/smil:|\.smil)', url_base):
3164             if 'smil' not in skip_protocols:
3165                 rtmp_formats = self._extract_smil_formats(
3166                     manifest_url('jwplayer.smil'),
3167                     video_id, fatal=False)
3168                 for rtmp_format in rtmp_formats:
3169                     rtsp_format = rtmp_format.copy()
3170                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3171                     del rtsp_format['play_path']
3172                     del rtsp_format['ext']
3173                     rtsp_format.update({
3174                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3175                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3176                         'protocol': 'rtsp',
3177                     })
3178                     formats.extend([rtmp_format, rtsp_format])
3179         else:
3180             for protocol in ('rtmp', 'rtsp'):
3181                 if protocol not in skip_protocols:
3182                     formats.append({
3183                         'url': '%s:%s' % (protocol, url_base),
3184                         'format_id': protocol,
3185                         'protocol': protocol,
3186                     })
3187         return formats
3188
3189     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3190         mobj = re.search(
3191             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3192             webpage)
3193         if mobj:
3194             try:
3195                 jwplayer_data = self._parse_json(mobj.group('options'),
3196                                                  video_id=video_id,
3197                                                  transform_source=transform_source)
3198             except ExtractorError:
3199                 pass
3200             else:
3201                 if isinstance(jwplayer_data, dict):
3202                     return jwplayer_data
3203
3204     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3205         jwplayer_data = self._find_jwplayer_data(
3206             webpage, video_id, transform_source=js_to_json)
3207         return self._parse_jwplayer_data(
3208             jwplayer_data, video_id, *args, **kwargs)
3209
3210     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3211                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3212         # JWPlayer backward compatibility: flattened playlists
3213         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3214         if 'playlist' not in jwplayer_data:
3215             jwplayer_data = {'playlist': [jwplayer_data]}
3216
3217         entries = []
3218
3219         # JWPlayer backward compatibility: single playlist item
3220         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3221         if not isinstance(jwplayer_data['playlist'], list):
3222             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3223
3224         for video_data in jwplayer_data['playlist']:
3225             # JWPlayer backward compatibility: flattened sources
3226             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3227             if 'sources' not in video_data:
3228                 video_data['sources'] = [video_data]
3229
3230             this_video_id = video_id or video_data['mediaid']
3231
3232             formats = self._parse_jwplayer_formats(
3233                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3234                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3235
3236             subtitles = {}
3237             tracks = video_data.get('tracks')
3238             if tracks and isinstance(tracks, list):
3239                 for track in tracks:
3240                     if not isinstance(track, dict):
3241                         continue
3242                     track_kind = track.get('kind')
3243                     if not track_kind or not isinstance(track_kind, compat_str):
3244                         continue
3245                     if track_kind.lower() not in ('captions', 'subtitles'):
3246                         continue
3247                     track_url = urljoin(base_url, track.get('file'))
3248                     if not track_url:
3249                         continue
3250                     subtitles.setdefault(track.get('label') or 'en', []).append({
3251                         'url': self._proto_relative_url(track_url)
3252                     })
3253
3254             entry = {
3255                 'id': this_video_id,
3256                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3257                 'description': clean_html(video_data.get('description')),
3258                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3259                 'timestamp': int_or_none(video_data.get('pubdate')),
3260                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3261                 'subtitles': subtitles,
3262             }
3263             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3264             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3265                 entry.update({
3266                     '_type': 'url_transparent',
3267                     'url': formats[0]['url'],
3268                 })
3269             else:
3270                 self._sort_formats(formats)
3271                 entry['formats'] = formats
3272             entries.append(entry)
3273         if len(entries) == 1:
3274             return entries[0]
3275         else:
3276             return self.playlist_result(entries)
3277
3278     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3279                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3280         urls = []
3281         formats = []
3282         for source in jwplayer_sources_data:
3283             if not isinstance(source, dict):
3284                 continue
3285             source_url = urljoin(
3286                 base_url, self._proto_relative_url(source.get('file')))
3287             if not source_url or source_url in urls:
3288                 continue
3289             urls.append(source_url)
3290             source_type = source.get('type') or ''
3291             ext = mimetype2ext(source_type) or determine_ext(source_url)
3292             if source_type == 'hls' or ext == 'm3u8':
3293                 formats.extend(self._extract_m3u8_formats(
3294                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3295                     m3u8_id=m3u8_id, fatal=False))
3296             elif source_type == 'dash' or ext == 'mpd':
3297                 formats.extend(self._extract_mpd_formats(
3298                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3299             elif ext == 'smil':
3300                 formats.extend(self._extract_smil_formats(
3301                     source_url, video_id, fatal=False))
3302             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3303             elif source_type.startswith('audio') or ext in (
3304                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3305                 formats.append({
3306                     'url': source_url,
3307                     'vcodec': 'none',
3308                     'ext': ext,
3309                 })
3310             else:
3311                 height = int_or_none(source.get('height'))
3312                 if height is None:
3313                     # Often no height is provided but there is a label in
3314                     # format like "1080p", "720p SD", or 1080.
3315                     height = int_or_none(self._search_regex(
3316                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3317                         'height', default=None))
3318                 a_format = {
3319                     'url': source_url,
3320                     'width': int_or_none(source.get('width')),
3321                     'height': height,
3322                     'tbr': int_or_none(source.get('bitrate')),
3323                     'ext': ext,
3324                 }
3325                 if source_url.startswith('rtmp'):
3326                     a_format['ext'] = 'flv'
3327                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3328                     # of jwplayer.flash.swf
3329                     rtmp_url_parts = re.split(
3330                         r'((?:mp4|mp3|flv):)', source_url, 1)
3331                     if len(rtmp_url_parts) == 3:
3332                         rtmp_url, prefix, play_path = rtmp_url_parts
3333                         a_format.update({
3334                             'url': rtmp_url,
3335                             'play_path': prefix + play_path,
3336                         })
3337                     if rtmp_params:
3338                         a_format.update(rtmp_params)
3339                 formats.append(a_format)
3340         return formats
3341
3342     def _live_title(self, name):
3343         """ Generate the title for a live video """
3344         now = datetime.datetime.now()
3345         now_str = now.strftime('%Y-%m-%d %H:%M')
3346         return name + ' ' + now_str
3347
3348     def _int(self, v, name, fatal=False, **kwargs):
3349         res = int_or_none(v, **kwargs)
3350         if 'get_attr' in kwargs:
3351             print(getattr(v, kwargs['get_attr']))
3352         if res is None:
3353             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3354             if fatal:
3355                 raise ExtractorError(msg)
3356             else:
3357                 self.report_warning(msg)
3358         return res
3359
3360     def _float(self, v, name, fatal=False, **kwargs):
3361         res = float_or_none(v, **kwargs)
3362         if res is None:
3363             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3364             if fatal:
3365                 raise ExtractorError(msg)
3366             else:
3367                 self.report_warning(msg)
3368         return res
3369
3370     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3371                     path='/', secure=False, discard=False, rest={}, **kwargs):
3372         cookie = compat_cookiejar_Cookie(
3373             0, name, value, port, port is not None, domain, True,
3374             domain.startswith('.'), path, True, secure, expire_time,
3375             discard, None, None, rest)
3376         self._downloader.cookiejar.set_cookie(cookie)
3377
3378     def _get_cookies(self, url):
3379         """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3380         req = sanitized_Request(url)
3381         self._downloader.cookiejar.add_cookie_header(req)
3382         return compat_cookies_SimpleCookie(req.get_header('Cookie'))
3383
3384     def _apply_first_set_cookie_header(self, url_handle, cookie):
3385         """
3386         Apply first Set-Cookie header instead of the last. Experimental.
3387
3388         Some sites (e.g. [1-3]) may serve two cookies under the same name
3389         in Set-Cookie header and expect the first (old) one to be set rather
3390         than second (new). However, as of RFC6265 the newer one cookie
3391         should be set into cookie store what actually happens.
3392         We will workaround this issue by resetting the cookie to
3393         the first one manually.
3394         1. https://new.vk.com/
3395         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3396         3. https://learning.oreilly.com/
3397         """
3398         for header, cookies in url_handle.headers.items():
3399             if header.lower() != 'set-cookie':
3400                 continue
3401             if sys.version_info[0] >= 3:
3402                 cookies = cookies.encode('iso-8859-1')
3403             cookies = cookies.decode('utf-8')
3404             cookie_value = re.search(
3405                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3406             if cookie_value:
3407                 value, domain = cookie_value.groups()
3408                 self._set_cookie(domain, cookie, value)
3409                 break
3410
3411     def get_testcases(self, include_onlymatching=False):
3412         t = getattr(self, '_TEST', None)
3413         if t:
3414             assert not hasattr(self, '_TESTS'), \
3415                 '%s has _TEST and _TESTS' % type(self).__name__
3416             tests = [t]
3417         else:
3418             tests = getattr(self, '_TESTS', [])
3419         for t in tests:
3420             if not include_onlymatching and t.get('only_matching', False):
3421                 continue
3422             t['name'] = type(self).__name__[:-len('IE')]
3423             yield t
3424
3425     def is_suitable(self, age_limit):
3426         """ Test whether the extractor is generally suitable for the given
3427         age limit (i.e. pornographic sites are not, all others usually are) """
3428
3429         any_restricted = False
3430         for tc in self.get_testcases(include_onlymatching=False):
3431             if tc.get('playlist', []):
3432                 tc = tc['playlist'][0]
3433             is_restricted = age_restricted(
3434                 tc.get('info_dict', {}).get('age_limit'), age_limit)
3435             if not is_restricted:
3436                 return True
3437             any_restricted = any_restricted or is_restricted
3438         return not any_restricted
3439
3440     def extract_subtitles(self, *args, **kwargs):
3441         if (self.get_param('writesubtitles', False)
3442                 or self.get_param('listsubtitles')):
3443             return self._get_subtitles(*args, **kwargs)
3444         return {}
3445
3446     def _get_subtitles(self, *args, **kwargs):
3447         raise NotImplementedError('This method must be implemented by subclasses')
3448
3449     @staticmethod
3450     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3451         """ Merge subtitle items for one language. Items with duplicated URLs
3452         will be dropped. """
3453         list1_urls = set([item['url'] for item in subtitle_list1])
3454         ret = list(subtitle_list1)
3455         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
3456         return ret
3457
3458     @classmethod
3459     def _merge_subtitles(cls, *dicts, target=None):
3460         """ Merge subtitle dictionaries, language by language. """
3461         if target is None:
3462             target = {}
3463         for d in dicts:
3464             for lang, subs in d.items():
3465                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3466         return target
3467
3468     def extract_automatic_captions(self, *args, **kwargs):
3469         if (self.get_param('writeautomaticsub', False)
3470                 or self.get_param('listsubtitles')):
3471             return self._get_automatic_captions(*args, **kwargs)
3472         return {}
3473
3474     def _get_automatic_captions(self, *args, **kwargs):
3475         raise NotImplementedError('This method must be implemented by subclasses')
3476
3477     def mark_watched(self, *args, **kwargs):
3478         if (self.get_param('mark_watched', False)
3479                 and (self._get_login_info()[0] is not None
3480                      or self.get_param('cookiefile') is not None)):
3481             self._mark_watched(*args, **kwargs)
3482
3483     def _mark_watched(self, *args, **kwargs):
3484         raise NotImplementedError('This method must be implemented by subclasses')
3485
3486     def geo_verification_headers(self):
3487         headers = {}
3488         geo_verification_proxy = self.get_param('geo_verification_proxy')
3489         if geo_verification_proxy:
3490             headers['Ytdl-request-proxy'] = geo_verification_proxy
3491         return headers
3492
3493     def _generic_id(self, url):
3494         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3495
3496     def _generic_title(self, url):
3497         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3498
3499     @staticmethod
3500     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3501         all_known = all(map(
3502             lambda x: x is not None,
3503             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3504         return (
3505             'private' if is_private
3506             else 'premium_only' if needs_premium
3507             else 'subscriber_only' if needs_subscription
3508             else 'needs_auth' if needs_auth
3509             else 'unlisted' if is_unlisted
3510             else 'public' if all_known
3511             else None)
3512
3513     def _configuration_arg(self, key, default=NO_DEFAULT, casesense=False):
3514         '''
3515         @returns            A list of values for the extractor argument given by "key"
3516                             or "default" if no such key is present
3517         @param default      The default value to return when the key is not present (default: [])
3518         @param casesense    When false, the values are converted to lower case
3519         '''
3520         val = traverse_obj(
3521             self._downloader.params, ('extractor_args', self.ie_key().lower(), key))
3522         if val is None:
3523             return [] if default is NO_DEFAULT else default
3524         return list(val) if casesense else [x.lower() for x in val]
3525
3526
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex matching this extractor's search URLs"""
        pattern = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return pattern % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search URL of this extractor"""
        return bool(re.match(cls._make_valid_url(), url))

    def _real_extract(self, query):
        """
        Parse the search URL and fetch the requested number of results:
        one for an empty prefix, _MAX_RESULTS for "all", otherwise the
        given number capped (with a warning) at _MAX_RESULTS.
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = mobj.group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        """The search key (_SEARCH_KEY) of this extractor"""
        return self._SEARCH_KEY