# coding: utf-8
from __future__ import unicode_literals

import base64
import datetime
import hashlib
import json
import netrc
import os
import random
import re
import sys
import time
import math

from ..compat import (
    compat_cookiejar_Cookie,
    compat_cookies_SimpleCookie,
    compat_etree_Element,
    compat_etree_fromstring,
    compat_getpass,
    compat_http_client,
    compat_os_name,
    compat_str,
    compat_urllib_error,
    compat_urllib_parse_unquote,
    compat_urllib_parse_urlencode,
    compat_urllib_request,
    compat_urlparse,
    compat_xml_parse_error,
)
from ..downloader import FileDownloader
from ..downloader.f4m import (
    get_base_url,
    remove_encrypted_media,
)
from ..utils import (
    NO_DEFAULT,
    age_restricted,
    base_url,
    bug_reports_message,
    clean_html,
    compiled_regex_type,
    determine_ext,
    determine_protocol,
    dict_get,
    error_to_compat_str,
    ExtractorError,
    extract_attributes,
    fix_xml_ampersands,
    float_or_none,
    GeoRestrictedError,
    GeoUtils,
    int_or_none,
    js_to_json,
    JSON_LD_RE,
    mimetype2ext,
    network_exceptions,
    orderedSet,
    parse_bitrate,
    parse_codecs,
    parse_duration,
    parse_iso8601,
    parse_m3u8_attributes,
    parse_resolution,
    RegexNotFoundError,
    sanitized_Request,
    sanitize_filename,
    str_or_none,
    str_to_int,
    strip_or_none,
    traverse_obj,
    unescapeHTML,
    unified_strdate,
    unified_timestamp,
    update_Request,
    update_url_query,
    urljoin,
    url_basename,
    url_or_none,
    variadic,
    xpath_element,
    xpath_text,
    xpath_with_ns,
)


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the YoutubeDL. The YoutubeDL processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The type field determines the type of the result.
    By far the most common value (and the default if _type is missing) is
    "video", which indicates a single video.

    For a video, the dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        The mandatory URL representing the media:
                                   for plain file media - HTTP URL of this file,
                                   for RTMP - RTMP URL,
                                   for HLS - URL of the M3U8 media playlist,
                                   for HDS - URL of the F4M manifest,
                                   for DASH
                                     - HTTP URL to plain file media (in case of
                                       unfragmented media)
                                     - URL of the MPD manifest or base URL
                                       representing the media if MPD manifest
                                       is parsed from a string (in case of
                                       fragmented media)
                                   for MSS - URL of the ISM manifest.
                    * manifest_url
                                 The URL of the manifest file in case of
                                 fragmented media:
                                   for HLS - URL of the M3U8 master playlist,
                                   for HDS - URL of the F4M manifest,
                                   for DASH - URL of the MPD manifest,
                                   for MSS - URL of the ISM manifest.
                    * ext        Will be calculated from URL if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * fps        Frame rate
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp", "rtmp_ffmpeg", "rtmpe",
                                 "m3u8", "m3u8_native" or "http_dash_segments".
                    * fragment_base_url
                                 Base URL for fragments. Each fragment's path
                                 value (if present) will be relative to
                                 this URL.
                    * fragments  A list of fragments of a fragmented media.
                                 Each fragment entry must contain either a url
                                 or a path. If a url is present, it should be
                                 used by the client. Otherwise both path and
                                 fragment_base_url must be present. Here is
                                 the list of all potential fields:
                                 * "url" - fragment's URL
                                 * "path" - fragment's path relative to
                                            fragment_base_url
                                 * "duration" (optional, int or float)
                                 * "filesize" (optional, int)
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                                 < -1000 to hide the format (if there is
                                    another one which is strictly better)
                    * language   Language code, e.g. "de" or "en-US".
                    * language_preference  Is this in the language mentioned in
                                 the URL?
                                 10 if it's what the URL is about,
                                 -1 for default (don't know),
                                 -10 otherwise, other values reserved for now.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * source_preference  Order number for this video source
                                 (quality takes higher priority)
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * http_headers  A dictionary of additional HTTP headers
                                 to add to the request.
                    * stretched_ratio  If given and not 1, indicates that the
                                 video's pixels are not square.
                                 width : height ratio as float.
                    * no_resume  The server does not support resuming the
                                 (HTTP or RTMP) download. Boolean.
                    * downloader_options  A dictionary of downloader options as
                                 described in FileDownloader
                    RTMP formats can also have the additional fields: page_url,
                    app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
                    rtmp_protocol, rtmp_real_time

    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    alt_title:      A secondary title of the video.
    display_id:     An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "id" (optional, string) - Thumbnail format ID
                        * "url"
                        * "preference" (optional, int) - quality of the image
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
                        * "filesize" (optional, int)
                        * "_test_url" (optional, bool) - If true, test the URL
    thumbnail:      Full URL to a video thumbnail image.
    description:    Full video description.
    uploader:       Full name of the video uploader.
    license:        License name the video is licensed under.
    creator:        The creator of the video.
    release_timestamp: UNIX timestamp of the moment the video was released.
    release_date:   The date (YYYYMMDD) when the video was released.
    timestamp:      UNIX timestamp of the moment the video was uploaded.
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp.
    uploader_id:    Nickname or id of the video uploader.
    uploader_url:   Full URL to a personal webpage of the video uploader.
    channel:        Full name of the channel the video is uploaded on.
                    Note that channel fields may or may not repeat uploader
                    fields. This depends on a particular extractor.
    channel_id:     Id of the channel.
    channel_url:    Full URL to a channel webpage.
    location:       Physical location where the video was filmed.
    subtitles:      The available subtitles as a dictionary in the format
                    {tag: subformats}. "tag" is usually a language code, and
                    "subformats" is a list sorted from lower to higher
                    preference, each element is a dictionary with the "ext"
                    entry and one of:
                        * "data": The subtitles file contents
                        * "url": A URL pointing to the subtitles file
                    It can optionally also have:
                        * "name": Name or description of the subtitles
                    "ext" will be calculated from URL if missing
    automatic_captions: Like 'subtitles'; contains automatically generated
                    captions instead of normal subtitles
    duration:       Length of the video in seconds, as an integer or float.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
    comment_count:  Number of comments on the video
    comments:       A list of comments, each with one or more of the following
                    properties (all but one of text or html optional):
                        * "author" - human-readable name of the comment author
                        * "author_id" - user ID of the comment author
                        * "author_thumbnail" - The thumbnail of the comment author
                        * "id" - Comment ID
                        * "html" - Comment as HTML
                        * "text" - Plain text of the comment
                        * "timestamp" - UNIX timestamp of comment
                        * "parent" - ID of the comment this one is replying to.
                                     Set to "root" to indicate that this is a
                                     comment to the original video.
                        * "like_count" - Number of positive ratings of the comment
                        * "dislike_count" - Number of negative ratings of the comment
                        * "is_favorited" - Whether the comment is marked as
                                           favorite by the video uploader
                        * "author_is_uploader" - Whether the comment is made by
                                                 the video uploader
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage, if given to yt-dlp it
                    should allow getting the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]
    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
    cast:           A list of the video cast
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.
    was_live:       True, False, or None (=unknown). Whether this video was
                    originally a live stream.
    live_status:    'is_live', 'upcoming', 'was_live', 'not_live' or None (=unknown)
                    If absent, automatically set from is_live, was_live
    start_time:     Time in seconds where the reproduction should start, as
                    specified in the URL.
    end_time:       Time in seconds where the reproduction should end, as
                    specified in the URL.
    chapters:       A list of dictionaries, with the following entries:
                        * "start_time" - The start time of the chapter in seconds
                        * "end_time" - The end time of the chapter in seconds
                        * "title" (optional, string)
    playable_in_embed: Whether this video is allowed to play in embedded
                    players on other sites. Can be True (=always allowed),
                    False (=never allowed), None (=unknown), or a string
                    specifying the criteria for embedability (Eg: 'whitelist')
    availability:   Under what condition the video is available. One of
                    'private', 'premium_only', 'subscriber_only', 'needs_auth',
                    'unlisted' or 'public'. Use 'InfoExtractor._availability'
                    to set it
    __post_extractor: A function to be called just before the metadata is
                    written to either disk, logger or console. The function
                    must return a dict which will be added to the info_dict.
                    This is useful for additional information that is
                    time-consuming to extract. Note that the fields thus
                    extracted will not be available to output template and
                    match_filter. So, only "comments" and "comment_count" are
                    currently allowed to be extracted via this method.

    The following fields should only be used when the video belongs to some logical
    chapter or section:

    chapter:        Name or title of the chapter the video belongs to.
    chapter_number: Number of the chapter the video belongs to, as an integer.
    chapter_id:     Id of the chapter the video belongs to, as a unicode string.

    The following fields should only be used when the video is an episode of some
    series, programme or podcast:

    series:         Title of the series or programme the video episode belongs to.
    season:         Title of the season the video episode belongs to.
    season_number:  Number of the season the video episode belongs to, as an integer.
    season_id:      Id of the season the video episode belongs to, as a unicode string.
    episode:        Title of the video episode. Unlike mandatory video title field,
                    this field should denote the exact title of the video episode
                    without any kind of decoration.
    episode_number: Number of the video episode within a season, as an integer.
    episode_id:     Id of the video episode, as a unicode string.

    The following fields should only be used when the media is a track or a part of
    a music album:

    track:          Title of the track.
    track_number:   Number of the track within an album or a disc, as an integer.
    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
                    as a unicode string.
    artist:         Artist(s) of the track.
    genre:          Genre(s) of the track.
    album:          Title of the album the track belongs to.
    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
    album_artist:   List of all artists appeared on the album (e.g.
                    "Ash Borer / Fell Voices" or "Various Artists", useful for splits
                    and compilations).
    disc_number:    Number of the disc or other physical medium the track belongs to,
                    as an integer.
    release_year:   Year (YYYY) when the album was released.

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.

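    For illustration, a minimal info dict for a single video might look like
    this (a hypothetical sketch; all values are made up):

        {
            'id': '4234987',
            'title': 'Dancing naked mole rats',
            'formats': [{
                'url': 'https://example.com/video-360p.mp4',
                'format_id': '360p',
                'ext': 'mp4',
                'height': 360,
            }, {
                'url': 'https://example.com/video-720p.mp4',
                'format_id': '720p',
                'ext': 'mp4',
                'height': 720,
            }],
        }
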

    _type "playlist" indicates multiple videos.
    There must be a key "entries", which is a list, an iterable, or a PagedList
    object, each element of which is a valid dictionary by this specification.

    Additionally, playlists can have "id", "title", and any other relevant
    attributes with the same semantics as videos (see above).


    _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
    It must have an entries key like a playlist and contain all the keys
    required for a video at the same time.


    _type "url" indicates that the video must be extracted from another
    location, possibly by a different extractor. Its only required key is:
    "url" - the next URL to extract.
    The key "ie_key" can be set to the class name (minus the trailing "IE",
    e.g. "Youtube") if the extractor class is known in advance.
    Additionally, the dictionary may have any properties of the resolved entity
    known in advance, for example "title" if the title of the referred video is
    known ahead of time.


    _type "url_transparent" entities have the same specification as "url", but
    indicate that the given additional information is more precise than the one
    associated with the resolved URL.
    This is useful when a site employs a video service that hosts the video and
    its technical metadata, but that video service does not embed a useful
    title, description etc.

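    As an illustrative sketch, an extractor for a page that embeds a video
    from another service might return (service name and URL are made up):

        {
            '_type': 'url_transparent',
            'url': 'https://videoservice.example.com/embed/abc123',
            'ie_key': 'VideoServiceEmbed',  # hypothetical VideoServiceEmbedIE
            'title': 'Title taken from the embedding page',
        }
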

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _GEO_BYPASS attribute may be set to False in order to disable
    geo restriction bypass mechanisms for a particular extractor.
    Though it won't disable explicit geo restriction bypass based on
    country code provided with geo_bypass_country.

    _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
    countries for this extractor. One of these countries will be used by
    geo restriction bypass mechanism right away in order to bypass
    geo restriction, of course, if the mechanism is not disabled.

    _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
    IP blocks in CIDR notation for this extractor. One of these IP blocks
    will be used by geo restriction bypass mechanism similarly
    to _GEO_COUNTRIES.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _x_forwarded_for_ip = None
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    _WORKING = True

    _LOGIN_HINTS = {
        'any': 'Use --cookies, --username and --password or --netrc to provide account credentials',
        'cookies': (
            'Use --cookies for the authentication. '
            'See https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl for how to pass cookies'),
        'password': 'Use --username and --password or --netrc to provide account credentials',
    }

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self._x_forwarded_for_ip = None
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def _match_id(cls, url):
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        m = cls._VALID_URL_RE.match(url)
        assert m
        return compat_str(m.group('id'))

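    # Illustrative sketch (hypothetical names): a typical subclass defines a
    # _VALID_URL with a named "id" group, which suitable() and _match_id()
    # rely on:
    #
    #     class ExampleIE(InfoExtractor):
    #         _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>[0-9]+)'
    #
    #         def _real_extract(self, url):
    #             video_id = self._match_id(url)  # -> '123' for .../watch/123
    #             ...
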
    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._initialize_geo_bypass({
            'countries': self._GEO_COUNTRIES,
            'ip_blocks': self._GEO_IP_BLOCKS,
        })
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def _initialize_geo_bypass(self, geo_bypass_context):
        """
        Initialize geo restriction bypass mechanism.

        This method is used to initialize geo bypass mechanism based on faking
        X-Forwarded-For HTTP header. A random country from provided country list
        is selected and a random IP belonging to this country is generated. This
        IP will be passed as X-Forwarded-For HTTP header in all subsequent
        HTTP requests.

        This method will be used for initial geo bypass mechanism initialization
        during the instance initialization with _GEO_COUNTRIES and
        _GEO_IP_BLOCKS.

        You may also manually call it from extractor's code if geo bypass
        information is not available beforehand (e.g. obtained during
        extraction) or due to some other reason. In this case you should pass
        this information in geo bypass context passed as first argument. It may
        contain following fields:

        countries:  List of geo unrestricted countries (similar
                    to _GEO_COUNTRIES)
        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                    (similar to _GEO_IP_BLOCKS)

        """
        if not self._x_forwarded_for_ip:

            # Geo bypass mechanism is explicitly disabled by user
            if not self.get_param('geo_bypass', True):
                return

            if not geo_bypass_context:
                geo_bypass_context = {}

            # Backward compatibility: previously _initialize_geo_bypass
            # expected a list of countries, some 3rd party code may still use
            # it this way
            if isinstance(geo_bypass_context, (list, tuple)):
                geo_bypass_context = {
                    'countries': geo_bypass_context,
                }

            # The whole point of geo bypass mechanism is to fake IP
            # as X-Forwarded-For HTTP header based on some IP block or
            # country code.

            # Path 1: bypassing based on IP block in CIDR notation

            # Explicit IP block specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            ip_block = self.get_param('geo_bypass_ip_block', None)

            # Otherwise use random IP block from geo bypass context but only
            # if extractor is known as geo bypassable
            if not ip_block:
                ip_blocks = geo_bypass_context.get('ip_blocks')
                if self._GEO_BYPASS and ip_blocks:
                    ip_block = random.choice(ip_blocks)

            if ip_block:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
                self._downloader.write_debug(
                    'Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
                return

            # Path 2: bypassing based on country code

            # Explicit country code specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            country = self.get_param('geo_bypass_country', None)

            # Otherwise use random country code from geo bypass context but
            # only if extractor is known as geo bypassable
            if not country:
                countries = geo_bypass_context.get('countries')
                if self._GEO_BYPASS and countries:
                    country = random.choice(countries)

            if country:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
                self._downloader.write_debug(
                    'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))

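    # Sketch of a manual call from extractor code, as described in the
    # docstring above (country codes and CIDR block are placeholders):
    #
    #     self._initialize_geo_bypass({
    #         'countries': ['DE', 'FR'],
    #         'ip_blocks': ['203.0.113.0/24'],
    #     })
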
    def extract(self, url):
        """Extracts URL information and returns it in a list of dicts."""
        try:
            for _ in range(2):
                try:
                    self.initialize()
                    self.write_debug('Extracting URL: %s' % url)
                    ie_result = self._real_extract(url)
                    if ie_result is None:
                        return None
                    if self._x_forwarded_for_ip:
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    subtitles = ie_result.get('subtitles')
                    if (subtitles and 'live_chat' in subtitles
                            and 'no-live-chat' in self.get_param('compat_opts', [])):
                        del subtitles['live_chat']
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except ExtractorError:
            raise
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True)
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e)

    def __maybe_fake_ip_and_retry(self, countries):
        if (not self.get_param('geo_bypass_country', None)
                and self._GEO_BYPASS
                and self.get_param('geo_bypass', True)
                and not self._x_forwarded_for_ip
                and countries):
            country_code = random.choice(countries)
            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
            if self._x_forwarded_for_ip:
                self.report_warning(
                    'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
                    % (self._x_forwarded_for_ip, country_code.upper()))
                return True
        return False

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return compat_str(cls.__name__[:-2])

    @property
    def IE_NAME(self):
        return compat_str(type(self).__name__[:-2])

    @staticmethod
    def __can_accept_status_code(err, expected_status):
        assert isinstance(err, compat_urllib_error.HTTPError)
        if expected_status is None:
            return False
        elif callable(expected_status):
            return expected_status(err.code) is True
        else:
            return err.code in variadic(expected_status)

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
        """
        Return the response handle.

        See _download_webpage docstring for arguments specification.
        """
        if not self._downloader._first_webpage_request:
            sleep_interval = float_or_none(self.get_param('sleep_interval_requests')) or 0
            if sleep_interval > 0:
                self.to_screen('Sleeping %s seconds ...' % sleep_interval)
                time.sleep(sleep_interval)
        else:
            self._downloader._first_webpage_request = False

        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen('%s' % (note,))
            else:
                self.to_screen('%s: %s' % (video_id, note))

        # Some sites check X-Forwarded-For HTTP header in order to figure out
        # the origin of the client behind proxy. This allows bypassing geo
        # restriction by faking this header's value to IP that belongs to some
        # geo unrestricted country. We will do so once we encounter any
        # geo restriction error.
        if self._x_forwarded_for_ip:
            if 'X-Forwarded-For' not in headers:
                headers['X-Forwarded-For'] = self._x_forwarded_for_ip

        if isinstance(url_or_request, compat_urllib_request.Request):
            url_or_request = update_Request(
                url_or_request, data=data, headers=headers, query=query)
        else:
            if query:
                url_or_request = update_url_query(url_or_request, query)
            if data is not None or headers:
                url_or_request = sanitized_Request(url_or_request, data, headers)
        try:
            return self._downloader.urlopen(url_or_request)
        except network_exceptions as err:
            if isinstance(err, compat_urllib_error.HTTPError):
                if self.__can_accept_status_code(err, expected_status):
                    # Retain reference to error to prevent file object from
                    # being closed before it can be read. Works around the
                    # effects of <https://bugs.python.org/issue15002>
                    # introduced in Python 3.4.1.
                    err.fp._error = err
                    return err.fp

            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
        """
        Return a tuple (page content as string, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
        if urlh is False:
            assert not fatal
            return False
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
        return (content, urlh)

    @staticmethod
    def _guess_encoding_from_content(content_type, webpage_bytes):
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'

        return encoding

    def __check_blocked(self, content):
        first_block = content[:512]
        if ('<title>Access to this site is blocked</title>' in content
                and 'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
                and 'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)

    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self.get_param('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self.get_param('write_pages', False):
            basen = '%s_%s' % (video_id, urlh.geturl())
            if len(basen) > 240:
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content

    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=5, encoding=None, data=None,
            headers={}, query={}, expected_status=None):
        """
        Return the data of the page as a string.

        Arguments:
        url_or_request -- plain text URL as a string or
            a compat_urllib_request.Request object
        video_id -- Video/playlist/item identifier (string)

        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractorError to be raised,
            otherwise a warning will be reported and extraction continued
        tries -- number of tries
        timeout -- sleep interval between tries
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows accepting failed HTTP requests (non 2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                  accept
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx)
            which are always accepted.
        """

        success = False
        try_count = 0
        while success is False:
            try:
                res = self._download_webpage_handle(
                    url_or_request, video_id, note, errnote, fatal,
                    encoding=encoding, data=data, headers=headers, query=query,
                    expected_status=expected_status)
                success = True
            except compat_http_client.IncompleteRead as e:
                try_count += 1
                if try_count >= tries:
                    raise e
                self._sleep(timeout, video_id)
        if res is False:
            return res
        else:
            content, _ = res
            return content

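    # A sketch of how expected_status can be used (URL and status codes are
    # illustrative): accept an error page so it can be parsed for the failure
    # reason instead of aborting extraction:
    #
    #     webpage = self._download_webpage(
    #         'https://example.com/video/123', video_id,
    #         expected_status=404)
    #     # or, accepting any 4xx status via a callable:
    #     webpage = self._download_webpage(
    #         'https://example.com/video/123', video_id,
    #         expected_status=lambda status: 400 <= status < 500)
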
    def _download_xml_handle(
            self, url_or_request, video_id, note='Downloading XML',
            errnote='Unable to download XML', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (xml as a compat_etree_Element, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        xml_string, urlh = res
        return self._parse_xml(
            xml_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_xml(
            self, url_or_request, video_id,
            note='Downloading XML', errnote='Unable to download XML',
            transform_source=None, fatal=True, encoding=None,
            data=None, headers={}, query={}, expected_status=None):
        """
        Return the xml as a compat_etree_Element.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_xml_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            xml_string = transform_source(xml_string)
        try:
            return compat_etree_fromstring(xml_string.encode('utf-8'))
        except compat_xml_parse_error as ve:
            errmsg = '%s: Failed to parse XML ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

    def _download_json_handle(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (JSON object, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        json_string, urlh = res
        return self._parse_json(
            json_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_json(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return the JSON object as a dict.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_json_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

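    # Typical usage sketch (endpoint, parameters and header values are
    # hypothetical): fetch an API response as a dict:
    #
    #     data = self._download_json(
    #         'https://example.com/api/video', video_id,
    #         query={'id': video_id},
    #         headers={'Referer': url},
    #         fatal=False)  # a failed download returns False instead of raising
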
    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            json_string = transform_source(json_string)
        try:
            return json.loads(json_string)
        except ValueError as ve:
            errmsg = '%s: Failed to parse JSON ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

    def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
        return self._parse_json(
            data[data.find('{'):data.rfind('}') + 1],
            video_id, transform_source, fatal)

    def _download_socket_json_handle(
            self, url_or_request, video_id, note='Polling socket',
            errnote='Unable to poll socket', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (JSON object, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        webpage, urlh = res
        return self._parse_socket_response_as_json(
            webpage, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_socket_json(
            self, url_or_request, video_id, note='Polling socket',
            errnote='Unable to poll socket', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return the JSON object as a dict.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_socket_json_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def report_warning(self, msg, video_id=None, *args, **kwargs):
        idstr = '' if video_id is None else '%s: ' % video_id
        self._downloader.report_warning(
            '[%s] %s%s' % (self.IE_NAME, idstr, msg), *args, **kwargs)

    def to_screen(self, msg, *args, **kwargs):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)

    def write_debug(self, msg, *args, **kwargs):
        self._downloader.write_debug('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)

    def get_param(self, name, default=None, *args, **kwargs):
        if self._downloader:
            return self._downloader.params.get(name, default, *args, **kwargs)
        return default

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen('%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen('%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')

    def raise_login_required(
            self, msg='This video is only available for registered users',
            metadata_available=False, method='any'):
        if metadata_available and self.get_param('ignore_no_formats_error'):
            self.report_warning(msg)
        if method is not None:
            msg = '%s. %s' % (msg, self._LOGIN_HINTS[method])
        raise ExtractorError(msg, expected=True)

    def raise_geo_restricted(
            self, msg='This video is not available from your location due to geo restriction',
            countries=None, metadata_available=False):
        if metadata_available and self.get_param('ignore_no_formats_error'):
            self.report_warning(msg)
        else:
            raise GeoRestrictedError(msg, countries=countries)

    def raise_no_formats(self, msg, expected=False, video_id=None):
        if expected and self.get_param('ignore_no_formats_error'):
            self.report_warning(msg, video_id)
        else:
            raise ExtractorError(msg, expected=expected, video_id=video_id)

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None, video_title=None):
        """Returns a URL that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        if video_title is not None:
            video_info['title'] = video_title
        return video_info

    def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
        urls = orderedSet(
            self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
            for m in matches)
        return self.playlist_result(
            urls, playlist_id=playlist_id, playlist_title=playlist_title)

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        video_info.update(kwargs)
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        if playlist_description is not None:
            video_info['description'] = playlist_description
        return video_info

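    # Sketch of the two result helpers above (IDs, URLs and the ie key are
    # made up):
    #
    #     entries = [self.url_result('https://example.com/watch/%s' % vid, ie='Example')
    #                for vid in video_ids]
    #     return self.playlist_result(entries, playlist_id, 'Some playlist title')
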
    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single pattern or
        a list of patterns, returning the first matching group.
        In case of failure return a default value, or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        if not self.get_param('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
            _name = '\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            if group is None:
                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
            elif isinstance(group, (list, tuple)):
                return tuple(mobj.group(g) for g in group)
            else:
                return mobj.group(group)
        elif default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract %s' % _name)
        else:
            self.report_warning('unable to extract %s' % _name + bug_reports_message())
            return None

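    # Usage sketch (patterns and the webpage variable are illustrative):
    #
    #     title = self._search_regex(
    #         r'<h1 class="title">([^<]+)</h1>', webpage, 'title',
    #         default=None)  # non-fatal: returns None when nothing matches
    #     # a list of alternative patterns is tried in order:
    #     video_id = self._search_regex(
    #         (r'data-video-id="(\d+)"', r'videoId:\s*(\d+)'),
    #         webpage, 'video id')
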
    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_netrc_login_info(self, netrc_machine=None):
        username = None
        password = None
        netrc_machine = netrc_machine or self._NETRC_MACHINE

        if self.get_param('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(netrc_machine)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError(
                        'No authenticators for %s' % netrc_machine)
            except (IOError, netrc.NetrcParseError) as err:
                self.report_warning(
                    'parsing .netrc: %s' % error_to_compat_str(err))

        return username, password

    def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
        """
        Get the login info as (username, password)
        First look for the manually specified credentials using username_option
        and password_option as keys in params dictionary. If no such credentials
        available look in the netrc file using the netrc_machine or _NETRC_MACHINE
        value.
        If there's no info available, return (None, None)
        """

        # Attempt to use provided username and password or .netrc data
        username = self.get_param(username_option)
        if username is not None:
            password = self.get_param(password_option)
        else:
            username, password = self._get_netrc_login_info(netrc_machine)

        return username, password

    def _get_tfa_info(self, note='two-factor verification code'):
        """
        Get the two-factor authentication info
        TODO - asking the user will be required for sms/phone verify
        currently just uses the command line option
        If there's no info available, return None
        """

        tfa = self.get_param('twofactor')
        if tfa is not None:
            return tfa

        return compat_getpass('Type %s and press [Return]: ' % note)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
        property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
                       % {'prop': re.escape(prop)})
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    @staticmethod
    def _meta_regex(prop):
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        prop = variadic(prop)
        if name is None:
            name = 'OpenGraph %s' % prop[0]
        og_regexes = []
        for p in prop:
            og_regexes.extend(self._og_regexes(p))
        escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _og_search_url(self, html, **kargs):
        return self._og_search_property('url', html, **kargs)

    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        name = variadic(name)
        if display_name is None:
            display_name = name[0]
        return self._html_search_regex(
            [self._meta_regex(n) for n in name],
            html, display_name, fatal=fatal, group='content', **kwargs)

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower())

    def _family_friendly_search(self, html):
        # See http://schema.org/VideoObject
        family_friendly = self._html_search_meta(
            'isFamilyFriendly', html, default=None)

        if not family_friendly:
            return None

        RATING_TABLE = {
            '1': 0,
            'true': 0,
            '0': 18,
            'false': 18,
        }
        return RATING_TABLE.get(family_friendly.lower())

    def _twitter_search_player(self, html):
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')

    def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
        json_ld_list = list(re.finditer(JSON_LD_RE, html))
        default = kwargs.get('default', NO_DEFAULT)
        # JSON-LD may be malformed and thus `fatal` should be respected.
        # At the same time `default` may be passed that assumes `fatal=False`
        # for _search_regex. Let's simulate the same behavior here as well.
        fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
        json_ld = []
        for mobj in json_ld_list:
            json_ld_item = self._parse_json(
                mobj.group('json_ld'), video_id, fatal=fatal)
            if not json_ld_item:
                continue
            if isinstance(json_ld_item, dict):
                json_ld.append(json_ld_item)
            elif isinstance(json_ld_item, (list, tuple)):
                json_ld.extend(json_ld_item)
        if json_ld:
            json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
        if json_ld:
            return json_ld
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract JSON-LD')
        else:
            self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
            return {}

    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            json_ld = [json_ld]

        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def extract_interaction_type(e):
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            interaction_statistic = e.get('interactionStatistic')
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not isinstance(is_e, dict):
                    continue
                if is_e.get('@type') != 'InteractionCounter':
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For interaction count some sites provide string instead of
                # an integer (as per spec) with non digit characters (e.g. ",")
                # so extracting count with more relaxed str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_video_object(e):
            assert e['@type'] == 'VideoObject'
            author = e.get('author')
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of 'Organization' or 'Person' types.
                # both types can have 'name' property (inherited from 'Thing' type). [1]
                # however some websites are using 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
                'filesize': float_or_none(e.get('contentSize')),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })
            extract_interaction_statistic(e)

        for e in json_ld:
            if '@context' in e:
                item_type = e.get('@type')
                if expected_type is not None and expected_type != item_type:
                    continue
                if item_type in ('TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type == 'Movie':
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif item_type in ('Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody')),
                    })
                elif item_type == 'VideoObject':
                    extract_video_object(e)
                    if expected_type is None:
                        continue
                    else:
                        break
                video = e.get('video')
                if isinstance(video, dict) and video.get('@type') == 'VideoObject':
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break
        return dict((k, v) for k, v in info.items() if v is not None)

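    # A sketch of the kind of markup _search_json_ld() consumes from a page
    # (the snippet is illustrative, not from any particular site):
    #
    #     <script type="application/ld+json">
    #     {"@context": "https://schema.org",
    #      "@type": "VideoObject",
    #      "name": "Example title",
    #      "uploadDate": "2021-01-01T00:00:00Z",
    #      "duration": "PT1M30S"}
    #     </script>
    #
    # An extractor would typically call:
    #
    #     info = self._search_json_ld(webpage, video_id, expected_type='VideoObject', default={})
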
1454 @staticmethod
1455 def _hidden_inputs(html):
1456 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1457 hidden_inputs = {}
1458 for input in re.findall(r'(?i)(<input[^>]+>)', html):
1459 attrs = extract_attributes(input)
1460 if not input:
1461 continue
1462 if attrs.get('type') not in ('hidden', 'submit'):
1463 continue
1464 name = attrs.get('name') or attrs.get('id')
1465 value = attrs.get('value')
1466 if name and value is not None:
1467 hidden_inputs[name] = value
1468 return hidden_inputs
1469
1470 def _form_hidden_inputs(self, form_id, html):
1471 form = self._search_regex(
1472 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1473 html, '%s form' % form_id, group='form')
1474 return self._hidden_inputs(form)
1475
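# Illustrative usage sketch (not part of the original file), e.g. from a
# hypothetical extractor's login routine; 'login_form', the field names and
# the URL are assumptions (urlencode_postdata is from yt_dlp.utils):
#   data = self._form_hidden_inputs('login_form', webpage)
#   data.update({'username': username, 'password': password})
#   self._download_webpage(
#       'https://example.com/login', None, data=urlencode_postdata(data))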
1476 class FormatSort:
1477 regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
1478
1479 default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
1480 'res', 'fps', 'codec:vp9.2', 'size', 'br', 'asr',
1481 'proto', 'ext', 'hasaud', 'source', 'format_id') # These must not be aliases
1482 ytdl_default = ('hasaud', 'quality', 'tbr', 'filesize', 'vbr',
1483 'height', 'width', 'proto', 'vext', 'abr', 'aext',
1484 'fps', 'fs_approx', 'source', 'format_id')
1485
1486 settings = {
1487 'vcodec': {'type': 'ordered', 'regex': True,
1488 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1489 'acodec': {'type': 'ordered', 'regex': True,
1490 'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']},
1491 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1492 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', '.*dash', 'ws|websocket', '', 'mms|rtsp', 'none', 'f4']},
1493 'vext': {'type': 'ordered', 'field': 'video_ext',
1494 'order': ('mp4', 'webm', 'flv', '', 'none'),
1495 'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1496 'aext': {'type': 'ordered', 'field': 'audio_ext',
1497 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1498 'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1499 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1500 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
1501 'field': ('vcodec', 'acodec'),
1502 'function': lambda it: int(any(v != 'none' for v in it))},
1503 'ie_pref': {'priority': True, 'type': 'extractor'},
1504 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1505 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1506 'lang': {'priority': True, 'convert': 'ignore', 'field': 'language_preference'},
1507 'quality': {'convert': 'float_none', 'default': -1},
1508 'filesize': {'convert': 'bytes'},
1509 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1510 'id': {'convert': 'string', 'field': 'format_id'},
1511 'height': {'convert': 'float_none'},
1512 'width': {'convert': 'float_none'},
1513 'fps': {'convert': 'float_none'},
1514 'tbr': {'convert': 'float_none'},
1515 'vbr': {'convert': 'float_none'},
1516 'abr': {'convert': 'float_none'},
1517 'asr': {'convert': 'float_none'},
1518 'source': {'convert': 'ignore', 'field': 'source_preference'},
1519
1520 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1521 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1522 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1523 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1524 'res': {'type': 'multiple', 'field': ('height', 'width'),
1525 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
1526
1527 # Most of these exist only for compatibility reasons
1528 'dimension': {'type': 'alias', 'field': 'res'},
1529 'resolution': {'type': 'alias', 'field': 'res'},
1530 'extension': {'type': 'alias', 'field': 'ext'},
1531 'bitrate': {'type': 'alias', 'field': 'br'},
1532 'total_bitrate': {'type': 'alias', 'field': 'tbr'},
1533 'video_bitrate': {'type': 'alias', 'field': 'vbr'},
1534 'audio_bitrate': {'type': 'alias', 'field': 'abr'},
1535 'framerate': {'type': 'alias', 'field': 'fps'},
1536 'language_preference': {'type': 'alias', 'field': 'lang'}, # not named 'language' because a field of that name already exists
1537 'protocol': {'type': 'alias', 'field': 'proto'},
1538 'source_preference': {'type': 'alias', 'field': 'source'},
1539 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1540 'filesize_estimate': {'type': 'alias', 'field': 'size'},
1541 'samplerate': {'type': 'alias', 'field': 'asr'},
1542 'video_ext': {'type': 'alias', 'field': 'vext'},
1543 'audio_ext': {'type': 'alias', 'field': 'aext'},
1544 'video_codec': {'type': 'alias', 'field': 'vcodec'},
1545 'audio_codec': {'type': 'alias', 'field': 'acodec'},
1546 'video': {'type': 'alias', 'field': 'hasvid'},
1547 'has_video': {'type': 'alias', 'field': 'hasvid'},
1548 'audio': {'type': 'alias', 'field': 'hasaud'},
1549 'has_audio': {'type': 'alias', 'field': 'hasaud'},
1550 'extractor': {'type': 'alias', 'field': 'ie_pref'},
1551 'preference': {'type': 'alias', 'field': 'ie_pref'},
1552 'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
1553 'format_id': {'type': 'alias', 'field': 'id'},
1554 }
1555
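# Illustrative note (not part of the original file): a sort token such as
# '+res:480' is matched by the regex above as reverse='+', field='res',
# separator=':' and limit='480', while a '~' separator requests the value
# closest to the limit (e.g. 'filesize~100M'); alias names like 'framerate'
# are first resolved to their canonical field ('fps') via the table above.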
1556 _order = []
1557
1558 def _get_field_setting(self, field, key):
1559 if field not in self.settings:
1560 self.settings[field] = {}
1561 propObj = self.settings[field]
1562 if key not in propObj:
1563 type = propObj.get('type')
1564 if key == 'field':
1565 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1566 elif key == 'convert':
1567 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1568 else:
1569 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1570 propObj[key] = default
1571 return propObj[key]
1572
1573 def _resolve_field_value(self, field, value, convertNone=False):
1574 if value is None:
1575 if not convertNone:
1576 return None
1577 else:
1578 value = value.lower()
1579 conversion = self._get_field_setting(field, 'convert')
1580 if conversion == 'ignore':
1581 return None
1582 if conversion == 'string':
1583 return value
1584 elif conversion == 'float_none':
1585 return float_or_none(value)
1586 elif conversion == 'bytes':
1587 return FileDownloader.parse_bytes(value)
1588 elif conversion == 'order':
1589 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1590 use_regex = self._get_field_setting(field, 'regex')
1591 list_length = len(order_list)
1592 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1593 if use_regex and value is not None:
1594 for i, regex in enumerate(order_list):
1595 if regex and re.match(regex, value):
1596 return list_length - i
1597 return list_length - empty_pos # not in list
1598 else: # not regex, or value is None
1599 return list_length - (order_list.index(value) if value in order_list else empty_pos)
1600 else:
1601 if value.isnumeric():
1602 return float(value)
1603 else:
1604 self.settings[field]['convert'] = 'string'
1605 return value
1606
1607 def evaluate_params(self, params, sort_extractor):
1608 self._use_free_order = params.get('prefer_free_formats', False)
1609 self._sort_user = params.get('format_sort', [])
1610 self._sort_extractor = sort_extractor
1611
1612 def add_item(field, reverse, closest, limit_text):
1613 field = field.lower()
1614 if field in self._order:
1615 return
1616 self._order.append(field)
1617 limit = self._resolve_field_value(field, limit_text)
1618 data = {
1619 'reverse': reverse,
1620 'closest': False if limit is None else closest,
1621 'limit_text': limit_text,
1622 'limit': limit}
1623 if field in self.settings:
1624 self.settings[field].update(data)
1625 else:
1626 self.settings[field] = data
1627
1628 sort_list = (
1629 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1630 + (tuple() if params.get('format_sort_force', False)
1631 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1632 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1633
1634 for item in sort_list:
1635 match = re.match(self.regex, item)
1636 if match is None:
1637 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1638 field = match.group('field')
1639 if field is None:
1640 continue
1641 if self._get_field_setting(field, 'type') == 'alias':
1642 field = self._get_field_setting(field, 'field')
1643 reverse = match.group('reverse') is not None
1644 closest = match.group('separator') == '~'
1645 limit_text = match.group('limit')
1646
1647 has_limit = limit_text is not None
1648 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1649 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1650
1651 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1652 limits = limit_text.split(":") if has_multiple_limits else (limit_text,) if has_limit else tuple()
1653 limit_count = len(limits)
1654 for (i, f) in enumerate(fields):
1655 add_item(f, reverse, closest,
1656 limits[i] if i < limit_count
1657 else limits[0] if has_limit and not has_multiple_limits
1658 else None)
1659
1660 def print_verbose_info(self, write_debug):
1661 if self._sort_user:
1662 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1663 if self._sort_extractor:
1664 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1665 write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1666 '+' if self._get_field_setting(field, 'reverse') else '', field,
1667 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1668 self._get_field_setting(field, 'limit_text'),
1669 self._get_field_setting(field, 'limit'))
1670 if self._get_field_setting(field, 'limit_text') is not None else '')
1671 for field in self._order if self._get_field_setting(field, 'visible')]))
1672
1673 def _calculate_field_preference_from_value(self, format, field, type, value):
1674 reverse = self._get_field_setting(field, 'reverse')
1675 closest = self._get_field_setting(field, 'closest')
1676 limit = self._get_field_setting(field, 'limit')
1677
1678 if type == 'extractor':
1679 maximum = self._get_field_setting(field, 'max')
1680 if value is None or (maximum is not None and value >= maximum):
1681 value = -1
1682 elif type == 'boolean':
1683 in_list = self._get_field_setting(field, 'in_list')
1684 not_in_list = self._get_field_setting(field, 'not_in_list')
1685 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1686 elif type == 'ordered':
1687 value = self._resolve_field_value(field, value, True)
1688
1689 # try to convert to number
1690 val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1691 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1692 if is_num:
1693 value = val_num
1694
1695 return ((-10, 0) if value is None
1696 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
1697 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1698 else (0, value, 0) if not reverse and (limit is None or value <= limit)
1699 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1700 else (-1, value, 0))
1701
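# Illustrative note (not part of the original file): with a limit and no
# reverse flag (e.g. 'res:720'), a format at or below the limit yields
# (0, value, 0) while one above it yields (0, -value, 0); since larger
# tuples rank as better (formats are sorted worst to best), 720 beats 480,
# and 480 in turn beats 1080. With a '~' (closest) limit, the
# -abs(value - limit) component instead ranks candidates by their distance
# from the limit.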
1702 def _calculate_field_preference(self, format, field):
1703 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
1704 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1705 if type == 'multiple':
1706 type = 'field' # Only 'field' is allowed in multiple for now
1707 actual_fields = self._get_field_setting(field, 'field')
1708
1709 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1710 else:
1711 value = get_value(field)
1712 return self._calculate_field_preference_from_value(format, field, type, value)
1713
1714 def calculate_preference(self, format):
1715 # Determine missing protocol
1716 if not format.get('protocol'):
1717 format['protocol'] = determine_protocol(format)
1718
1719 # Determine missing ext
1720 if not format.get('ext') and 'url' in format:
1721 format['ext'] = determine_ext(format['url'])
1722 if format.get('vcodec') == 'none':
1723 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1724 format['video_ext'] = 'none'
1725 else:
1726 format['video_ext'] = format['ext']
1727 format['audio_ext'] = 'none'
1728 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
1729 # format['preference'] = -1000
1730
1731 # Determine missing bitrates
1732 if format.get('tbr') is None:
1733 if format.get('vbr') is not None and format.get('abr') is not None:
1734 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1735 else:
1736 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1737 format['vbr'] = format.get('tbr') - (format.get('abr') or 0)
1738 if format.get('acodec') != 'none' and format.get('abr') is None:
1739 format['abr'] = format.get('tbr') - (format.get('vbr') or 0)
1740
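# Illustrative example (not part of the original file): a format reporting
# vbr=1200 and abr=128 gets tbr derived as 1200 + 128 = 1328; conversely, a
# format with tbr=1328 and abr=128 gets vbr recovered as 1328 - 128 = 1200.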
1741 return tuple(self._calculate_field_preference(format, field) for field in self._order)
1742
1743 def _sort_formats(self, formats, field_preference=()):
1744 if not formats:
1745 if self.get_param('ignore_no_formats_error'):
1746 return
1747 raise ExtractorError('No video formats found')
1748 format_sort = self.FormatSort() # params and to_screen are taken from the downloader
1749 format_sort.evaluate_params(self._downloader.params, field_preference)
1750 if self.get_param('verbose', False):
1751 format_sort.print_verbose_info(self._downloader.write_debug)
1752 formats.sort(key=lambda f: format_sort.calculate_preference(f))
1753
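# Illustrative usage sketch (not part of the original file): extractors
# typically call this right before returning the info dict, optionally
# biasing the order for the particular site, e.g.
#   self._sort_formats(formats, field_preference=('res', 'tbr', 'format_id'))
# Fields passed this way take precedence over the defaults, but not over a
# user-supplied --format-sort (see evaluate_params above).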
1754 def _check_formats(self, formats, video_id):
1755 if formats:
1756 formats[:] = filter(
1757 lambda f: self._is_valid_url(
1758 f['url'], video_id,
1759 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1760 formats)
1761
1762 @staticmethod
1763 def _remove_duplicate_formats(formats):
1764 format_urls = set()
1765 unique_formats = []
1766 for f in formats:
1767 if f['url'] not in format_urls:
1768 format_urls.add(f['url'])
1769 unique_formats.append(f)
1770 formats[:] = unique_formats
1771
1772 def _is_valid_url(self, url, video_id, item='video', headers={}):
1773 url = self._proto_relative_url(url, scheme='http:')
1774 # For now, assume non-HTTP(S) URLs are always valid
1775 if not (url.startswith('http://') or url.startswith('https://')):
1776 return True
1777 try:
1778 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1779 return True
1780 except ExtractorError as e:
1781 self.to_screen(
1782 '%s: %s URL is invalid, skipping: %s'
1783 % (video_id, item, error_to_compat_str(e.cause)))
1784 return False
1785
1786 def http_scheme(self):
1787 """ Either "http:" or "https:", depending on the user's preferences """
1788 return (
1789 'http:'
1790 if self.get_param('prefer_insecure', False)
1791 else 'https:')
1792
1793 def _proto_relative_url(self, url, scheme=None):
1794 if url is None:
1795 return url
1796 if url.startswith('//'):
1797 if scheme is None:
1798 scheme = self.http_scheme()
1799 return scheme + url
1800 else:
1801 return url
1802
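# Illustrative example (not part of the original file):
#   self._proto_relative_url('//example.com/video.mp4')
# returns 'https://example.com/video.mp4' (or 'http://...' if the user
# passed --prefer-insecure); None and already-absolute URLs are returned
# unchanged.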
1803 def _sleep(self, timeout, video_id, msg_template=None):
1804 if msg_template is None:
1805 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1806 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1807 self.to_screen(msg)
1808 time.sleep(timeout)
1809
1810 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1811 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1812 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1813 manifest = self._download_xml(
1814 manifest_url, video_id, 'Downloading f4m manifest',
1815 'Unable to download f4m manifest',
1816 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1817 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1818 transform_source=transform_source,
1819 fatal=fatal, data=data, headers=headers, query=query)
1820
1821 if manifest is False:
1822 return []
1823
1824 return self._parse_f4m_formats(
1825 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1826 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1827
1828 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1829 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1830 fatal=True, m3u8_id=None):
1831 if not isinstance(manifest, compat_etree_Element) and not fatal:
1832 return []
1833
1834 # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1835 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1836 if akamai_pv is not None and ';' in akamai_pv.text:
1837 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1838 if playerVerificationChallenge.strip() != '':
1839 return []
1840
1841 formats = []
1842 manifest_version = '1.0'
1843 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1844 if not media_nodes:
1845 manifest_version = '2.0'
1846 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1847 # Remove unsupported DRM protected media from final formats
1848 # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1849 media_nodes = remove_encrypted_media(media_nodes)
1850 if not media_nodes:
1851 return formats
1852
1853 manifest_base_url = get_base_url(manifest)
1854
1855 bootstrap_info = xpath_element(
1856 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1857 'bootstrap info', default=None)
1858
1859 vcodec = None
1860 mime_type = xpath_text(
1861 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1862 'mime type', default=None)
1863 if mime_type and mime_type.startswith('audio/'):
1864 vcodec = 'none'
1865
1866 for i, media_el in enumerate(media_nodes):
1867 tbr = int_or_none(media_el.attrib.get('bitrate'))
1868 width = int_or_none(media_el.attrib.get('width'))
1869 height = int_or_none(media_el.attrib.get('height'))
1870 format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1871 # If <bootstrapInfo> is present, the specified f4m is a
1872 # stream-level manifest; only set-level manifests may refer to
1873 # external resources. See sections 11.4 and 4 of the F4M spec.
1874 if bootstrap_info is None:
1875 media_url = None
1876 # @href is introduced in 2.0, see section 11.6 of F4M spec
1877 if manifest_version == '2.0':
1878 media_url = media_el.attrib.get('href')
1879 if media_url is None:
1880 media_url = media_el.attrib.get('url')
1881 if not media_url:
1882 continue
1883 manifest_url = (
1884 media_url if media_url.startswith('http://') or media_url.startswith('https://')
1885 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1886 # If media_url is itself an f4m manifest, do the recursive extraction,
1887 # since bitrates in the parent manifest (this one) and the media_url
1888 # manifest may differ, making it impossible to resolve the format by
1889 # the requested bitrate in the f4m downloader
1890 ext = determine_ext(manifest_url)
1891 if ext == 'f4m':
1892 f4m_formats = self._extract_f4m_formats(
1893 manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1894 transform_source=transform_source, fatal=fatal)
1895 # Sometimes a stream-level manifest contains a single media entry that
1896 # lacks any quality metadata (e.g. http://matchtv.ru/#live-player).
1897 # At the same time, the parent's media entry in the set-level manifest may
1898 # contain it, so we copy it from the parent in such cases.
1899 if len(f4m_formats) == 1:
1900 f = f4m_formats[0]
1901 f.update({
1902 'tbr': f.get('tbr') or tbr,
1903 'width': f.get('width') or width,
1904 'height': f.get('height') or height,
1905 'format_id': f.get('format_id') if not tbr else format_id,
1906 'vcodec': vcodec,
1907 })
1908 formats.extend(f4m_formats)
1909 continue
1910 elif ext == 'm3u8':
1911 formats.extend(self._extract_m3u8_formats(
1912 manifest_url, video_id, 'mp4', preference=preference,
1913 quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1914 continue
1915 formats.append({
1916 'format_id': format_id,
1917 'url': manifest_url,
1918 'manifest_url': manifest_url,
1919 'ext': 'flv' if bootstrap_info is not None else None,
1920 'protocol': 'f4m',
1921 'tbr': tbr,
1922 'width': width,
1923 'height': height,
1924 'vcodec': vcodec,
1925 'preference': preference,
1926 'quality': quality,
1927 })
1928 return formats
1929
1930 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1931 return {
1932 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1933 'url': m3u8_url,
1934 'ext': ext,
1935 'protocol': 'm3u8',
1936 'preference': preference - 100 if preference else -100,
1937 'quality': quality,
1938 'resolution': 'multiple',
1939 'format_note': 'Quality selection URL',
1940 }
1941
1942 def _extract_m3u8_formats(self, *args, **kwargs):
1943 fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
1944 if subs:
1945 self.report_warning(bug_reports_message(
1946 "Ignoring subtitle tracks found in the HLS manifest; "
1947 "if any subtitle tracks are missing,"
1948 ))
1949 return fmts
1950
1951 def _extract_m3u8_formats_and_subtitles(
1952 self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
1953 preference=None, quality=None, m3u8_id=None, note=None,
1954 errnote=None, fatal=True, live=False, data=None, headers={},
1955 query={}):
1956
1957 res = self._download_webpage_handle(
1958 m3u8_url, video_id,
1959 note='Downloading m3u8 information' if note is None else note,
1960 errnote='Failed to download m3u8 information' if errnote is None else errnote,
1961 fatal=fatal, data=data, headers=headers, query=query)
1962
1963 if res is False:
1964 return [], {}
1965
1966 m3u8_doc, urlh = res
1967 m3u8_url = urlh.geturl()
1968
1969 return self._parse_m3u8_formats_and_subtitles(
1970 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1971 preference=preference, quality=quality, m3u8_id=m3u8_id,
1972 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
1973 headers=headers, query=query, video_id=video_id)
1974
1975 def _parse_m3u8_formats_and_subtitles(
1976 self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8_native',
1977 preference=None, quality=None, m3u8_id=None, live=False, note=None,
1978 errnote=None, fatal=True, data=None, headers={}, query={},
1979 video_id=None):
1980 formats, subtitles = [], {}
1981
1982 if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
1983 return formats, subtitles
1984
1985 if (not self.get_param('allow_unplayable_formats')
1986 and re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc)): # Apple FairPlay
1987 return formats, subtitles
1988
1989 def format_url(url):
1990 return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
1991
1992 if self.get_param('hls_split_discontinuity', False):
1993 def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
1994 if not m3u8_doc:
1995 if not manifest_url:
1996 return []
1997 m3u8_doc = self._download_webpage(
1998 manifest_url, video_id, fatal=fatal, data=data, headers=headers,
1999 note=False, errnote='Failed to download m3u8 playlist information')
2000 if m3u8_doc is False:
2001 return []
2002 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2003
2004 else:
2005 def _extract_m3u8_playlist_indices(*args, **kwargs):
2006 return [None]
2007
2008 # References:
2009 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2010 # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2011 # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2012
2013 # We should try extracting formats only from master playlists [1, 4.3.4],
2014 # i.e. playlists that describe the available qualities. On the other hand,
2015 # media playlists [1, 4.3.3] should be returned as is, since they contain
2016 # just the media, without quality renditions.
2017 # Fortunately, a master playlist can easily be distinguished from a media
2018 # playlist by the availability of particular tags. As per [1, 4.3.3, 4.3.4],
2019 # master playlist tags MUST NOT appear in a media playlist and vice versa.
2020 # As per [1, 4.3.3.1], the #EXT-X-TARGETDURATION tag is REQUIRED in every
2021 # media playlist and MUST NOT appear in a master playlist, so we can
2022 # reliably detect media playlists with this criterion.
2023
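# Illustrative snippets (not part of the original file). A media playlist
# carries the REQUIRED target-duration tag:
#   #EXTM3U
#   #EXT-X-TARGETDURATION:10
#   #EXTINF:9.009,
#   segment0.ts
# while a master playlist lists variant streams instead:
#   #EXTM3U
#   #EXT-X-STREAM-INF:BANDWIDTH=1280000,RESOLUTION=640x360
#   low/index.m3u8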
2024 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
2025 formats = [{
2026 'format_id': '-'.join(map(str, filter(None, [m3u8_id, idx]))),
2027 'format_index': idx,
2028 'url': m3u8_url,
2029 'ext': ext,
2030 'protocol': entry_protocol,
2031 'preference': preference,
2032 'quality': quality,
2033 } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2034
2035 return formats, subtitles
2036
2037 groups = {}
2038 last_stream_inf = {}
2039
2040 def extract_media(x_media_line):
2041 media = parse_m3u8_attributes(x_media_line)
2042 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2043 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2044 if not (media_type and group_id and name):
2045 return
2046 groups.setdefault(group_id, []).append(media)
2047 # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2048 if media_type == 'SUBTITLES':
2049 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2050 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2051 # However, lack of URI has been spotted in the wild.
2052 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2053 if not media.get('URI'):
2054 return
2055 url = format_url(media['URI'])
2056 sub_info = {
2057 'url': url,
2058 'ext': determine_ext(url),
2059 }
2060 if sub_info['ext'] == 'm3u8':
2061 # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2062 # files may contain is WebVTT:
2063 # <https://tools.ietf.org/html/rfc8216#section-3.1>
2064 sub_info['ext'] = 'vtt'
2065 sub_info['protocol'] = 'm3u8_native'
2066 lang = media.get('LANGUAGE') or 'und'
2067 subtitles.setdefault(lang, []).append(sub_info)
2068 if media_type not in ('VIDEO', 'AUDIO'):
2069 return
2070 media_url = media.get('URI')
2071 if media_url:
2072 manifest_url = format_url(media_url)
2073 formats.extend({
2074 'format_id': '-'.join(map(str, filter(None, (m3u8_id, group_id, name, idx)))),
2075 'format_note': name,
2076 'format_index': idx,
2077 'url': manifest_url,
2078 'manifest_url': m3u8_url,
2079 'language': media.get('LANGUAGE'),
2080 'ext': ext,
2081 'protocol': entry_protocol,
2082 'preference': preference,
2083 'quality': quality,
2084 'vcodec': 'none' if media_type == 'AUDIO' else None,
2085 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2086
2087 def build_stream_name():
2088 # Although the specification does not mention a NAME attribute for the
2089 # EXT-X-STREAM-INF tag, it may still be present in practice (see [1]
2090 # or the vidio test in TestInfoExtractor.test_parse_m3u8_formats).
2091 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2092 stream_name = last_stream_inf.get('NAME')
2093 if stream_name:
2094 return stream_name
2095 # If there is no NAME in EXT-X-STREAM-INF, it will be obtained
2096 # from the corresponding rendition group
2097 stream_group_id = last_stream_inf.get('VIDEO')
2098 if not stream_group_id:
2099 return
2100 stream_group = groups.get(stream_group_id)
2101 if not stream_group:
2102 return stream_group_id
2103 rendition = stream_group[0]
2104 return rendition.get('NAME') or stream_group_id
2105
2106 # Parse EXT-X-MEDIA tags before EXT-X-STREAM-INF so that video-only
2107 # formats can still be detected when EXT-X-STREAM-INF tags
2108 # precede EXT-X-MEDIA tags in the HLS manifest, as in [3].
2109 for line in m3u8_doc.splitlines():
2110 if line.startswith('#EXT-X-MEDIA:'):
2111 extract_media(line)
2112
2113 for line in m3u8_doc.splitlines():
2114 if line.startswith('#EXT-X-STREAM-INF:'):
2115 last_stream_inf = parse_m3u8_attributes(line)
2116 elif line.startswith('#') or not line.strip():
2117 continue
2118 else:
2119 tbr = float_or_none(
2120 last_stream_inf.get('AVERAGE-BANDWIDTH')
2121 or last_stream_inf.get('BANDWIDTH'), scale=1000)
2122 manifest_url = format_url(line.strip())
2123
2124 for idx in _extract_m3u8_playlist_indices(manifest_url):
2125 format_id = [m3u8_id, None, idx]
2126 # The bandwidth of live streams may differ over time, making
2127 # format_id unpredictable, so it's better to keep the provided
2128 # format_id intact.
2129 if not live:
2130 stream_name = build_stream_name()
2131 format_id[1] = stream_name if stream_name else '%d' % (tbr if tbr else len(formats))
2132 f = {
2133 'format_id': '-'.join(map(str, filter(None, format_id))),
2134 'format_index': idx,
2135 'url': manifest_url,
2136 'manifest_url': m3u8_url,
2137 'tbr': tbr,
2138 'ext': ext,
2139 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2140 'protocol': entry_protocol,
2141 'preference': preference,
2142 'quality': quality,
2143 }
2144 resolution = last_stream_inf.get('RESOLUTION')
2145 if resolution:
2146 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2147 if mobj:
2148 f['width'] = int(mobj.group('width'))
2149 f['height'] = int(mobj.group('height'))
2150 # Unified Streaming Platform
2151 mobj = re.search(
2152 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2153 if mobj:
2154 abr, vbr = mobj.groups()
2155 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2156 f.update({
2157 'vbr': vbr,
2158 'abr': abr,
2159 })
2160 codecs = parse_codecs(last_stream_inf.get('CODECS'))
2161 f.update(codecs)
2162 audio_group_id = last_stream_inf.get('AUDIO')
2163 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2164 # references a rendition group MUST have a CODECS attribute.
2165 # However, this is not always respected, for example, [2]
2166 # contains EXT-X-STREAM-INF tag which references AUDIO
2167 # rendition group but does not have CODECS and despite
2168 # referencing an audio group it represents a complete
2169 # (with audio and video) format. So, for such cases we will
2170 # ignore references to rendition groups and treat them
2171 # as complete formats.
2172 if audio_group_id and codecs and f.get('vcodec') != 'none':
2173 audio_group = groups.get(audio_group_id)
2174 if audio_group and audio_group[0].get('URI'):
2175 # TODO: update acodec for audio only formats with
2176 # the same GROUP-ID
2177 f['acodec'] = 'none'
2178 if not f.get('ext'):
2179 f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2180 formats.append(f)
2181
2182 # for DailyMotion
2183 progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2184 if progressive_uri:
2185 http_f = f.copy()
2186 del http_f['manifest_url']
2187 http_f.update({
2188 'format_id': f['format_id'].replace('hls-', 'http-'),
2189 'protocol': 'http',
2190 'url': progressive_uri,
2191 })
2192 formats.append(http_f)
2193
2194 last_stream_inf = {}
2195 return formats, subtitles
2196
2197 @staticmethod
2198 def _xpath_ns(path, namespace=None):
2199 if not namespace:
2200 return path
2201 out = []
2202 for c in path.split('/'):
2203 if not c or c == '.':
2204 out.append(c)
2205 else:
2206 out.append('{%s}%s' % (namespace, c))
2207 return '/'.join(out)
2208
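# Illustrative example (not part of the original file; the namespace value
# is arbitrary):
#   InfoExtractor._xpath_ns('./head/meta', 'http://www.w3.org/ns/SMIL')
# returns './{http://www.w3.org/ns/SMIL}head/{http://www.w3.org/ns/SMIL}meta'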
2209 def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2210 smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2211
2212 if smil is False:
2213 assert not fatal
2214 return []
2215
2216 namespace = self._parse_smil_namespace(smil)
2217
2218 fmts = self._parse_smil_formats(
2219 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2220 subs = self._parse_smil_subtitles(
2221 smil, namespace=namespace)
2222
2223 return fmts, subs
2224
2225 def _extract_smil_formats(self, *args, **kwargs):
2226 fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2227 if subs:
2228 self.report_warning(bug_reports_message(
2229 "Ignoring subtitle tracks found in the SMIL manifest; "
2230 "if any subtitle tracks are missing,"
2231 ))
2232 return fmts
2233
2234 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2235 smil = self._download_smil(smil_url, video_id, fatal=fatal)
2236 if smil is False:
2237 return {}
2238 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2239
2240 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2241 return self._download_xml(
2242 smil_url, video_id, 'Downloading SMIL file',
2243 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2244
2245 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2246 namespace = self._parse_smil_namespace(smil)
2247
2248 formats = self._parse_smil_formats(
2249 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2250 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2251
2252 video_id = os.path.splitext(url_basename(smil_url))[0]
2253 title = None
2254 description = None
2255 upload_date = None
2256 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2257 name = meta.attrib.get('name')
2258 content = meta.attrib.get('content')
2259 if not name or not content:
2260 continue
2261 if not title and name == 'title':
2262 title = content
2263 elif not description and name in ('description', 'abstract'):
2264 description = content
2265 elif not upload_date and name == 'date':
2266 upload_date = unified_strdate(content)
2267
2268 thumbnails = [{
2269 'id': image.get('type'),
2270 'url': image.get('src'),
2271 'width': int_or_none(image.get('width')),
2272 'height': int_or_none(image.get('height')),
2273 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2274
2275 return {
2276 'id': video_id,
2277 'title': title or video_id,
2278 'description': description,
2279 'upload_date': upload_date,
2280 'thumbnails': thumbnails,
2281 'formats': formats,
2282 'subtitles': subtitles,
2283 }
2284
2285 def _parse_smil_namespace(self, smil):
2286 return self._search_regex(
2287 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2288
2289 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2290 base = smil_url
2291 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2292 b = meta.get('base') or meta.get('httpBase')
2293 if b:
2294 base = b
2295 break
2296
2297 formats = []
2298 rtmp_count = 0
2299 http_count = 0
2300 m3u8_count = 0
2301
2302 srcs = []
2303 media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2304 for medium in media:
2305 src = medium.get('src')
2306 if not src or src in srcs:
2307 continue
2308 srcs.append(src)
2309
2310 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2311 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2312 width = int_or_none(medium.get('width'))
2313 height = int_or_none(medium.get('height'))
2314 proto = medium.get('proto')
2315 ext = medium.get('ext')
2316 src_ext = determine_ext(src)
2317 streamer = medium.get('streamer') or base
2318
2319 if proto == 'rtmp' or streamer.startswith('rtmp'):
2320 rtmp_count += 1
2321 formats.append({
2322 'url': streamer,
2323 'play_path': src,
2324 'ext': 'flv',
2325 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2326 'tbr': bitrate,
2327 'filesize': filesize,
2328 'width': width,
2329 'height': height,
2330 })
2331 if transform_rtmp_url:
2332 streamer, src = transform_rtmp_url(streamer, src)
2333 formats[-1].update({
2334 'url': streamer,
2335 'play_path': src,
2336 })
2337 continue
2338
2339 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2340 src_url = src_url.strip()
2341
2342 if proto == 'm3u8' or src_ext == 'm3u8':
2343 m3u8_formats = self._extract_m3u8_formats(
2344 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2345 if len(m3u8_formats) == 1:
2346 m3u8_count += 1
2347 m3u8_formats[0].update({
2348 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2349 'tbr': bitrate,
2350 'width': width,
2351 'height': height,
2352 })
2353 formats.extend(m3u8_formats)
2354 elif src_ext == 'f4m':
2355 f4m_url = src_url
2356 if not f4m_params:
2357 f4m_params = {
2358 'hdcore': '3.2.0',
2359 'plugin': 'flowplayer-3.2.0.1',
2360 }
2361 f4m_url += '&' if '?' in f4m_url else '?'
2362 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2363 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2364 elif src_ext == 'mpd':
2365 formats.extend(self._extract_mpd_formats(
2366 src_url, video_id, mpd_id='dash', fatal=False))
2367 elif re.search(r'\.ism/[Mm]anifest', src_url):
2368 formats.extend(self._extract_ism_formats(
2369 src_url, video_id, ism_id='mss', fatal=False))
2370 elif src_url.startswith('http') and self._is_valid_url(src_url, video_id):
2371 http_count += 1
2372 formats.append({
2373 'url': src_url,
2374 'ext': ext or src_ext or 'flv',
2375 'format_id': 'http-%d' % (bitrate or http_count),
2376 'tbr': bitrate,
2377 'filesize': filesize,
2378 'width': width,
2379 'height': height,
2380 })
2381
2382 return formats
2383
2384 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2385 urls = []
2386 subtitles = {}
2387 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2388 src = textstream.get('src')
2389 if not src or src in urls:
2390 continue
2391 urls.append(src)
2392 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2393 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2394 subtitles.setdefault(lang, []).append({
2395 'url': src,
2396 'ext': ext,
2397 })
2398 return subtitles
2399
2400 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2401 xspf = self._download_xml(
2402 xspf_url, playlist_id, 'Downloading xspf playlist',
2403 'Unable to download xspf manifest', fatal=fatal)
2404 if xspf is False:
2405 return []
2406 return self._parse_xspf(
2407 xspf, playlist_id, xspf_url=xspf_url,
2408 xspf_base_url=base_url(xspf_url))
2409
2410 def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2411 NS_MAP = {
2412 'xspf': 'http://xspf.org/ns/0/',
2413 's1': 'http://static.streamone.nl/player/ns/0',
2414 }
2415
2416 entries = []
2417 for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2418 title = xpath_text(
2419 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2420 description = xpath_text(
2421 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2422 thumbnail = xpath_text(
2423 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2424 duration = float_or_none(
2425 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2426
2427 formats = []
2428 for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2429 format_url = urljoin(xspf_base_url, location.text)
2430 if not format_url:
2431 continue
2432 formats.append({
2433 'url': format_url,
2434 'manifest_url': xspf_url,
2435 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2436 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2437 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2438 })
2439 self._sort_formats(formats)
2440
2441 entries.append({
2442 'id': playlist_id,
2443 'title': title,
2444 'description': description,
2445 'thumbnail': thumbnail,
2446 'duration': duration,
2447 'formats': formats,
2448 })
2449 return entries
2450
2451 def _extract_mpd_formats(self, *args, **kwargs):
2452 fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2453 if subs:
2454 self.report_warning(bug_reports_message(
2455 "Ignoring subtitle tracks found in the DASH manifest; "
2456 "if any subtitle tracks are missing,"
2457 ))
2458 return fmts
2459
2460 def _extract_mpd_formats_and_subtitles(
2461 self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2462 fatal=True, data=None, headers={}, query={}):
2463 res = self._download_xml_handle(
2464 mpd_url, video_id,
2465 note='Downloading MPD manifest' if note is None else note,
2466 errnote='Failed to download MPD manifest' if errnote is None else errnote,
2467 fatal=fatal, data=data, headers=headers, query=query)
2468 if res is False:
2469 return [], {}
2470 mpd_doc, urlh = res
2471 if mpd_doc is None:
2472 return [], {}
2473 mpd_base_url = base_url(urlh.geturl())
2474
2475 return self._parse_mpd_formats_and_subtitles(
2476 mpd_doc, mpd_id, mpd_base_url, mpd_url)
2477
2478 def _parse_mpd_formats(self, *args, **kwargs):
2479 fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2480 if subs:
2481 self.report_warning(bug_reports_message(
2482 "Ignoring subtitle tracks found in the DASH manifest; "
2483 "if any subtitle tracks are missing,"
2484 ))
2485 return fmts
2486
2487 def _parse_mpd_formats_and_subtitles(
2488 self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2489 """
2490 Parse formats from MPD manifest.
2491 References:
2492 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2493 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2494 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2495 """
2496 if not self.get_param('dynamic_mpd', True):
2497 if mpd_doc.get('type') == 'dynamic':
2498 return [], {}
2499
2500 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2501
2502 def _add_ns(path):
2503 return self._xpath_ns(path, namespace)
2504
2505 def is_drm_protected(element):
2506 return element.find(_add_ns('ContentProtection')) is not None
2507
2508 def extract_multisegment_info(element, ms_parent_info):
2509 ms_info = ms_parent_info.copy()
2510
2511 # As per [1, 5.3.9.2.2], SegmentList and SegmentTemplate share some
2512 # common attributes and elements; we only extract those
2513 # relevant to us.
2514 def extract_common(source):
2515 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2516 if segment_timeline is not None:
2517 s_e = segment_timeline.findall(_add_ns('S'))
2518 if s_e:
2519 ms_info['total_number'] = 0
2520 ms_info['s'] = []
2521 for s in s_e:
2522 r = int(s.get('r', 0))
2523 ms_info['total_number'] += 1 + r
2524 ms_info['s'].append({
2525 't': int(s.get('t', 0)),
2526 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2527 'd': int(s.attrib['d']),
2528 'r': r,
2529 })
2530 start_number = source.get('startNumber')
2531 if start_number:
2532 ms_info['start_number'] = int(start_number)
2533 timescale = source.get('timescale')
2534 if timescale:
2535 ms_info['timescale'] = int(timescale)
2536 segment_duration = source.get('duration')
2537 if segment_duration:
2538 ms_info['segment_duration'] = float(segment_duration)
2539
2540 def extract_Initialization(source):
2541 initialization = source.find(_add_ns('Initialization'))
2542 if initialization is not None:
2543 ms_info['initialization_url'] = initialization.attrib['sourceURL']
2544
2545 segment_list = element.find(_add_ns('SegmentList'))
2546 if segment_list is not None:
2547 extract_common(segment_list)
2548 extract_Initialization(segment_list)
2549 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2550 if segment_urls_e:
2551 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2552 else:
2553 segment_template = element.find(_add_ns('SegmentTemplate'))
2554 if segment_template is not None:
2555 extract_common(segment_template)
2556 media = segment_template.get('media')
2557 if media:
2558 ms_info['media'] = media
2559 initialization = segment_template.get('initialization')
2560 if initialization:
2561 ms_info['initialization'] = initialization
2562 else:
2563 extract_Initialization(segment_template)
2564 return ms_info
2565
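# Illustrative example (not part of the original file): a timeline such as
#   <SegmentTimeline><S t="0" d="90000" r="2"/></SegmentTimeline>
# is collected by extract_common() as ms_info['s'] = [{'t': 0, 'd': 90000,
# 'r': 2}] and contributes 1 + r = 3 segments to ms_info['total_number'].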
2566 skip_unplayable = not self.get_param('allow_unplayable_formats')
2567
2568 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2569 formats = []
2570 subtitles = {}
2571 for period in mpd_doc.findall(_add_ns('Period')):
2572 period_duration = parse_duration(period.get('duration')) or mpd_duration
2573 period_ms_info = extract_multisegment_info(period, {
2574 'start_number': 1,
2575 'timescale': 1,
2576 })
2577 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2578 if skip_unplayable and is_drm_protected(adaptation_set):
2579 continue
2580 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2581 for representation in adaptation_set.findall(_add_ns('Representation')):
2582 if skip_unplayable and is_drm_protected(representation):
2583 continue
2584 representation_attrib = adaptation_set.attrib.copy()
2585 representation_attrib.update(representation.attrib)
2586 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2587 mime_type = representation_attrib['mimeType']
2588 content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2589
2590 if content_type in ('video', 'audio', 'text') or mime_type == 'image/jpeg':
2591 base_url = ''
2592 for element in (representation, adaptation_set, period, mpd_doc):
2593 base_url_e = element.find(_add_ns('BaseURL'))
2594 if base_url_e is not None:
2595 base_url = base_url_e.text + base_url
2596 if re.match(r'^https?://', base_url):
2597 break
2598 if mpd_base_url and not re.match(r'^https?://', base_url):
2599 if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
2600 mpd_base_url += '/'
2601 base_url = mpd_base_url + base_url
2602 representation_id = representation_attrib.get('id')
2603 lang = representation_attrib.get('lang')
2604 url_el = representation.find(_add_ns('BaseURL'))
2605 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2606 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2607 if representation_id is not None:
2608 format_id = representation_id
2609 else:
2610 format_id = content_type
2611 if mpd_id:
2612 format_id = mpd_id + '-' + format_id
2613 if content_type in ('video', 'audio'):
2614 f = {
2615 'format_id': format_id,
2616 'manifest_url': mpd_url,
2617 'ext': mimetype2ext(mime_type),
2618 'width': int_or_none(representation_attrib.get('width')),
2619 'height': int_or_none(representation_attrib.get('height')),
2620 'tbr': float_or_none(bandwidth, 1000),
2621 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2622 'fps': int_or_none(representation_attrib.get('frameRate')),
2623 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2624 'format_note': 'DASH %s' % content_type,
2625 'filesize': filesize,
2626 'container': mimetype2ext(mime_type) + '_dash',
2627 }
2628 f.update(parse_codecs(representation_attrib.get('codecs')))
2629 elif content_type == 'text':
2630 f = {
2631 'ext': mimetype2ext(mime_type),
2632 'manifest_url': mpd_url,
2633 'filesize': filesize,
2634 }
2635 elif mime_type == 'image/jpeg':
2636 # See test case in VikiIE
2637 # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2638 f = {
2639 'format_id': format_id,
2640 'ext': 'mhtml',
2641 'manifest_url': mpd_url,
2642 'format_note': 'DASH storyboards (jpeg)',
2643 'acodec': 'none',
2644 'vcodec': 'none',
2645 }
2646 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2647
2648 def prepare_template(template_name, identifiers):
2649 tmpl = representation_ms_info[template_name]
2650 # First of all, % characters outside $...$ templates
2651 # must be escaped by doubling for proper processing
2652 # by the % string-formatting operator used below (see
2653 # https://github.com/ytdl-org/youtube-dl/issues/16867).
2654 t = ''
2655 in_template = False
2656 for c in tmpl:
2657 t += c
2658 if c == '$':
2659 in_template = not in_template
2660 elif c == '%' and not in_template:
2661 t += c
2662 # Next, $...$ templates are translated to their
2663 # %(...) counterparts to be used with the % operator
2664 if representation_id is not None:
2665 t = t.replace('$RepresentationID$', representation_id)
2666 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2667 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2668 t = t.replace('$$', '$')
2669 return t
2670
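# Illustrative example (not part of the original file): a media template like
#   'seg_$RepresentationID$_$Number%05d$.m4s'
# is translated by prepare_template() into 'seg_video1_%(Number)05d.m4s'
# (assuming representation_id == 'video1'), ready for later %-substitution
# with the actual segment number.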
2671 # @initialization is a regular template like @media one
2672 # so it should be handled just the same way (see
2673 # https://github.com/ytdl-org/youtube-dl/issues/11605)
2674 if 'initialization' in representation_ms_info:
2675 initialization_template = prepare_template(
2676 'initialization',
2677 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2678 # $Time$ shall not be included for @initialization thus
2679 # only $Bandwidth$ remains
2680 ('Bandwidth', ))
2681 representation_ms_info['initialization_url'] = initialization_template % {
2682 'Bandwidth': bandwidth,
2683 }
2684
2685 def location_key(location):
2686 return 'url' if re.match(r'^https?://', location) else 'path'
2687
2688 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2689
2690 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2691 media_location_key = location_key(media_template)
2692
2693 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2694 # can't be used at the same time
2695 if '%(Number' in media_template and 's' not in representation_ms_info:
2696 segment_duration = None
2697 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2698 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2699 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2700 representation_ms_info['fragments'] = [{
2701 media_location_key: media_template % {
2702 'Number': segment_number,
2703 'Bandwidth': bandwidth,
2704 },
2705 'duration': segment_duration,
2706 } for segment_number in range(
2707 representation_ms_info['start_number'],
2708 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2709 else:
2710 # $Number*$ or $Time$ in media template with S list available
2711 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2712 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2713 representation_ms_info['fragments'] = []
2714 segment_time = 0
2715 segment_d = None
2716 segment_number = representation_ms_info['start_number']
2717
2718 def add_segment_url():
2719 segment_url = media_template % {
2720 'Time': segment_time,
2721 'Bandwidth': bandwidth,
2722 'Number': segment_number,
2723 }
2724 representation_ms_info['fragments'].append({
2725 media_location_key: segment_url,
2726 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2727 })
2728
2729 for num, s in enumerate(representation_ms_info['s']):
2730 segment_time = s.get('t') or segment_time
2731 segment_d = s['d']
2732 add_segment_url()
2733 segment_number += 1
2734 for r in range(s.get('r', 0)):
2735 segment_time += segment_d
2736 add_segment_url()
2737 segment_number += 1
2738 segment_time += segment_d
2739 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2740 # No media template
2741 # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2742 # or any YouTube dashsegments video
2743 fragments = []
2744 segment_index = 0
2745 timescale = representation_ms_info['timescale']
2746 for s in representation_ms_info['s']:
2747 duration = float_or_none(s['d'], timescale)
2748 for r in range(s.get('r', 0) + 1):
2749 segment_uri = representation_ms_info['segment_urls'][segment_index]
2750 fragments.append({
2751 location_key(segment_uri): segment_uri,
2752 'duration': duration,
2753 })
2754 segment_index += 1
2755 representation_ms_info['fragments'] = fragments
2756 elif 'segment_urls' in representation_ms_info:
2757 # Segment URLs with no SegmentTimeline
2758 # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2759 # https://github.com/ytdl-org/youtube-dl/pull/14844
2760 fragments = []
2761 segment_duration = float_or_none(
2762 representation_ms_info['segment_duration'],
2763 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2764 for segment_url in representation_ms_info['segment_urls']:
2765 fragment = {
2766 location_key(segment_url): segment_url,
2767 }
2768 if segment_duration:
2769 fragment['duration'] = segment_duration
2770 fragments.append(fragment)
2771 representation_ms_info['fragments'] = fragments
2772 # If a 'fragments' key is available, then we correctly recognized fragmented media.
2773 # Otherwise we assume unfragmented media with direct access. Technically, such an
2774 # assumption is not necessarily correct, since we may simply have no support for
2775 # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2776 if 'fragments' in representation_ms_info:
2777 f.update({
2778 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2779 'url': mpd_url or base_url,
2780 'fragment_base_url': base_url,
2781 'fragments': [],
2782 'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2783 })
2784 if 'initialization_url' in representation_ms_info:
2785 initialization_url = representation_ms_info['initialization_url']
2786 if not f.get('url'):
2787 f['url'] = initialization_url
2788 f['fragments'].append({location_key(initialization_url): initialization_url})
2789 f['fragments'].extend(representation_ms_info['fragments'])
2790 else:
2791 # Assuming direct URL to unfragmented media.
2792 f['url'] = base_url
2793 if content_type in ('video', 'audio') or mime_type == 'image/jpeg':
2794 formats.append(f)
2795 elif content_type == 'text':
2796 subtitles.setdefault(lang or 'und', []).append(f)
2797 else:
2798 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2799 return formats, subtitles
2800
2801 def _extract_ism_formats(self, *args, **kwargs):
2802 fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2803 if subs:
2804 self.report_warning(bug_reports_message(
2805 "Ignoring subtitle tracks found in the ISM manifest; "
2806 "if any subtitle tracks are missing,"
2807 ))
2808 return fmts
2809
2810 def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2811 res = self._download_xml_handle(
2812 ism_url, video_id,
2813 note='Downloading ISM manifest' if note is None else note,
2814 errnote='Failed to download ISM manifest' if errnote is None else errnote,
2815 fatal=fatal, data=data, headers=headers, query=query)
2816 if res is False:
2817 return [], {}
2818 ism_doc, urlh = res
2819 if ism_doc is None:
2820 return [], {}
2821
2822 return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
2823
2824 def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
2825 """
2826 Parse formats from ISM manifest.
2827 References:
2828 1. [MS-SSTR]: Smooth Streaming Protocol,
2829 https://msdn.microsoft.com/en-us/library/ff469518.aspx
2830 """
2831 if ism_doc.get('IsLive') == 'TRUE':
2832 return [], {}
2833 if (not self.get_param('allow_unplayable_formats')
2834 and ism_doc.find('Protection') is not None):
2835 return [], {}
2836
2837 duration = int(ism_doc.attrib['Duration'])
2838 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2839
2840 formats = []
2841 subtitles = {}
2842 for stream in ism_doc.findall('StreamIndex'):
2843 stream_type = stream.get('Type')
2844 if stream_type not in ('video', 'audio', 'text'):
2845 continue
2846 url_pattern = stream.attrib['Url']
2847 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2848 stream_name = stream.get('Name')
2849 stream_language = stream.get('Language', 'und')
2850 for track in stream.findall('QualityLevel'):
2851 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
2852 # TODO: add support for WVC1 and WMAP
2853 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
2854 self.report_warning('%s is not a supported codec' % fourcc)
2855 continue
2856 tbr = int(track.attrib['Bitrate']) // 1000
2857 # [1] does not mention Width and Height attributes. However,
2858 # they're often present while MaxWidth and MaxHeight are
2859 # missing, so they should be used as fallbacks
2860 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2861 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2862 sampling_rate = int_or_none(track.get('SamplingRate'))
2863
2864 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2865 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2866
2867 fragments = []
2868 fragment_ctx = {
2869 'time': 0,
2870 }
2871 stream_fragments = stream.findall('c')
2872 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2873 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2874 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2875 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
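# No explicit duration ('d'): infer it by looking ahead at the next
# fragment's start time, falling back to the total presentation
# duration for the last fragment.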
2876 if not fragment_ctx['duration']:
2877 try:
2878 next_fragment_time = int(stream_fragments[stream_fragment_index + 1].attrib['t'])
2879 except (IndexError, KeyError):
2880 next_fragment_time = duration
2881 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2882 for _ in range(fragment_repeat):
2883 fragments.append({
2884 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2885 'duration': fragment_ctx['duration'] / stream_timescale,
2886 })
2887 fragment_ctx['time'] += fragment_ctx['duration']
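# Worked example: <c t="0" d="20000000" r="2"/> with a stream
# timescale of 10000000 expands to two 2.0-second fragments,
# starting at t=0 and t=20000000 respectively.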
2888
2889 format_id = []
2890 if ism_id:
2891 format_id.append(ism_id)
2892 if stream_name:
2893 format_id.append(stream_name)
2894 format_id.append(compat_str(tbr))
2895
2896 if stream_type == 'text':
2897 subtitles.setdefault(stream_language, []).append({
2898 'ext': 'ismt',
2899 'protocol': 'ism',
2900 'url': ism_url,
2901 'manifest_url': ism_url,
2902 'fragments': fragments,
2903 '_download_params': {
2904 'stream_type': stream_type,
2905 'duration': duration,
2906 'timescale': stream_timescale,
2907 'fourcc': fourcc,
2908 'language': stream_language,
2909 'codec_private_data': track.get('CodecPrivateData'),
2910 }
2911 })
2912 elif stream_type in ('video', 'audio'):
2913 formats.append({
2914 'format_id': '-'.join(format_id),
2915 'url': ism_url,
2916 'manifest_url': ism_url,
2917 'ext': 'ismv' if stream_type == 'video' else 'isma',
2918 'width': width,
2919 'height': height,
2920 'tbr': tbr,
2921 'asr': sampling_rate,
2922 'vcodec': 'none' if stream_type == 'audio' else fourcc,
2923 'acodec': 'none' if stream_type == 'video' else fourcc,
2924 'protocol': 'ism',
2925 'fragments': fragments,
2926 '_download_params': {
2927 'stream_type': stream_type,
2928 'duration': duration,
2929 'timescale': stream_timescale,
2930 'width': width or 0,
2931 'height': height or 0,
2932 'fourcc': fourcc,
2933 'language': stream_language,
2934 'codec_private_data': track.get('CodecPrivateData'),
2935 'sampling_rate': sampling_rate,
2936 'channels': int_or_none(track.get('Channels', 2)),
2937 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2938 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2939 },
2940 })
2941 return formats, subtitles
2942
2943 def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
2944 def absolute_url(item_url):
2945 return urljoin(base_url, item_url)
2946
2947 def parse_content_type(content_type):
2948 if not content_type:
2949 return {}
2950 ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2951 if ctr:
2952 mimetype, codecs = ctr.groups()
2953 f = parse_codecs(codecs)
2954 f['ext'] = mimetype2ext(mimetype)
2955 return f
2956 return {}
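# For instance, parse_content_type('video/mp4; codecs="avc1.42E01E, mp4a.40.2"')
# should yield something like {'ext': 'mp4', 'vcodec': 'avc1.42E01E',
# 'acodec': 'mp4a.40.2'}, the exact codec keys being up to parse_codecs().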
2957
2958 def _media_formats(src, cur_media_type, type_info={}):
2959 full_url = absolute_url(src)
2960 ext = type_info.get('ext') or determine_ext(full_url)
2961 if ext == 'm3u8':
2962 is_plain_url = False
2963 formats = self._extract_m3u8_formats(
2964 full_url, video_id, ext='mp4',
2965 entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2966 preference=preference, quality=quality, fatal=False)
2967 elif ext == 'mpd':
2968 is_plain_url = False
2969 formats = self._extract_mpd_formats(
2970 full_url, video_id, mpd_id=mpd_id, fatal=False)
2971 else:
2972 is_plain_url = True
2973 formats = [{
2974 'url': full_url,
2975 'vcodec': 'none' if cur_media_type == 'audio' else None,
2976 }]
2977 return is_plain_url, formats
2978
2979 entries = []
2980 # amp-video and amp-audio are very similar to their HTML5 counterparts
2981 # so we will include them right here (see
2982 # https://www.ampproject.org/docs/reference/components/amp-video)
2983 # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
2984 _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
2985 media_tags = [(media_tag, media_tag_name, media_type, '')
2986 for media_tag, media_tag_name, media_type
2987 in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
2988 media_tags.extend(re.findall(
2989 # We only allow video|audio followed by a whitespace or '>'.
2990 # Allowing more characters may result in a significant slowdown (see
2991 # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
2992 # http://www.porntrex.com/maps/videositemap.xml).
2993 r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
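# This matches, e.g., '<video src="...">...</video>',
# '<amp-video ...>...</amp-video>' and '<dl8-video ...>...</dl8-video>',
# in addition to the self-closing variants caught by the first regex above.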
2994 for media_tag, _, media_type, media_content in media_tags:
2995 media_info = {
2996 'formats': [],
2997 'subtitles': {},
2998 }
2999 media_attributes = extract_attributes(media_tag)
3000 src = strip_or_none(media_attributes.get('src'))
3001 if src:
3002 _, formats = _media_formats(src, media_type)
3003 media_info['formats'].extend(formats)
3004 media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3005 if media_content:
3006 for source_tag in re.findall(r'<source[^>]+>', media_content):
3007 s_attr = extract_attributes(source_tag)
3008 # data-video-src and data-src are non-standard but seen
3009 # several times in the wild
3010 src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
3011 if not src:
3012 continue
3013 f = parse_content_type(s_attr.get('type'))
3014 is_plain_url, formats = _media_formats(src, media_type, f)
3015 if is_plain_url:
3016 # width, height, res, label and title attributes are
3017 # all non-standard but seen several times in the wild
3018 labels = [
3019 s_attr.get(lbl)
3020 for lbl in ('label', 'title')
3021 if str_or_none(s_attr.get(lbl))
3022 ]
3023 width = int_or_none(s_attr.get('width'))
3024 height = (int_or_none(s_attr.get('height'))
3025 or int_or_none(s_attr.get('res')))
3026 if not width or not height:
3027 for lbl in labels:
3028 resolution = parse_resolution(lbl)
3029 if not resolution:
3030 continue
3031 width = width or resolution.get('width')
3032 height = height or resolution.get('height')
3033 for lbl in labels:
3034 tbr = parse_bitrate(lbl)
3035 if tbr:
3036 break
3037 else:
3038 tbr = None
3039 f.update({
3040 'width': width,
3041 'height': height,
3042 'tbr': tbr,
3043 'format_id': s_attr.get('label') or s_attr.get('title'),
3044 })
3045 f.update(formats[0])
3046 media_info['formats'].append(f)
3047 else:
3048 media_info['formats'].extend(formats)
3049 for track_tag in re.findall(r'<track[^>]+>', media_content):
3050 track_attributes = extract_attributes(track_tag)
3051 kind = track_attributes.get('kind')
3052 if not kind or kind in ('subtitles', 'captions'):
3053 src = strip_or_none(track_attributes.get('src'))
3054 if not src:
3055 continue
3056 lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3057 media_info['subtitles'].setdefault(lang, []).append({
3058 'url': absolute_url(src),
3059 })
3060 for f in media_info['formats']:
3061 f.setdefault('http_headers', {})['Referer'] = base_url
3062 if media_info['formats'] or media_info['subtitles']:
3063 entries.append(media_info)
3064 return entries
3065
3066 def _extract_akamai_formats(self, *args, **kwargs):
3067 fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3068 if subs:
3069 self.report_warning(bug_reports_message(
3070 "Ignoring subtitle tracks found in the manifests; "
3071 "if any subtitle tracks are missing,"
3072 ))
3073 return fmts
3074
3075 def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3076 signed = 'hdnea=' in manifest_url
3077 if not signed:
3078 # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3079 manifest_url = re.sub(
3080 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3081 '', manifest_url).strip('?')
3082
3083 formats = []
3084 subtitles = {}
3085
3086 hdcore_sign = 'hdcore=3.7.0'
3087 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3088 hds_host = hosts.get('hds')
3089 if hds_host:
3090 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3091 if 'hdcore=' not in f4m_url:
3092 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3093 f4m_formats = self._extract_f4m_formats(
3094 f4m_url, video_id, f4m_id='hds', fatal=False)
3095 for entry in f4m_formats:
3096 entry.update({'extra_param_to_segment_url': hdcore_sign})
3097 formats.extend(f4m_formats)
3098
3099 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3100 hls_host = hosts.get('hls')
3101 if hls_host:
3102 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3103 m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3104 m3u8_url, video_id, 'mp4', 'm3u8_native',
3105 m3u8_id='hls', fatal=False)
3106 formats.extend(m3u8_formats)
3107 subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3108
3109 http_host = hosts.get('http')
3110 if http_host and m3u8_formats and not signed:
3111 REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
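# Hypothetical example: given
#   https://ex-vh.akamaihd.net/i/videos/show_,500,800,1500,k.mp4.csmil/master.m3u8
# REPL_REGEX captures ('videos/show_', '500,800,1500', 'k.mp4'), so each
# HLS rendition can be rewritten to a progressive URL of the form
#   http(s)://<http_host>/videos/show_500k.mp4
# and so on, one per quality.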
3112 qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3113 qualities_length = len(qualities)
3114 if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3115 i = 0
3116 for f in m3u8_formats:
3117 if f['vcodec'] != 'none':
3118 for protocol in ('http', 'https'):
3119 http_f = f.copy()
3120 del http_f['manifest_url']
3121 http_url = re.sub(
3122 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
3123 http_f.update({
3124 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3125 'url': http_url,
3126 'protocol': protocol,
3127 })
3128 formats.append(http_f)
3129 i += 1
3130
3131 return formats, subtitles
3132
3133 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3134 query = compat_urlparse.urlparse(url).query
3135 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3136 mobj = re.search(
3137 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3138 url_base = mobj.group('url')
3139 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
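# Hypothetical example: for rtmp://ex.com/vod/mp4:video.mp4/playlist.m3u8,
# url_base becomes '//ex.com/vod/mp4:video.mp4' and http_base_url
# 'http://ex.com/vod/mp4:video.mp4'; the manifest URLs below are
# derived from the latter.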
3140 formats = []
3141
3142 def manifest_url(manifest):
3143 m_url = '%s/%s' % (http_base_url, manifest)
3144 if query:
3145 m_url += '?%s' % query
3146 return m_url
3147
3148 if 'm3u8' not in skip_protocols:
3149 formats.extend(self._extract_m3u8_formats(
3150 manifest_url('playlist.m3u8'), video_id, 'mp4',
3151 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3152 if 'f4m' not in skip_protocols:
3153 formats.extend(self._extract_f4m_formats(
3154 manifest_url('manifest.f4m'),
3155 video_id, f4m_id='hds', fatal=False))
3156 if 'dash' not in skip_protocols:
3157 formats.extend(self._extract_mpd_formats(
3158 manifest_url('manifest.mpd'),
3159 video_id, mpd_id='dash', fatal=False))
3160 if re.search(r'(?:/smil:|\.smil)', url_base):
3161 if 'smil' not in skip_protocols:
3162 rtmp_formats = self._extract_smil_formats(
3163 manifest_url('jwplayer.smil'),
3164 video_id, fatal=False)
3165 for rtmp_format in rtmp_formats:
3166 rtsp_format = rtmp_format.copy()
3167 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3168 del rtsp_format['play_path']
3169 del rtsp_format['ext']
3170 rtsp_format.update({
3171 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3172 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3173 'protocol': 'rtsp',
3174 })
3175 formats.extend([rtmp_format, rtsp_format])
3176 else:
3177 for protocol in ('rtmp', 'rtsp'):
3178 if protocol not in skip_protocols:
3179 formats.append({
3180 'url': '%s:%s' % (protocol, url_base),
3181 'format_id': protocol,
3182 'protocol': protocol,
3183 })
3184 return formats
3185
3186 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3187 mobj = re.search(
3188 r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3189 webpage)
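# A hypothetical snippet this pattern matches:
#   jwplayer("myplayer").setup({"file": "https://ex.com/video.mp4"});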
3190 if mobj:
3191 try:
3192 jwplayer_data = self._parse_json(mobj.group('options'),
3193 video_id=video_id,
3194 transform_source=transform_source)
3195 except ExtractorError:
3196 pass
3197 else:
3198 if isinstance(jwplayer_data, dict):
3199 return jwplayer_data
3200
3201 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3202 jwplayer_data = self._find_jwplayer_data(
3203 webpage, video_id, transform_source=js_to_json)
3204 return self._parse_jwplayer_data(
3205 jwplayer_data, video_id, *args, **kwargs)
3206
3207 def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3208 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3209 # JWPlayer backward compatibility: flattened playlists
3210 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3211 if 'playlist' not in jwplayer_data:
3212 jwplayer_data = {'playlist': [jwplayer_data]}
3213
3214 entries = []
3215
3216 # JWPlayer backward compatibility: single playlist item
3217 # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3218 if not isinstance(jwplayer_data['playlist'], list):
3219 jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3220
3221 for video_data in jwplayer_data['playlist']:
3222 # JWPlayer backward compatibility: flattened sources
3223 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3224 if 'sources' not in video_data:
3225 video_data['sources'] = [video_data]
3226
3227 this_video_id = video_id or video_data['mediaid']
3228
3229 formats = self._parse_jwplayer_formats(
3230 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3231 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3232
3233 subtitles = {}
3234 tracks = video_data.get('tracks')
3235 if tracks and isinstance(tracks, list):
3236 for track in tracks:
3237 if not isinstance(track, dict):
3238 continue
3239 track_kind = track.get('kind')
3240 if not track_kind or not isinstance(track_kind, compat_str):
3241 continue
3242 if track_kind.lower() not in ('captions', 'subtitles'):
3243 continue
3244 track_url = urljoin(base_url, track.get('file'))
3245 if not track_url:
3246 continue
3247 subtitles.setdefault(track.get('label') or 'en', []).append({
3248 'url': self._proto_relative_url(track_url)
3249 })
3250
3251 entry = {
3252 'id': this_video_id,
3253 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3254 'description': clean_html(video_data.get('description')),
3255 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3256 'timestamp': int_or_none(video_data.get('pubdate')),
3257 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3258 'subtitles': subtitles,
3259 }
3260 # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3261 if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3262 entry.update({
3263 '_type': 'url_transparent',
3264 'url': formats[0]['url'],
3265 })
3266 else:
3267 self._sort_formats(formats)
3268 entry['formats'] = formats
3269 entries.append(entry)
3270 if len(entries) == 1:
3271 return entries[0]
3272 else:
3273 return self.playlist_result(entries)
3274
3275 def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3276 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3277 urls = []
3278 formats = []
3279 for source in jwplayer_sources_data:
3280 if not isinstance(source, dict):
3281 continue
3282 source_url = urljoin(
3283 base_url, self._proto_relative_url(source.get('file')))
3284 if not source_url or source_url in urls:
3285 continue
3286 urls.append(source_url)
3287 source_type = source.get('type') or ''
3288 ext = mimetype2ext(source_type) or determine_ext(source_url)
3289 if source_type == 'hls' or ext == 'm3u8':
3290 formats.extend(self._extract_m3u8_formats(
3291 source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3292 m3u8_id=m3u8_id, fatal=False))
3293 elif source_type == 'dash' or ext == 'mpd':
3294 formats.extend(self._extract_mpd_formats(
3295 source_url, video_id, mpd_id=mpd_id, fatal=False))
3296 elif ext == 'smil':
3297 formats.extend(self._extract_smil_formats(
3298 source_url, video_id, fatal=False))
3299 # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3300 elif source_type.startswith('audio') or ext in (
3301 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3302 formats.append({
3303 'url': source_url,
3304 'vcodec': 'none',
3305 'ext': ext,
3306 })
3307 else:
3308 height = int_or_none(source.get('height'))
3309 if height is None:
3310 # Often no height is provided but there is a label in
3311 # a format like "1080p", "720p SD", or 1080.
3312 height = int_or_none(self._search_regex(
3313 r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3314 'height', default=None))
3315 a_format = {
3316 'url': source_url,
3317 'width': int_or_none(source.get('width')),
3318 'height': height,
3319 'tbr': int_or_none(source.get('bitrate')),
3320 'ext': ext,
3321 }
3322 if source_url.startswith('rtmp'):
3323 a_format['ext'] = 'flv'
3324 # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3325 # of jwplayer.flash.swf
3326 rtmp_url_parts = re.split(
3327 r'((?:mp4|mp3|flv):)', source_url, 1)
3328 if len(rtmp_url_parts) == 3:
3329 rtmp_url, prefix, play_path = rtmp_url_parts
3330 a_format.update({
3331 'url': rtmp_url,
3332 'play_path': prefix + play_path,
3333 })
3334 if rtmp_params:
3335 a_format.update(rtmp_params)
3336 formats.append(a_format)
3337 return formats
3338
3339 def _live_title(self, name):
3340 """ Generate the title for a live video """
3341 now = datetime.datetime.now()
3342 now_str = now.strftime('%Y-%m-%d %H:%M')
3343 return name + ' ' + now_str
3344
3345 def _int(self, v, name, fatal=False, **kwargs):
3346 res = int_or_none(v, **kwargs)
3349 if res is None:
3350 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3351 if fatal:
3352 raise ExtractorError(msg)
3353 else:
3354 self.report_warning(msg)
3355 return res
3356
3357 def _float(self, v, name, fatal=False, **kwargs):
3358 res = float_or_none(v, **kwargs)
3359 if res is None:
3360 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3361 if fatal:
3362 raise ExtractorError(msg)
3363 else:
3364 self.report_warning(msg)
3365 return res
3366
3367 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3368 path='/', secure=False, discard=False, rest={}, **kwargs):
3369 cookie = compat_cookiejar_Cookie(
3370 0, name, value, port, port is not None, domain, True,
3371 domain.startswith('.'), path, True, secure, expire_time,
3372 discard, None, None, rest)
3373 self._downloader.cookiejar.set_cookie(cookie)
3374
3375 def _get_cookies(self, url):
3376 """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3377 req = sanitized_Request(url)
3378 self._downloader.cookiejar.add_cookie_header(req)
3379 return compat_cookies_SimpleCookie(req.get_header('Cookie'))
3380
3381 def _apply_first_set_cookie_header(self, url_handle, cookie):
3382 """
3383 Apply first Set-Cookie header instead of the last. Experimental.
3384
3385 Some sites (e.g. [1-3]) may serve two cookies under the same name
3386 in the Set-Cookie header and expect the first (old) one to be set
3387 rather than the second (new) one. However, per RFC 6265 the newer
3388 cookie should be the one stored, which is what actually happens.
3389 We work around this issue by manually resetting the cookie to
3390 the first one.
3391 1. https://new.vk.com/
3392 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3393 3. https://learning.oreilly.com/
3394 """
3395 for header, cookies in url_handle.headers.items():
3396 if header.lower() != 'set-cookie':
3397 continue
3398 if sys.version_info[0] >= 3:
3399 cookies = cookies.encode('iso-8859-1')
3400 cookies = cookies.decode('utf-8')
3401 cookie_value = re.search(
3402 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3403 if cookie_value:
3404 value, domain = cookie_value.groups()
3405 self._set_cookie(domain, cookie, value)
3406 break
3407
3408 def get_testcases(self, include_onlymatching=False):
3409 t = getattr(self, '_TEST', None)
3410 if t:
3411 assert not hasattr(self, '_TESTS'), \
3412 '%s has _TEST and _TESTS' % type(self).__name__
3413 tests = [t]
3414 else:
3415 tests = getattr(self, '_TESTS', [])
3416 for t in tests:
3417 if not include_onlymatching and t.get('only_matching', False):
3418 continue
3419 t['name'] = type(self).__name__[:-len('IE')]
3420 yield t
3421
3422 def is_suitable(self, age_limit):
3423 """ Test whether the extractor is generally suitable for the given
3424 age limit (i.e. pornographic sites are not, all others usually are) """
3425
3426 any_restricted = False
3427 for tc in self.get_testcases(include_onlymatching=False):
3428 if tc.get('playlist', []):
3429 tc = tc['playlist'][0]
3430 is_restricted = age_restricted(
3431 tc.get('info_dict', {}).get('age_limit'), age_limit)
3432 if not is_restricted:
3433 return True
3434 any_restricted = any_restricted or is_restricted
3435 return not any_restricted
3436
3437 def extract_subtitles(self, *args, **kwargs):
3438 if (self.get_param('writesubtitles', False)
3439 or self.get_param('listsubtitles')):
3440 return self._get_subtitles(*args, **kwargs)
3441 return {}
3442
3443 def _get_subtitles(self, *args, **kwargs):
3444 raise NotImplementedError('This method must be implemented by subclasses')
3445
3446 @staticmethod
3447 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3448 """ Merge subtitle items for one language. Items with duplicated URLs
3449 will be dropped. """
3450 list1_urls = {item['url'] for item in subtitle_list1}
3451 ret = list(subtitle_list1)
3452 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
3453 return ret
3454
3455 @classmethod
3456 def _merge_subtitles(cls, *dicts, target=None):
3457 """ Merge subtitle dictionaries, language by language. """
3458 if target is None:
3459 target = {}
3460 for d in dicts:
3461 for lang, subs in d.items():
3462 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3463 return target
3464
3465 def extract_automatic_captions(self, *args, **kwargs):
3466 if (self.get_param('writeautomaticsub', False)
3467 or self.get_param('listsubtitles')):
3468 return self._get_automatic_captions(*args, **kwargs)
3469 return {}
3470
3471 def _get_automatic_captions(self, *args, **kwargs):
3472 raise NotImplementedError('This method must be implemented by subclasses')
3473
3474 def mark_watched(self, *args, **kwargs):
3475 if (self.get_param('mark_watched', False)
3476 and (self._get_login_info()[0] is not None
3477 or self.get_param('cookiefile') is not None)):
3478 self._mark_watched(*args, **kwargs)
3479
3480 def _mark_watched(self, *args, **kwargs):
3481 raise NotImplementedError('This method must be implemented by subclasses')
3482
3483 def geo_verification_headers(self):
3484 headers = {}
3485 geo_verification_proxy = self.get_param('geo_verification_proxy')
3486 if geo_verification_proxy:
3487 headers['Ytdl-request-proxy'] = geo_verification_proxy
3488 return headers
3489
3490 def _generic_id(self, url):
3491 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3492
3493 def _generic_title(self, url):
3494 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3495
3496 @staticmethod
3497 def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3498 all_known = all(map(
3499 lambda x: x is not None,
3500 (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
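# e.g. _availability(is_private=False, needs_premium=False,
# needs_subscription=False, needs_auth=False, is_unlisted=False)
# returns 'public'; a truthy flag selects the first matching state,
# while any None flag (with none truthy) yields None, i.e. unknown.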
3501 return (
3502 'private' if is_private
3503 else 'premium_only' if needs_premium
3504 else 'subscriber_only' if needs_subscription
3505 else 'needs_auth' if needs_auth
3506 else 'unlisted' if is_unlisted
3507 else 'public' if all_known
3508 else None)
3509
3510 def _configuration_arg(self, key, default=NO_DEFAULT, casesense=False):
3511 '''
3512 @returns A list of values for the extractor argument given by "key"
3513 or "default" if no such key is present
3514 @param default The default value to return when the key is not present (default: [])
3515 @param casesense When false, the values are converted to lower case
3516 '''
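# e.g. with '--extractor-args "youtube:player_client=android"', a
# YoutubeIE subclass calling self._configuration_arg('player_client')
# is expected to receive ['android'].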
3517 val = traverse_obj(
3518 self._downloader.params, ('extractor_args', self.ie_key().lower(), key))
3519 if val is None:
3520 return [] if default is NO_DEFAULT else default
3521 return list(val) if casesense else [x.lower() for x in val]
3522
3523
3524 class SearchInfoExtractor(InfoExtractor):
3525 """
3526 Base class for paged search queries extractors.
3527 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
3528 Instances should define _SEARCH_KEY and _MAX_RESULTS.
3529 """
3530
3531 @classmethod
3532 def _make_valid_url(cls):
3533 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
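# e.g. with _SEARCH_KEY = 'ytsearch': 'ytsearch5:cute cats' yields
# prefix '5' and query 'cute cats'; an empty prefix means 1 result,
# 'all' means _MAX_RESULTS (see _real_extract below).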
3534
3535 @classmethod
3536 def suitable(cls, url):
3537 return re.match(cls._make_valid_url(), url) is not None
3538
3539 def _real_extract(self, query):
3540 mobj = re.match(self._make_valid_url(), query)
3541 if mobj is None:
3542 raise ExtractorError('Invalid search query "%s"' % query)
3543
3544 prefix = mobj.group('prefix')
3545 query = mobj.group('query')
3546 if prefix == '':
3547 return self._get_n_results(query, 1)
3548 elif prefix == 'all':
3549 return self._get_n_results(query, self._MAX_RESULTS)
3550 else:
3551 n = int(prefix)
3552 if n <= 0:
3553 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
3554 elif n > self._MAX_RESULTS:
3555 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3556 n = self._MAX_RESULTS
3557 return self._get_n_results(query, n)
3558
3559 def _get_n_results(self, query, n):
3560 """Get a specified number of results for a query"""
3561 raise NotImplementedError('This method must be implemented by subclasses')
3562
3563 @property
3564 def SEARCH_KEY(self):
3565 return self._SEARCH_KEY