2 from __future__
import unicode_literals
18 from ..compat
import (
19 compat_cookiejar_Cookie
,
22 compat_etree_fromstring
,
29 compat_urllib_parse_unquote
,
30 compat_urllib_parse_urlencode
,
31 compat_urllib_request
,
33 compat_xml_parse_error
,
35 from ..downloader
import FileDownloader
36 from ..downloader
.f4m
import (
38 remove_encrypted_media
,
66 parse_m3u8_attributes
,
88 class InfoExtractor(object):
89 """Information Extractor class.
91 Information extractors are the classes that, given a URL, extract
92 information about the video (or videos) the URL refers to. This
93 information includes the real video URL, the video title, author and
94 others. The information is stored in a dictionary which is then
95 passed to the YoutubeDL. The YoutubeDL processes this
96 information possibly downloading the video to the file system, among
97 other possible outcomes.
99 The type field determines the type of the result.
100 By far the most common value (and the default if _type is missing) is
101 "video", which indicates a single video.
103 For a video, the dictionaries must include the following fields:
105 id: Video identifier.
106 title: Video title, unescaped.
108 Additionally, it must contain either a formats entry or a url one:
110 formats: A list of dictionaries for each format available, ordered
111 from worst to best quality.
114 * url The mandatory URL representing the media:
115 for plain file media - HTTP URL of this file,
117 for HLS - URL of the M3U8 media playlist,
118 for HDS - URL of the F4M manifest,
120 - HTTP URL to plain file media (in case of
122 - URL of the MPD manifest or base URL
123 representing the media if MPD manifest
124 is parsed from a string (in case of
126 for MSS - URL of the ISM manifest.
128 The URL of the manifest file in case of
130 for HLS - URL of the M3U8 master playlist,
131 for HDS - URL of the F4M manifest,
132 for DASH - URL of the MPD manifest,
133 for MSS - URL of the ISM manifest.
134 * ext Will be calculated from URL if missing
135 * format A human-readable description of the format
136 ("mp4 container with h264/opus").
137 Calculated from the format_id, width, height.
138 and format_note fields if missing.
139 * format_id A short description of the format
140 ("mp4_h264_opus" or "19").
141 Technically optional, but strongly recommended.
142 * format_note Additional info about the format
143 ("3D" or "DASH video")
144 * width Width of the video, if known
145 * height Height of the video, if known
146 * resolution Textual description of width and height
147 * tbr Average bitrate of audio and video in KBit/s
148 * abr Average audio bitrate in KBit/s
149 * acodec Name of the audio codec in use
150 * asr Audio sampling rate in Hertz
151 * vbr Average video bitrate in KBit/s
153 * vcodec Name of the video codec in use
154 * container Name of the container format
155 * filesize The number of bytes, if known in advance
156 * filesize_approx An estimate for the number of bytes
157 * player_url SWF Player URL (used for rtmpdump).
158 * protocol The protocol that will be used for the actual
159 download, lower-case.
160 "http", "https", "rtsp", "rtmp", "rtmpe",
161 "m3u8", "m3u8_native" or "http_dash_segments".
163 Base URL for fragments. Each fragment's path
164 value (if present) will be relative to
166 * fragments A list of fragments of a fragmented media.
167 Each fragment entry must contain either an url
168 or a path. If an url is present it should be
169 considered by a client. Otherwise both path and
170 fragment_base_url must be present. Here is
171 the list of all potential fields:
172 * "url" - fragment's URL
173 * "path" - fragment's path relative to
175 * "duration" (optional, int or float)
176 * "filesize" (optional, int)
177 * preference Order number of this format. If this field is
178 present and not None, the formats get sorted
179 by this field, regardless of all other values.
180 -1 for default (order by other properties),
181 -2 or smaller for less than default.
182 < -1000 to hide the format (if there is
183 another one which is strictly better)
184 * language Language code, e.g. "de" or "en-US".
185 * language_preference Is this in the language mentioned in
187 10 if it's what the URL is about,
188 -1 for default (don't know),
189 -10 otherwise, other values reserved for now.
190 * quality Order number of the video quality of this
191 format, irrespective of the file format.
192 -1 for default (order by other properties),
193 -2 or smaller for less than default.
194 * source_preference Order number for this video source
195 (quality takes higher priority)
196 -1 for default (order by other properties),
197 -2 or smaller for less than default.
198 * http_headers A dictionary of additional HTTP headers
199 to add to the request.
200 * stretched_ratio If given and not 1, indicates that the
201 video's pixels are not square.
202 width : height ratio as float.
203 * no_resume The server does not support resuming the
204 (HTTP or RTMP) download. Boolean.
205 * downloader_options A dictionary of downloader options as
206 described in FileDownloader
208 url: Final video URL.
209 ext: Video filename extension.
210 format: The video format, defaults to ext (used for --get-format)
211 player_url: SWF Player URL (used for rtmpdump).
213 The following fields are optional:
215 alt_title: A secondary title of the video.
216 display_id An alternative identifier for the video, not necessarily
217 unique, but available before title. Typically, id is
218 something like "4234987", title "Dancing naked mole rats",
219 and display_id "dancing-naked-mole-rats"
220 thumbnails: A list of dictionaries, with the following entries:
221 * "id" (optional, string) - Thumbnail format ID
223 * "preference" (optional, int) - quality of the image
224 * "width" (optional, int)
225 * "height" (optional, int)
226 * "resolution" (optional, string "{width}x{height}",
228 * "filesize" (optional, int)
229 thumbnail: Full URL to a video thumbnail image.
230 description: Full video description.
231 uploader: Full name of the video uploader.
232 license: License name the video is licensed under.
233 creator: The creator of the video.
234 release_date: The date (YYYYMMDD) when the video was released.
235 timestamp: UNIX timestamp of the moment the video became available.
236 upload_date: Video upload date (YYYYMMDD).
237 If not explicitly set, calculated from timestamp.
238 uploader_id: Nickname or id of the video uploader.
239 uploader_url: Full URL to a personal webpage of the video uploader.
240 channel: Full name of the channel the video is uploaded on.
241 Note that channel fields may or may not repeat uploader
242 fields. This depends on a particular extractor.
243 channel_id: Id of the channel.
244 channel_url: Full URL to a channel webpage.
245 location: Physical location where the video was filmed.
246 subtitles: The available subtitles as a dictionary in the format
247 {tag: subformats}. "tag" is usually a language code, and
248 "subformats" is a list sorted from lower to higher
249 preference, each element is a dictionary with the "ext"
251 * "data": The subtitles file contents
252 * "url": A URL pointing to the subtitles file
253 "ext" will be calculated from URL if missing
254 automatic_captions: Like 'subtitles', used by the YoutubeIE for
255 automatically generated captions
256 duration: Length of the video in seconds, as an integer or float.
257 view_count: How many users have watched the video on the platform.
258 like_count: Number of positive ratings of the video
259 dislike_count: Number of negative ratings of the video
260 repost_count: Number of reposts of the video
261 average_rating: Average rating given by users, the scale used depends on the webpage
262 comment_count: Number of comments on the video
263 comments: A list of comments, each with one or more of the following
264 properties (all but one of text or html optional):
265 * "author" - human-readable name of the comment author
266 * "author_id" - user ID of the comment author
268 * "html" - Comment as HTML
269 * "text" - Plain text of the comment
270 * "timestamp" - UNIX timestamp of comment
271 * "parent" - ID of the comment this one is replying to.
272 Set to "root" to indicate that this is a
273 comment to the original video.
274 age_limit: Age restriction for the video, as an integer (years)
275 webpage_url: The URL to the video webpage, if given to yt-dlp it
276 should allow to get the same result again. (It will be set
277 by YoutubeDL if it's missing)
278 categories: A list of categories that the video falls in, for example
280 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
281 is_live: True, False, or None (=unknown). Whether this video is a
282 live stream that goes on instead of a fixed-length video.
283 was_live: True, False, or None (=unknown). Whether this video was
284 originally a live stream.
285 start_time: Time in seconds where the reproduction should start, as
286 specified in the URL.
287 end_time: Time in seconds where the reproduction should end, as
288 specified in the URL.
289 chapters: A list of dictionaries, with the following entries:
290 * "start_time" - The start time of the chapter in seconds
291 * "end_time" - The end time of the chapter in seconds
292 * "title" (optional, string)
293 playable_in_embed: Whether this video is allowed to play in embedded
294 players on other sites. Can be True (=always allowed),
295 False (=never allowed), None (=unknown), or a string
296 specifying the criteria for embedability (Eg: 'whitelist').
297 __post_extractor: A function to be called just before the metadata is
298 written to either disk, logger or console. The function
299 must return a dict which will be added to the info_dict.
300 This is useful for additional information that is
301 time-consuming to extract. Note that the fields thus
302 extracted will not be available to output template and
303 match_filter. So, only "comments" and "comment_count" are
304 currently allowed to be extracted via this method.
306 The following fields should only be used when the video belongs to some logical
309 chapter: Name or title of the chapter the video belongs to.
310 chapter_number: Number of the chapter the video belongs to, as an integer.
311 chapter_id: Id of the chapter the video belongs to, as a unicode string.
313 The following fields should only be used when the video is an episode of some
314 series, programme or podcast:
316 series: Title of the series or programme the video episode belongs to.
317 season: Title of the season the video episode belongs to.
318 season_number: Number of the season the video episode belongs to, as an integer.
319 season_id: Id of the season the video episode belongs to, as a unicode string.
320 episode: Title of the video episode. Unlike mandatory video title field,
321 this field should denote the exact title of the video episode
322 without any kind of decoration.
323 episode_number: Number of the video episode within a season, as an integer.
324 episode_id: Id of the video episode, as a unicode string.
326 The following fields should only be used when the media is a track or a part of
329 track: Title of the track.
330 track_number: Number of the track within an album or a disc, as an integer.
331 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
333 artist: Artist(s) of the track.
334 genre: Genre(s) of the track.
335 album: Title of the album the track belongs to.
336 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
337 album_artist: List of all artists appeared on the album (e.g.
338 "Ash Borer / Fell Voices" or "Various Artists", useful for splits
340 disc_number: Number of the disc or other physical medium the track belongs to,
342 release_year: Year (YYYY) when the album was released.
344 Unless mentioned otherwise, the fields should be Unicode strings.
346 Unless mentioned otherwise, None is equivalent to absence of information.
349 _type "playlist" indicates multiple videos.
350 There must be a key "entries", which is a list, an iterable, or a PagedList
351 object, each element of which is a valid dictionary by this specification.
353 Additionally, playlists can have "id", "title", and any other relevant
354 attributes with the same semantics as videos (see above).
357 _type "multi_video" indicates that there are multiple videos that
358 form a single show, for example multiple acts of an opera or TV episode.
359 It must have an entries key like a playlist and contain all the keys
360 required for a video at the same time.
363 _type "url" indicates that the video must be extracted from another
364 location, possibly by a different extractor. Its only required key is:
365 "url" - the next URL to extract.
366 The key "ie_key" can be set to the class name (minus the trailing "IE",
367 e.g. "Youtube") if the extractor class is known in advance.
368 Additionally, the dictionary may have any properties of the resolved entity
369 known in advance, for example "title" if the title of the referred video is
373 _type "url_transparent" entities have the same specification as "url", but
374 indicate that the given additional information is more precise than the one
375 associated with the resolved URL.
376 This is useful when a site employs a video service that hosts the video and
377 its technical metadata, but that video service does not embed a useful
378 title, description etc.
381 Subclasses of this one should re-define the _real_initialize() and
382 _real_extract() methods and define a _VALID_URL regexp.
383 Probably, they should also be added to the list of extractors.
385 _GEO_BYPASS attribute may be set to False in order to disable
386 geo restriction bypass mechanisms for a particular extractor.
387 Though it won't disable explicit geo restriction bypass based on
388 country code provided with geo_bypass_country.
390 _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
391 countries for this extractor. One of these countries will be used by
392 geo restriction bypass mechanism right away in order to bypass
393 geo restriction, of course, if the mechanism is not disabled.
395 _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
396 IP blocks in CIDR notation for this extractor. One of these IP blocks
397 will be used by geo restriction bypass mechanism similarly
400 Finally, the _WORKING attribute should be set to False for broken IEs
401 in order to warn the users and skip the tests.
    # Fake X-Forwarded-For IP used by the geo restriction bypass mechanism;
    # populated lazily (see _initialize_geo_bypass).
    _x_forwarded_for_ip = None
    # List of presumably geo-unrestricted country codes (see class docstring).
    _GEO_COUNTRIES = None
    # List of presumably geo-unrestricted IP blocks in CIDR notation.
    _GEO_IP_BLOCKS = None
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # Reset the per-instance fake IP; the class-level default is shared.
        self._x_forwarded_for_ip = None
        self.set_downloader(downloader)
419 def suitable(cls
, url
):
420 """Receives a URL and returns True if suitable for this IE."""
422 # This does not use has/getattr intentionally - we want to know whether
423 # we have cached the regexp for *this* class, whereas getattr would also
424 # match the superclass
425 if '_VALID_URL_RE' not in cls
.__dict
__:
426 cls
._VALID
_URL
_RE
= re
.compile(cls
._VALID
_URL
)
427 return cls
._VALID
_URL
_RE
.match(url
) is not None
430 def _match_id(cls
, url
):
431 if '_VALID_URL_RE' not in cls
.__dict
__:
432 cls
._VALID
_URL
_RE
= re
.compile(cls
._VALID
_URL
)
433 m
= cls
._VALID
_URL
_RE
.match(url
)
435 return compat_str(m
.group('id'))
439 """Getter method for _WORKING."""
442 def initialize(self
):
443 """Initializes an instance (authentication, etc)."""
444 self
._initialize
_geo
_bypass
({
445 'countries': self
._GEO
_COUNTRIES
,
446 'ip_blocks': self
._GEO
_IP
_BLOCKS
,
449 self
._real
_initialize
()
452 def _initialize_geo_bypass(self
, geo_bypass_context
):
454 Initialize geo restriction bypass mechanism.
456 This method is used to initialize geo bypass mechanism based on faking
457 X-Forwarded-For HTTP header. A random country from provided country list
458 is selected and a random IP belonging to this country is generated. This
459 IP will be passed as X-Forwarded-For HTTP header in all subsequent
462 This method will be used for initial geo bypass mechanism initialization
463 during the instance initialization with _GEO_COUNTRIES and
466 You may also manually call it from extractor's code if geo bypass
467 information is not available beforehand (e.g. obtained during
468 extraction) or due to some other reason. In this case you should pass
469 this information in geo bypass context passed as first argument. It may
470 contain following fields:
472 countries: List of geo unrestricted countries (similar
474 ip_blocks: List of geo unrestricted IP blocks in CIDR notation
475 (similar to _GEO_IP_BLOCKS)
478 if not self
._x
_forwarded
_for
_ip
:
480 # Geo bypass mechanism is explicitly disabled by user
481 if not self
._downloader
.params
.get('geo_bypass', True):
484 if not geo_bypass_context
:
485 geo_bypass_context
= {}
487 # Backward compatibility: previously _initialize_geo_bypass
488 # expected a list of countries, some 3rd party code may still use
490 if isinstance(geo_bypass_context
, (list, tuple)):
491 geo_bypass_context
= {
492 'countries': geo_bypass_context
,
495 # The whole point of geo bypass mechanism is to fake IP
496 # as X-Forwarded-For HTTP header based on some IP block or
499 # Path 1: bypassing based on IP block in CIDR notation
501 # Explicit IP block specified by user, use it right away
502 # regardless of whether extractor is geo bypassable or not
503 ip_block
= self
._downloader
.params
.get('geo_bypass_ip_block', None)
505 # Otherwise use random IP block from geo bypass context but only
506 # if extractor is known as geo bypassable
508 ip_blocks
= geo_bypass_context
.get('ip_blocks')
509 if self
._GEO
_BYPASS
and ip_blocks
:
510 ip_block
= random
.choice(ip_blocks
)
513 self
._x
_forwarded
_for
_ip
= GeoUtils
.random_ipv4(ip_block
)
514 if self
._downloader
.params
.get('verbose', False):
515 self
._downloader
.to_screen(
516 '[debug] Using fake IP %s as X-Forwarded-For.'
517 % self
._x
_forwarded
_for
_ip
)
520 # Path 2: bypassing based on country code
522 # Explicit country code specified by user, use it right away
523 # regardless of whether extractor is geo bypassable or not
524 country
= self
._downloader
.params
.get('geo_bypass_country', None)
526 # Otherwise use random country code from geo bypass context but
527 # only if extractor is known as geo bypassable
529 countries
= geo_bypass_context
.get('countries')
530 if self
._GEO
_BYPASS
and countries
:
531 country
= random
.choice(countries
)
534 self
._x
_forwarded
_for
_ip
= GeoUtils
.random_ipv4(country
)
535 if self
._downloader
.params
.get('verbose', False):
536 self
._downloader
.to_screen(
537 '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
538 % (self
._x
_forwarded
_for
_ip
, country
.upper()))
540 def extract(self
, url
):
541 """Extracts URL information and returns it in list of dicts."""
546 ie_result
= self
._real
_extract
(url
)
547 if self
._x
_forwarded
_for
_ip
:
548 ie_result
['__x_forwarded_for_ip'] = self
._x
_forwarded
_for
_ip
550 except GeoRestrictedError
as e
:
551 if self
.__maybe
_fake
_ip
_and
_retry
(e
.countries
):
554 except ExtractorError
:
556 except compat_http_client
.IncompleteRead
as e
:
557 raise ExtractorError('A network error has occurred.', cause
=e
, expected
=True)
558 except (KeyError, StopIteration) as e
:
559 raise ExtractorError('An extractor error has occurred.', cause
=e
)
561 def __maybe_fake_ip_and_retry(self
, countries
):
562 if (not self
._downloader
.params
.get('geo_bypass_country', None)
564 and self
._downloader
.params
.get('geo_bypass', True)
565 and not self
._x
_forwarded
_for
_ip
567 country_code
= random
.choice(countries
)
568 self
._x
_forwarded
_for
_ip
= GeoUtils
.random_ipv4(country_code
)
569 if self
._x
_forwarded
_for
_ip
:
571 'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
572 % (self
._x
_forwarded
_for
_ip
, country_code
.upper()))
    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        # The downloader (a YoutubeDL instance) provides params, output and
        # network access (urlopen) for this extractor.
        self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # Intentionally a no-op in the base class.
    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # Intentionally unimplemented in the base class.
590 """A string for getting the InfoExtractor with get_info_extractor"""
591 return compat_str(cls
.__name
__[:-2])
595 return compat_str(type(self
).__name
__[:-2])
598 def __can_accept_status_code(err
, expected_status
):
599 assert isinstance(err
, compat_urllib_error
.HTTPError
)
600 if expected_status
is None:
602 if isinstance(expected_status
, compat_integer_types
):
603 return err
.code
== expected_status
604 elif isinstance(expected_status
, (list, tuple)):
605 return err
.code
in expected_status
606 elif callable(expected_status
):
607 return expected_status(err
.code
) is True
611 def _request_webpage(self
, url_or_request
, video_id
, note
=None, errnote
=None, fatal
=True, data
=None, headers
={}, query={}
, expected_status
=None):
613 Return the response handle.
615 See _download_webpage docstring for arguments specification.
617 if not self
._downloader
._first
_webpage
_request
:
618 sleep_interval
= float_or_none(self
._downloader
.params
.get('sleep_interval_requests')) or 0
619 if sleep_interval
> 0:
620 self
.to_screen('Sleeping %s seconds ...' % sleep_interval
)
621 time
.sleep(sleep_interval
)
623 self
._downloader
._first
_webpage
_request
= False
626 self
.report_download_webpage(video_id
)
627 elif note
is not False:
629 self
.to_screen('%s' % (note
,))
631 self
.to_screen('%s: %s' % (video_id
, note
))
633 # Some sites check X-Forwarded-For HTTP header in order to figure out
634 # the origin of the client behind proxy. This allows bypassing geo
635 # restriction by faking this header's value to IP that belongs to some
636 # geo unrestricted country. We will do so once we encounter any
637 # geo restriction error.
638 if self
._x
_forwarded
_for
_ip
:
639 if 'X-Forwarded-For' not in headers
:
640 headers
['X-Forwarded-For'] = self
._x
_forwarded
_for
_ip
642 if isinstance(url_or_request
, compat_urllib_request
.Request
):
643 url_or_request
= update_Request(
644 url_or_request
, data
=data
, headers
=headers
, query
=query
)
647 url_or_request
= update_url_query(url_or_request
, query
)
648 if data
is not None or headers
:
649 url_or_request
= sanitized_Request(url_or_request
, data
, headers
)
650 exceptions
= [compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
]
651 if hasattr(ssl
, 'CertificateError'):
652 exceptions
.append(ssl
.CertificateError
)
654 return self
._downloader
.urlopen(url_or_request
)
655 except tuple(exceptions
) as err
:
656 if isinstance(err
, compat_urllib_error
.HTTPError
):
657 if self
.__can
_accept
_status
_code
(err
, expected_status
):
658 # Retain reference to error to prevent file object from
659 # being closed before it can be read. Works around the
660 # effects of <https://bugs.python.org/issue15002>
661 # introduced in Python 3.4.1.
668 errnote
= 'Unable to download webpage'
670 errmsg
= '%s: %s' % (errnote
, error_to_compat_str(err
))
672 raise ExtractorError(errmsg
, sys
.exc_info()[2], cause
=err
)
674 self
._downloader
.report_warning(errmsg
)
677 def _download_webpage_handle(self
, url_or_request
, video_id
, note
=None, errnote
=None, fatal
=True, encoding
=None, data
=None, headers
={}, query={}
, expected_status
=None):
679 Return a tuple (page content as string, URL handle).
681 See _download_webpage docstring for arguments specification.
683 # Strip hashes from the URL (#1038)
684 if isinstance(url_or_request
, (compat_str
, str)):
685 url_or_request
= url_or_request
.partition('#')[0]
687 urlh
= self
._request
_webpage
(url_or_request
, video_id
, note
, errnote
, fatal
, data
=data
, headers
=headers
, query
=query
, expected_status
=expected_status
)
691 content
= self
._webpage
_read
_content
(urlh
, url_or_request
, video_id
, note
, errnote
, fatal
, encoding
=encoding
)
692 return (content
, urlh
)
695 def _guess_encoding_from_content(content_type
, webpage_bytes
):
696 m
= re
.match(r
'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type
)
698 encoding
= m
.group(1)
700 m
= re
.search(br
'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
701 webpage_bytes[:1024])
703 encoding = m.group(1).decode('ascii')
704 elif webpage_bytes.startswith(b'\xff\xfe'):
711 def __check_blocked(self, content):
712 first_block = content[:512]
713 if ('<title>Access to this site is blocked</title>' in content
714 and 'Websense' in first_block):
715 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
716 blocked_iframe = self._html_search_regex(
717 r'<iframe src="([^
"]+)"', content,
718 'Websense information URL
', default=None)
720 msg += ' Visit
%s for more details
' % blocked_iframe
721 raise ExtractorError(msg, expected=True)
722 if '<title
>The URL you requested has been blocked
</title
>' in first_block:
724 'Access to this webpage has been blocked by Indian censorship
. '
725 'Use a VPN
or proxy
server (with --proxy
) to route around it
.')
726 block_msg = self._html_search_regex(
727 r'</h1
><p
>(.*?
)</p
>',
728 content, 'block message
', default=None)
730 msg += ' (Message
: "%s")' % block_msg.replace('\n', ' ')
731 raise ExtractorError(msg, expected=True)
732 if ('<title
>TTK
:: Доступ к ресурсу ограничен
</title
>' in content
733 and 'blocklist
.rkn
.gov
.ru
' in content):
734 raise ExtractorError(
735 'Access to this webpage has been blocked by decision of the Russian government
. '
736 'Visit http
://blocklist
.rkn
.gov
.ru
/ for a block reason
.',
739 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
740 content_type = urlh.headers.get('Content
-Type
', '')
741 webpage_bytes = urlh.read()
742 if prefix is not None:
743 webpage_bytes = prefix + webpage_bytes
745 encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
746 if self._downloader.params.get('dump_intermediate_pages
', False):
747 self.to_screen('Dumping request to
' + urlh.geturl())
748 dump = base64.b64encode(webpage_bytes).decode('ascii
')
749 self._downloader.to_screen(dump)
750 if self._downloader.params.get('write_pages
', False):
751 basen = '%s_%s' % (video_id, urlh.geturl())
753 h = '___
' + hashlib.md5(basen.encode('utf
-8')).hexdigest()
754 basen = basen[:240 - len(h)] + h
755 raw_filename = basen + '.dump
'
756 filename = sanitize_filename(raw_filename, restricted=True)
757 self.to_screen('Saving request to
' + filename)
758 # Working around MAX_PATH limitation on Windows (see
759 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
760 if compat_os_name == 'nt
':
761 absfilepath = os.path.abspath(filename)
762 if len(absfilepath) > 259:
763 filename = '\\\\?
\\' + absfilepath
764 with open(filename, 'wb
') as outf:
765 outf.write(webpage_bytes)
768 content = webpage_bytes.decode(encoding, 'replace
')
770 content = webpage_bytes.decode('utf
-8', 'replace
')
772 self.__check_blocked(content)
776 def _download_webpage(
777 self, url_or_request, video_id, note=None, errnote=None,
778 fatal=True, tries=1, timeout=5, encoding=None, data=None,
779 headers={}, query={}, expected_status=None):
781 Return the data of the page as a string.
784 url_or_request -- plain text URL as a string or
785 a compat_urllib_request.Requestobject
786 video_id -- Video/playlist/item identifier (string)
789 note -- note printed before downloading (string)
790 errnote -- note printed in case of an error (string)
791 fatal -- flag denoting whether error should be considered fatal,
792 i.e. whether it should cause ExtractionError to be raised,
793 otherwise a warning will be reported and extraction continued
794 tries -- number of tries
795 timeout -- sleep interval between tries
796 encoding -- encoding for a page content decoding, guessed automatically
797 when not explicitly specified
798 data -- POST data (bytes)
799 headers -- HTTP headers (dict)
800 query -- URL query (dict)
801 expected_status -- allows to accept failed HTTP requests (non 2xx
802 status code) by explicitly specifying a set of accepted status
803 codes. Can be any of the following entities:
804 - an integer type specifying an exact failed status code to
806 - a list or a tuple of integer types specifying a list of
807 failed status codes to accept
808 - a callable accepting an actual failed status code and
809 returning True if it should be accepted
810 Note that this argument does not affect success status codes (2xx)
811 which are always accepted.
816 while success is False:
818 res = self._download_webpage_handle(
819 url_or_request, video_id, note, errnote, fatal,
820 encoding=encoding, data=data, headers=headers, query=query,
821 expected_status=expected_status)
823 except compat_http_client.IncompleteRead as e:
825 if try_count >= tries:
827 self._sleep(timeout, video_id)
834 def _download_xml_handle(
835 self, url_or_request, video_id, note='Downloading XML
',
836 errnote='Unable to download XML
', transform_source=None,
837 fatal=True, encoding=None, data=None, headers={}, query={},
838 expected_status=None):
840 Return a tuple (xml as an compat_etree_Element, URL handle).
842 See _download_webpage docstring for arguments specification.
844 res = self._download_webpage_handle(
845 url_or_request, video_id, note, errnote, fatal=fatal,
846 encoding=encoding, data=data, headers=headers, query=query,
847 expected_status=expected_status)
850 xml_string, urlh = res
851 return self._parse_xml(
852 xml_string, video_id, transform_source=transform_source,
856 self, url_or_request, video_id,
857 note='Downloading XML
', errnote='Unable to download XML
',
858 transform_source=None, fatal=True, encoding=None,
859 data=None, headers={}, query={}, expected_status=None):
861 Return the xml as an compat_etree_Element.
863 See _download_webpage docstring for arguments specification.
865 res = self._download_xml_handle(
866 url_or_request, video_id, note=note, errnote=errnote,
867 transform_source=transform_source, fatal=fatal, encoding=encoding,
868 data=data, headers=headers, query=query,
869 expected_status=expected_status)
870 return res if res is False else res[0]
872 def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
874 xml_string = transform_source(xml_string)
876 return compat_etree_fromstring(xml_string.encode('utf
-8'))
877 except compat_xml_parse_error as ve:
878 errmsg = '%s: Failed to parse XML
' % video_id
880 raise ExtractorError(errmsg, cause=ve)
882 self.report_warning(errmsg + str(ve))
884 def _download_json_handle(
885 self, url_or_request, video_id, note='Downloading JSON metadata
',
886 errnote='Unable to download JSON metadata
', transform_source=None,
887 fatal=True, encoding=None, data=None, headers={}, query={},
888 expected_status=None):
890 Return a tuple (JSON object, URL handle).
892 See _download_webpage docstring for arguments specification.
894 res = self._download_webpage_handle(
895 url_or_request, video_id, note, errnote, fatal=fatal,
896 encoding=encoding, data=data, headers=headers, query=query,
897 expected_status=expected_status)
900 json_string, urlh = res
901 return self._parse_json(
902 json_string, video_id, transform_source=transform_source,
906 self, url_or_request, video_id, note='Downloading JSON metadata
',
907 errnote='Unable to download JSON metadata
', transform_source=None,
908 fatal=True, encoding=None, data=None, headers={}, query={},
909 expected_status=None):
911 Return the JSON object as a dict.
913 See _download_webpage docstring for arguments specification.
915 res = self._download_json_handle(
916 url_or_request, video_id, note=note, errnote=errnote,
917 transform_source=transform_source, fatal=fatal, encoding=encoding,
918 data=data, headers=headers, query=query,
919 expected_status=expected_status)
920 return res if res is False else res[0]
922 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
924 json_string = transform_source(json_string)
926 return json.loads(json_string)
927 except ValueError as ve:
928 errmsg = '%s: Failed to parse JSON
' % video_id
930 raise ExtractorError(errmsg, cause=ve)
932 self.report_warning(errmsg + str(ve))
934 def report_warning(self, msg, video_id=None):
935 idstr = '' if video_id is None else '%s: ' % video_id
936 self._downloader.report_warning(
937 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen('%s: Extracting information' % id_or_name)
    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen('%s: Downloading webpage' % video_id)
    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')
    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')
960 def raise_login_required(msg='This video
is only available
for registered users
'):
961 raise ExtractorError(
962 '%s. Use
--username
and --password
or --netrc to provide account credentials
.' % msg,
966 def raise_geo_restricted(msg='This video
is not available
from your location due to geo restriction
', countries=None):
967 raise GeoRestrictedError(msg, countries=countries)
969 # Methods for following #608
971 def url_result(url, ie=None, video_id=None, video_title=None):
972 """Returns a URL that points to a page that should be processed"""
973 # TODO: ie should be the class used for getting the info
974 video_info = {'_type
': 'url
',
977 if video_id is not None:
978 video_info['id'] = video_id
979 if video_title is not None:
980 video_info['title
'] = video_title
983 def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
985 self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
987 return self.playlist_result(
988 urls, playlist_id=playlist_id, playlist_title=playlist_title)
991 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
992 """Returns a playlist"""
993 video_info = {'_type
': 'playlist
',
995 video_info.update(kwargs)
997 video_info['id'] = playlist_id
999 video_info['title
'] = playlist_title
1000 if playlist_description is not None:
1001 video_info['description
'] = playlist_description
1004 def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1006 Perform a regex search on the given string, using a single or a list of
1007 patterns returning the first matching group.
1008 In case of failure return a default value or raise a WARNING or a
1009 RegexNotFoundError, depending on fatal, specifying the field name.
1011 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
1012 mobj = re.search(pattern, string, flags)
1015 mobj = re.search(p, string, flags)
1019 if not self._downloader.params.get('no_color
') and compat_os_name != 'nt
' and sys.stderr.isatty():
1020 _name = '\033[0;34m
%s\033[0m
' % name
1026 # return the first matching group
1027 return next(g for g in mobj.groups() if g is not None)
1029 return mobj.group(group)
1030 elif default is not NO_DEFAULT:
1033 raise RegexNotFoundError('Unable to extract
%s' % _name)
1035 self._downloader.report_warning('unable to extract
%s' % _name + bug_reports_message())
1038 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1040 Like _search_regex, but strips HTML tags and unescapes entities.
1042 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1044 return clean_html(res).strip()
1048 def _get_netrc_login_info(self, netrc_machine=None):
1051 netrc_machine = netrc_machine or self._NETRC_MACHINE
1053 if self._downloader.params.get('usenetrc
', False):
1055 info = netrc.netrc().authenticators(netrc_machine)
1056 if info is not None:
1060 raise netrc.NetrcParseError(
1061 'No authenticators
for %s' % netrc_machine)
1062 except (IOError, netrc.NetrcParseError) as err:
1063 self._downloader.report_warning(
1064 'parsing
.netrc
: %s' % error_to_compat_str(err))
1066 return username, password
1068 def _get_login_info(self, username_option='username
', password_option='password
', netrc_machine=None):
1070 Get the login info as (username, password)
1071 First look for the manually specified credentials using username_option
1072 and password_option as keys in params dictionary. If no such credentials
1073 available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1075 If there's no info available
, return (None, None)
1077 if self._downloader is None:
1080 downloader_params = self._downloader.params
1082 # Attempt to use provided username and password or .netrc data
1083 if downloader_params.get(username_option) is not None:
1084 username = downloader_params[username_option]
1085 password = downloader_params[password_option]
1087 username, password = self._get_netrc_login_info(netrc_machine)
1089 return username, password
1091 def _get_tfa_info(self, note='two-factor verification code'):
1093 Get the two
-factor authentication info
1094 TODO
- asking the user will be required
for sms
/phone verify
1095 currently just uses the command line option
1096 If there
's no info available, return None
1098 if self._downloader is None:
1100 downloader_params = self._downloader.params
1102 if downloader_params.get('twofactor
') is not None:
1103 return downloader_params['twofactor
']
1105 return compat_getpass('Type
%s and press
[Return
]: ' % note)
1107 # Helper functions for extracting OpenGraph info
1109 def _og_regexes(prop):
1110 content_re = r'content
=(?
:"([^"]+?
)"|\'([^\']+?)\'|\s*([^\s"\'=<>`
]+?
))'
1111 property_re = (r'(?
:name|
property)=(?
:\'og
[:-]%(prop)s\'|
"og[:-]%(prop)s"|\s
*og
[:-]%(prop)s\b)'
1112 % {'prop': re.escape(prop)})
1113 template = r'<meta
[^
>]+?
%s[^
>]+?
%s'
1115 template % (property_re, content_re),
1116 template % (content_re, property_re),
1120 def _meta_regex(prop):
1121 return r'''(?isx)<meta
1122 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1123 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1125 def _og_search_property(self, prop, html, name=None, **kargs):
1126 if not isinstance(prop, (list, tuple)):
1129 name = 'OpenGraph
%s' % prop[0]
1132 og_regexes.extend(self._og_regexes(p))
1133 escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1136 return unescapeHTML(escaped)
1138 def _og_search_thumbnail(self, html, **kargs):
1139 return self._og_search_property('image
', html, 'thumbnail URL
', fatal=False, **kargs)
1141 def _og_search_description(self, html, **kargs):
1142 return self._og_search_property('description
', html, fatal=False, **kargs)
1144 def _og_search_title(self, html, **kargs):
1145 return self._og_search_property('title
', html, **kargs)
1147 def _og_search_video_url(self, html, name='video url
', secure=True, **kargs):
1148 regexes = self._og_regexes('video
') + self._og_regexes('video
:url
')
1150 regexes = self._og_regexes('video
:secure_url
') + regexes
1151 return self._html_search_regex(regexes, html, name, **kargs)
1153 def _og_search_url(self, html, **kargs):
1154 return self._og_search_property('url
', html, **kargs)
1156 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1157 if not isinstance(name, (list, tuple)):
1159 if display_name is None:
1160 display_name = name[0]
1161 return self._html_search_regex(
1162 [self._meta_regex(n) for n in name],
1163 html, display_name, fatal=fatal, group='content
', **kwargs)
1165 def _dc_search_uploader(self, html):
1166 return self._html_search_meta('dc
.creator
', html, 'uploader
')
1168 def _rta_search(self, html):
1169 # See http://www.rtalabel.org/index.php?content=howtofaq#single
1170 if re.search(r'(?ix
)<meta\s
+name
="rating"\s
+'
1171 r' content
="RTA-5042-1996-1400-1577-RTA"',
1176 def _media_rating_search(self, html):
1177 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1178 rating = self._html_search_meta('rating
', html)
1190 return RATING_TABLE.get(rating.lower())
1192 def _family_friendly_search(self, html):
1193 # See http://schema.org/VideoObject
1194 family_friendly = self._html_search_meta(
1195 'isFamilyFriendly
', html, default=None)
1197 if not family_friendly:
1206 return RATING_TABLE.get(family_friendly.lower())
1208 def _twitter_search_player(self, html):
1209 return self._html_search_meta('twitter
:player
', html,
1210 'twitter card player
')
1212 def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1213 json_ld_list = list(re.finditer(JSON_LD_RE, html))
1214 default = kwargs.get('default
', NO_DEFAULT)
1215 # JSON-LD may be malformed and thus `fatal` should be respected.
1216 # At the same time `default` may be passed that assumes `fatal=False`
1217 # for _search_regex. Let's simulate the same behavior here
as well
.
1218 fatal
= kwargs
.get('fatal', True) if default
== NO_DEFAULT
else False
1220 for mobj
in json_ld_list
:
1221 json_ld_item
= self
._parse
_json
(
1222 mobj
.group('json_ld'), video_id
, fatal
=fatal
)
1223 if not json_ld_item
:
1225 if isinstance(json_ld_item
, dict):
1226 json_ld
.append(json_ld_item
)
1227 elif isinstance(json_ld_item
, (list, tuple)):
1228 json_ld
.extend(json_ld_item
)
1230 json_ld
= self
._json
_ld
(json_ld
, video_id
, fatal
=fatal
, expected_type
=expected_type
)
1233 if default
is not NO_DEFAULT
:
1236 raise RegexNotFoundError('Unable to extract JSON-LD')
1238 self
._downloader
.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1241 def _json_ld(self
, json_ld
, video_id
, fatal
=True, expected_type
=None):
1242 if isinstance(json_ld
, compat_str
):
1243 json_ld
= self
._parse
_json
(json_ld
, video_id
, fatal
=fatal
)
1247 if not isinstance(json_ld
, (list, tuple, dict)):
1249 if isinstance(json_ld
, dict):
1252 INTERACTION_TYPE_MAP
= {
1253 'CommentAction': 'comment',
1254 'AgreeAction': 'like',
1255 'DisagreeAction': 'dislike',
1256 'LikeAction': 'like',
1257 'DislikeAction': 'dislike',
1258 'ListenAction': 'view',
1259 'WatchAction': 'view',
1260 'ViewAction': 'view',
1263 def extract_interaction_type(e
):
1264 interaction_type
= e
.get('interactionType')
1265 if isinstance(interaction_type
, dict):
1266 interaction_type
= interaction_type
.get('@type')
1267 return str_or_none(interaction_type
)
1269 def extract_interaction_statistic(e
):
1270 interaction_statistic
= e
.get('interactionStatistic')
1271 if isinstance(interaction_statistic
, dict):
1272 interaction_statistic
= [interaction_statistic
]
1273 if not isinstance(interaction_statistic
, list):
1275 for is_e
in interaction_statistic
:
1276 if not isinstance(is_e
, dict):
1278 if is_e
.get('@type') != 'InteractionCounter':
1280 interaction_type
= extract_interaction_type(is_e
)
1281 if not interaction_type
:
1283 # For interaction count some sites provide string instead of
1284 # an integer (as per spec) with non digit characters (e.g. ",")
1285 # so extracting count with more relaxed str_to_int
1286 interaction_count
= str_to_int(is_e
.get('userInteractionCount'))
1287 if interaction_count
is None:
1289 count_kind
= INTERACTION_TYPE_MAP
.get(interaction_type
.split('/')[-1])
1292 count_key
= '%s_count' % count_kind
1293 if info
.get(count_key
) is not None:
1295 info
[count_key
] = interaction_count
1297 def extract_video_object(e
):
1298 assert e
['@type'] == 'VideoObject'
1300 'url': url_or_none(e
.get('contentUrl')),
1301 'title': unescapeHTML(e
.get('name')),
1302 'description': unescapeHTML(e
.get('description')),
1303 'thumbnail': url_or_none(e
.get('thumbnailUrl') or e
.get('thumbnailURL')),
1304 'duration': parse_duration(e
.get('duration')),
1305 'timestamp': unified_timestamp(e
.get('uploadDate')),
1306 'uploader': str_or_none(e
.get('author')),
1307 'filesize': float_or_none(e
.get('contentSize')),
1308 'tbr': int_or_none(e
.get('bitrate')),
1309 'width': int_or_none(e
.get('width')),
1310 'height': int_or_none(e
.get('height')),
1311 'view_count': int_or_none(e
.get('interactionCount')),
1313 extract_interaction_statistic(e
)
1317 item_type
= e
.get('@type')
1318 if expected_type
is not None and expected_type
!= item_type
:
1320 if item_type
in ('TVEpisode', 'Episode'):
1321 episode_name
= unescapeHTML(e
.get('name'))
1323 'episode': episode_name
,
1324 'episode_number': int_or_none(e
.get('episodeNumber')),
1325 'description': unescapeHTML(e
.get('description')),
1327 if not info
.get('title') and episode_name
:
1328 info
['title'] = episode_name
1329 part_of_season
= e
.get('partOfSeason')
1330 if isinstance(part_of_season
, dict) and part_of_season
.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1332 'season': unescapeHTML(part_of_season
.get('name')),
1333 'season_number': int_or_none(part_of_season
.get('seasonNumber')),
1335 part_of_series
= e
.get('partOfSeries') or e
.get('partOfTVSeries')
1336 if isinstance(part_of_series
, dict) and part_of_series
.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1337 info
['series'] = unescapeHTML(part_of_series
.get('name'))
1338 elif item_type
== 'Movie':
1340 'title': unescapeHTML(e
.get('name')),
1341 'description': unescapeHTML(e
.get('description')),
1342 'duration': parse_duration(e
.get('duration')),
1343 'timestamp': unified_timestamp(e
.get('dateCreated')),
1345 elif item_type
in ('Article', 'NewsArticle'):
1347 'timestamp': parse_iso8601(e
.get('datePublished')),
1348 'title': unescapeHTML(e
.get('headline')),
1349 'description': unescapeHTML(e
.get('articleBody')),
1351 elif item_type
== 'VideoObject':
1352 extract_video_object(e
)
1353 if expected_type
is None:
1357 video
= e
.get('video')
1358 if isinstance(video
, dict) and video
.get('@type') == 'VideoObject':
1359 extract_video_object(video
)
1360 if expected_type
is None:
1364 return dict((k
, v
) for k
, v
in info
.items() if v
is not None)
1367 def _hidden_inputs(html
):
1368 html
= re
.sub(r
'<!--(?:(?!<!--).)*-->', '', html
)
1370 for input in re
.findall(r
'(?i)(<input[^>]+>)', html
):
1371 attrs
= extract_attributes(input)
1374 if attrs
.get('type') not in ('hidden', 'submit'):
1376 name
= attrs
.get('name') or attrs
.get('id')
1377 value
= attrs
.get('value')
1378 if name
and value
is not None:
1379 hidden_inputs
[name
] = value
1380 return hidden_inputs
1382 def _form_hidden_inputs(self
, form_id
, html
):
1383 form
= self
._search
_regex
(
1384 r
'(?is)<form[^>]+?id=(["\'])%s\
1[^
>]*>(?P
<form
>.+?
)</form
>' % form_id,
1385 html, '%s form
' % form_id, group='form
')
1386 return self._hidden_inputs(form)
1389 regex = r' *((?P
<reverse
>\
+)?
(?P
<field
>[a
-zA
-Z0
-9_]+)((?P
<seperator
>[~
:])(?P
<limit
>.*?
))?
)?
*$
'
1391 default = ('hidden
', 'hasvid
', 'ie_pref
', 'lang
', 'quality
',
1392 'res
', 'fps
', 'codec
:vp9
.2
', 'size
', 'br
', 'asr
',
1393 'proto
', 'ext
', 'has_audio
', 'source
', 'format_id
') # These must not be aliases
1396 'vcodec
': {'type': 'ordered
', 'regex
': True,
1397 'order
': ['av0?
1', 'vp0?
9.2', 'vp0?
9', '[hx
]265|he?vc?
', '[hx
]264|avc
', 'vp0?
8', 'mp4v|h263
', 'theora
', '', None, 'none
']},
1398 'acodec
': {'type': 'ordered
', 'regex
': True,
1399 'order
': ['opus
', 'vorbis
', 'aac
', 'mp?
4a?
', 'mp3
', 'e?a?c
-?
3', 'dts
', '', None, 'none
']},
1400 'proto
': {'type': 'ordered
', 'regex
': True, 'field
': 'protocol
',
1401 'order
': ['(ht|f
)tps
', '(ht|f
)tp$
', 'm3u8
.+', 'm3u8
', '.*dash
', '', 'mms|rtsp
', 'none
', 'f4
']},
1402 'vext
': {'type': 'ordered
', 'field
': 'video_ext
',
1403 'order
': ('mp4
', 'webm
', 'flv
', '', 'none
'),
1404 'order_free
': ('webm
', 'mp4
', 'flv
', '', 'none
')},
1405 'aext
': {'type': 'ordered
', 'field
': 'audio_ext
',
1406 'order
': ('m4a
', 'aac
', 'mp3
', 'ogg
', 'opus
', 'webm
', '', 'none
'),
1407 'order_free
': ('opus
', 'ogg
', 'webm
', 'm4a
', 'mp3
', 'aac
', '', 'none
')},
1408 'hidden
': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1409 'ie_pref
': {'priority': True, 'type': 'extractor'},
1410 'hasvid
': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1411 'hasaud
': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1412 'lang
': {'priority': True, 'convert': 'ignore', 'type': 'extractor', 'field': 'language_preference'},
1413 'quality
': {'convert': 'float_none', 'type': 'extractor'},
1414 'filesize
': {'convert': 'bytes'},
1415 'fs_approx
': {'convert': 'bytes', 'field': 'filesize_approx'},
1416 'id': {'convert': 'string', 'field': 'format_id'},
1417 'height
': {'convert': 'float_none'},
1418 'width
': {'convert': 'float_none'},
1419 'fps
': {'convert': 'float_none'},
1420 'tbr
': {'convert': 'float_none'},
1421 'vbr
': {'convert': 'float_none'},
1422 'abr
': {'convert': 'float_none'},
1423 'asr
': {'convert': 'float_none'},
1424 'source
': {'convert': 'ignore', 'type': 'extractor', 'field': 'source_preference'},
1426 'codec
': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1427 'br
': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1428 'size
': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1429 'ext
': {'type': 'combined', 'field': ('vext', 'aext')},
1430 'res
': {'type': 'multiple', 'field': ('height', 'width'), 'function': min},
1432 # Most of these exist only for compatibility reasons
1433 'dimension
': {'type': 'alias', 'field': 'res'},
1434 'resolution
': {'type': 'alias', 'field': 'res'},
1435 'extension
': {'type': 'alias', 'field': 'ext'},
1436 'bitrate
': {'type': 'alias', 'field': 'br'},
1437 'total_bitrate
': {'type': 'alias', 'field': 'tbr'},
1438 'video_bitrate
': {'type': 'alias', 'field': 'vbr'},
1439 'audio_bitrate
': {'type': 'alias', 'field': 'abr'},
1440 'framerate
': {'type': 'alias', 'field': 'fps'},
1441 'language_preference
': {'type': 'alias', 'field': 'lang'}, # not named as 'language
' because such a field exists
1442 'protocol
': {'type': 'alias', 'field': 'proto'},
1443 'source_preference
': {'type': 'alias', 'field': 'source'},
1444 'filesize_approx
': {'type': 'alias', 'field': 'fs_approx'},
1445 'filesize_estimate
': {'type': 'alias', 'field': 'size'},
1446 'samplerate
': {'type': 'alias', 'field': 'asr'},
1447 'video_ext
': {'type': 'alias', 'field': 'vext'},
1448 'audio_ext
': {'type': 'alias', 'field': 'aext'},
1449 'video_codec
': {'type': 'alias', 'field': 'vcodec'},
1450 'audio_codec
': {'type': 'alias', 'field': 'acodec'},
1451 'video
': {'type': 'alias', 'field': 'hasvid'},
1452 'has_video
': {'type': 'alias', 'field': 'hasvid'},
1453 'audio
': {'type': 'alias', 'field': 'hasaud'},
1454 'has_audio
': {'type': 'alias', 'field': 'hasaud'},
1455 'extractor
': {'type': 'alias', 'field': 'ie_pref'},
1456 'preference
': {'type': 'alias', 'field': 'ie_pref'},
1457 'extractor_preference
': {'type': 'alias', 'field': 'ie_pref'},
1458 'format_id
': {'type': 'alias', 'field': 'id'},
1463 def _get_field_setting(self, field, key):
1464 if field not in self.settings:
1465 self.settings[field] = {}
1466 propObj = self.settings[field]
1467 if key not in propObj:
1468 type = propObj.get('type')
1470 default = 'preference
' if type == 'extractor
' else (field,) if type in ('combined
', 'multiple
') else field
1471 elif key == 'convert
':
1472 default = 'order
' if type == 'ordered
' else 'float_string
' if field else 'ignore
'
1474 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,), 'function': max}.get(key, None)
1475 propObj[key] = default
1478 def _resolve_field_value(self, field, value, convertNone=False):
1483 value = value.lower()
1484 conversion = self._get_field_setting(field, 'convert
')
1485 if conversion == 'ignore
':
1487 if conversion == 'string
':
1489 elif conversion == 'float_none
':
1490 return float_or_none(value)
1491 elif conversion == 'bytes':
1492 return FileDownloader.parse_bytes(value)
1493 elif conversion == 'order
':
1494 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free
')) or self._get_field_setting(field, 'order
')
1495 use_regex = self._get_field_setting(field, 'regex
')
1496 list_length = len(order_list)
1497 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1498 if use_regex and value is not None:
1499 for i, regex in enumerate(order_list):
1500 if regex and re.match(regex, value):
1501 return list_length - i
1502 return list_length - empty_pos # not in list
1503 else: # not regex or value = None
1504 return list_length - (order_list.index(value) if value in order_list else empty_pos)
1506 if value.isnumeric():
1509 self.settings[field]['convert
'] = 'string
'
1512 def evaluate_params(self, params, sort_extractor):
1513 self._use_free_order = params.get('prefer_free_formats
', False)
1514 self._sort_user = params.get('format_sort
', [])
1515 self._sort_extractor = sort_extractor
1517 def add_item(field, reverse, closest, limit_text):
1518 field = field.lower()
1519 if field in self._order:
1521 self._order.append(field)
1522 limit = self._resolve_field_value(field, limit_text)
1525 'closest
': False if limit is None else closest,
1526 'limit_text
': limit_text,
1528 if field in self.settings:
1529 self.settings[field].update(data)
1531 self.settings[field] = data
1534 tuple(field for field in self.default if self._get_field_setting(field, 'forced
'))
1535 + (tuple() if params.get('format_sort_force
', False)
1536 else tuple(field for field in self.default if self._get_field_setting(field, 'priority
')))
1537 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1539 for item in sort_list:
1540 match = re.match(self.regex, item)
1542 raise ExtractorError('Invalid format sort string
"%s" given by extractor
' % item)
1543 field = match.group('field
')
1546 if self._get_field_setting(field, 'type') == 'alias
':
1547 field = self._get_field_setting(field, 'field
')
1548 reverse = match.group('reverse
') is not None
1549 closest = match.group('seperator
') == '~
'
1550 limit_text = match.group('limit
')
1552 has_limit = limit_text is not None
1553 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined
'
1554 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit
')
1556 fields = self._get_field_setting(field, 'field
') if has_multiple_fields else (field,)
1557 limits = limit_text.split(":") if has_multiple_limits else (limit_text,) if has_limit else tuple()
1558 limit_count = len(limits)
1559 for (i, f) in enumerate(fields):
1560 add_item(f, reverse, closest,
1561 limits[i] if i < limit_count
1562 else limits[0] if has_limit and not has_multiple_limits
1565 def print_verbose_info(self, to_screen):
1566 to_screen('[debug
] Sort order given by user
: %s' % ','.join(self._sort_user))
1567 if self._sort_extractor:
1568 to_screen('[debug
] Sort order given by extractor
: %s' % ', '.join(self._sort_extractor))
1569 to_screen('[debug
] Formats
sorted by
: %s' % ', '.join(['%s%s%s' % (
1570 '+' if self._get_field_setting(field, 'reverse
') else '', field,
1571 '%s%s(%s)' % ('~
' if self._get_field_setting(field, 'closest
') else ':',
1572 self._get_field_setting(field, 'limit_text
'),
1573 self._get_field_setting(field, 'limit
'))
1574 if self._get_field_setting(field, 'limit_text
') is not None else '')
1575 for field in self._order if self._get_field_setting(field, 'visible
')]))
1577 def _calculate_field_preference_from_value(self, format, field, type, value):
1578 reverse = self._get_field_setting(field, 'reverse
')
1579 closest = self._get_field_setting(field, 'closest
')
1580 limit = self._get_field_setting(field, 'limit
')
1582 if type == 'extractor
':
1583 maximum = self._get_field_setting(field, 'max')
1584 if value is None or (maximum is not None and value >= maximum):
1586 elif type == 'boolean
':
1587 in_list = self._get_field_setting(field, 'in_list
')
1588 not_in_list = self._get_field_setting(field, 'not_in_list
')
1589 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1590 elif type == 'ordered
':
1591 value = self._resolve_field_value(field, value, True)
1593 # try to convert to number
1594 val_num = float_or_none(value)
1595 is_num = self._get_field_setting(field, 'convert
') != 'string
' and val_num is not None
1599 return ((-10, 0) if value is None
1600 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
1601 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1602 else (0, value, 0) if not reverse and (limit is None or value <= limit)
1603 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1604 else (-1, value, 0))
1606 def _calculate_field_preference(self, format, field):
1607 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
1608 get_value = lambda f: format.get(self._get_field_setting(f, 'field
'))
1609 if type == 'multiple
':
1610 type = 'field
' # Only 'field
' is allowed in multiple for now
1611 actual_fields = self._get_field_setting(field, 'field
')
1613 def wrapped_function(values):
1614 values = tuple(filter(lambda x: x is not None, values))
1615 return (self._get_field_setting(field, 'function
')(*values) if len(values) > 1
1616 else values[0] if values
1619 value = wrapped_function((get_value(f) for f in actual_fields))
1621 value = get_value(field)
1622 return self._calculate_field_preference_from_value(format, field, type, value)
1624 def calculate_preference(self, format):
1625 # Determine missing protocol
1626 if not format.get('protocol
'):
1627 format['protocol
'] = determine_protocol(format)
1629 # Determine missing ext
1630 if not format.get('ext
') and 'url
' in format:
1631 format['ext
'] = determine_ext(format['url
'])
1632 if format.get('vcodec
') == 'none
':
1633 format['audio_ext
'] = format['ext
']
1634 format['video_ext
'] = 'none
'
1636 format['video_ext
'] = format['ext
']
1637 format['audio_ext
'] = 'none
'
1638 # if format.get('preference
') is None and format.get('ext
') in ('f4f
', 'f4m
'): # Not supported?
1639 # format['preference
'] = -1000
1641 # Determine missing bitrates
1642 if format.get('tbr
') is None:
1643 if format.get('vbr
') is not None and format.get('abr
') is not None:
1644 format['tbr
'] = format.get('vbr
', 0) + format.get('abr
', 0)
1646 if format.get('vcodec
') != "none" and format.get('vbr
') is None:
1647 format['vbr
'] = format.get('tbr
') - format.get('abr
', 0)
1648 if format.get('acodec
') != "none" and format.get('abr
') is None:
1649 format['abr
'] = format.get('tbr
') - format.get('vbr
', 0)
1651 return tuple(self._calculate_field_preference(format, field) for field in self._order)
1653 def _sort_formats(self, formats, field_preference=[]):
1655 raise ExtractorError('No video formats found
')
1656 format_sort = self.FormatSort() # params and to_screen are taken from the downloader
1657 format_sort.evaluate_params(self._downloader.params, field_preference)
1658 if self._downloader.params.get('verbose
', False):
1659 format_sort.print_verbose_info(self._downloader.to_screen)
1660 formats.sort(key=lambda f: format_sort.calculate_preference(f))
1662 def _check_formats(self, formats, video_id):
1664 formats[:] = filter(
1665 lambda f: self._is_valid_url(
1667 item='%s video format
' % f.get('format_id
') if f.get('format_id
') else 'video
'),
1671 def _remove_duplicate_formats(formats):
1675 if f['url
'] not in format_urls:
1676 format_urls.add(f['url
'])
1677 unique_formats.append(f)
1678 formats[:] = unique_formats
1680 def _is_valid_url(self, url, video_id, item='video
', headers={}):
1681 url = self._proto_relative_url(url, scheme='http
:')
1682 # For now assume non HTTP(S) URLs always valid
1683 if not (url.startswith('http
://') or url.startswith('https
://')):
1686 self._request_webpage(url, video_id, 'Checking
%s URL
' % item, headers=headers)
1688 except ExtractorError as e:
1690 '%s: %s URL
is invalid
, skipping
: %s'
1691 % (video_id, item, error_to_compat_str(e.cause)))
1694 def http_scheme(self):
1695 """ Either "http:" or "https:", depending on the user's preferences
"""
1698 if self._downloader.params.get('prefer_insecure', False)
1701 def _proto_relative_url(self, url, scheme=None):
1704 if url.startswith('//'):
1706 scheme = self.http_scheme()
1711 def _sleep(self, timeout, video_id, msg_template=None):
1712 if msg_template is None:
1713 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1714 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1718 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1719 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1720 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1721 manifest = self._download_xml(
1722 manifest_url, video_id, 'Downloading f4m manifest',
1723 'Unable to download f4m manifest',
1724 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1725 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1726 transform_source=transform_source,
1727 fatal=fatal, data=data, headers=headers, query=query)
1729 if manifest is False:
1732 return self._parse_f4m_formats(
1733 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1734 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1736 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1737 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1738 fatal=True, m3u8_id=None):
1739 if not isinstance(manifest, compat_etree_Element) and not fatal:
1742 # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1743 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1744 if akamai_pv is not None and ';' in akamai_pv.text:
1745 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1746 if playerVerificationChallenge.strip() != '':
1750 manifest_version = '1.0'
1751 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1753 manifest_version = '2.0'
1754 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1755 # Remove unsupported DRM protected media from final formats
1756 # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1757 media_nodes = remove_encrypted_media(media_nodes)
1761 manifest_base_url = get_base_url(manifest)
1763 bootstrap_info = xpath_element(
1764 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1765 'bootstrap info', default=None)
1768 mime_type = xpath_text(
1769 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1770 'base URL', default=None)
1771 if mime_type and mime_type.startswith('audio/'):
1774 for i, media_el in enumerate(media_nodes):
1775 tbr = int_or_none(media_el.attrib.get('bitrate'))
1776 width = int_or_none(media_el.attrib.get('width'))
1777 height = int_or_none(media_el.attrib.get('height'))
1778 format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1779 # If <bootstrapInfo> is present, the specified f4m is a
1780 # stream-level manifest, and only set-level manifests may refer to
1781 # external resources. See section 11.4 and section 4 of F4M spec
1782 if bootstrap_info is None:
1784 # @href is introduced in 2.0, see section 11.6 of F4M spec
1785 if manifest_version == '2.0':
1786 media_url = media_el.attrib.get('href')
1787 if media_url is None:
1788 media_url = media_el.attrib.get('url')
1792 media_url if media_url.startswith('http://') or media_url.startswith('https://')
1793 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1794 # If media_url is itself a f4m manifest do the recursive extraction
1795 # since bitrates in parent manifest (this one) and media_url manifest
1796 # may differ leading to inability to resolve the format by requested
1797 # bitrate in f4m downloader
1798 ext = determine_ext(manifest_url)
1800 f4m_formats = self._extract_f4m_formats(
1801 manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1802 transform_source=transform_source, fatal=fatal)
1803 # Sometimes stream-level manifest contains single media entry that
1804 # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1805 # At the same time parent's media entry in set-level manifest may
1806 # contain it. We will copy it from parent in such cases.
1807 if len(f4m_formats) == 1:
1810 'tbr': f.get('tbr') or tbr,
1811 'width': f.get('width') or width,
1812 'height': f.get('height') or height,
1813 'format_id': f.get('format_id') if not tbr else format_id,
1816 formats.extend(f4m_formats)
1819 formats.extend(self._extract_m3u8_formats(
1820 manifest_url, video_id, 'mp4', preference=preference,
1821 quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1824 'format_id': format_id,
1825 'url': manifest_url,
1826 'manifest_url': manifest_url,
1827 'ext': 'flv' if bootstrap_info is not None else None,
1833 'preference': preference,
    def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
        # Build a single pseudo-format entry that points at the master m3u8
        # playlist itself, so quality selection can be delegated to the
        # downloader instead of being resolved here.
            'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
            # Rank the meta format 100 below the given preference.
            # NOTE(review): preference == 0 is falsy and falls through to
            # -100 — presumably intentional, but confirm.
            'preference': preference - 100 if preference else -100,
            'resolution': 'multiple',
            'format_note': 'Quality selection URL',
    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None, quality=None,
                              m3u8_id=None, live=False, note=None, errnote=None,
                              fatal=True, data=None, headers={}, query={}):
        # Download an HLS (m3u8) playlist and parse it into format dicts.
        # Thin wrapper: the actual parsing happens in _parse_m3u8_formats.
        res = self._download_webpage_handle(
            note=note or 'Downloading m3u8 information',
            errnote=errnote or 'Failed to download m3u8 information',
            fatal=fatal, data=data, headers=headers, query=query)
        m3u8_doc, urlh = res
        # Use the final (post-redirect) URL as the base for relative URIs.
        m3u8_url = urlh.geturl()
        return self._parse_m3u8_formats(
            m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
            preference=preference, quality=quality, m3u8_id=m3u8_id,
            note=note, errnote=errnote, fatal=fatal, live=live, data=data,
            headers=headers, query=query, video_id=video_id)
    def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
                            entry_protocol='m3u8', preference=None, quality=None,
                            m3u8_id=None, live=False, note=None, errnote=None,
                            fatal=True, data=None, headers={}, query={}, video_id=None):
        """
        Parse an HLS playlist document into a list of format dicts.

        A master playlist is expanded into one format per variant stream and
        per audio/video rendition; a media playlist is returned as a single
        format pointing at m3u8_url.
        """
        # Reject DRM-protected playlists early.
        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
        if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc):  # Apple FairPlay
        # Resolve possibly-relative playlist URIs against the manifest URL.
        format_url = lambda u: (
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))
        split_discontinuity = self._downloader.params.get('hls_split_discontinuity', False)
        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
        # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        def _extract_m3u8_playlist_formats(format_url=None, m3u8_doc=None):
            # Download the playlist if only its URL was given, then split it
            # into per-#EXT-X-DISCONTINUITY sections when requested; returns
            # a list of dicts describing each section ('index', 'files', ...).
            res = self._download_webpage_handle(
                format_url, video_id,
                errnote=errnote or 'Failed to download m3u8 playlist information',
                fatal=fatal, data=data, headers=headers, query=query)
            m3u8_doc, urlh = res
            format_url = urlh.geturl()
            playlist_formats = []
                if split_discontinuity
            for line in m3u8_doc.splitlines():
                if not line.startswith('#'):
                    format_info['files'].append(line)
                elif split_discontinuity and line.startswith('#EXT-X-DISCONTINUITY'):
                    playlist_formats.append(format_info)
            playlist_formats.append(format_info)
            return playlist_formats

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            playlist_formats = _extract_m3u8_playlist_formats(m3u8_doc=m3u8_doc)
            for format in playlist_formats:
                    format_id.append(m3u8_id)
                format_index = format.get('index')
                    format_id.append(str(format_index))
                    'format_id': '-'.join(format_id),
                    'format_index': format_index,
                    'protocol': entry_protocol,
                    'preference': preference,

        last_stream_inf = {}

        def extract_media(x_media_line):
            # Handle one #EXT-X-MEDIA line: register the rendition in its
            # group and, if it carries its own URI, emit format(s) for it.
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
            groups.setdefault(group_id, []).append(media)
            if media_type not in ('VIDEO', 'AUDIO'):
            media_url = media.get('URI')
                manifest_url = format_url(media_url)
                playlist_formats = _extract_m3u8_playlist_formats(manifest_url)
                for format in playlist_formats:
                    format_index = format.get('index')
                    for v in (m3u8_id, group_id, name):
                        format_id.append(str(format_index))
                        'format_id': '-'.join(format_id),
                        'format_index': format_index,
                        'url': manifest_url,
                        'manifest_url': m3u8_url,
                        'language': media.get('LANGUAGE'),
                        'protocol': entry_protocol,
                        'preference': preference,
                    if media_type == 'AUDIO':
                        f['vcodec'] = 'none'

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
        # chance to detect video only formats when EXT-X-STREAM-INF tags
        # precede EXT-X-MEDIA tags in HLS manifest such as [3].
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-MEDIA:'):

        # Second pass: one format per #EXT-X-STREAM-INF variant; the URI is
        # the first non-comment line following the tag.
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#') or not line.strip():
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH')
                    or last_stream_inf.get('BANDWIDTH'), scale=1000)
                manifest_url = format_url(line.strip())
                playlist_formats = _extract_m3u8_playlist_formats(manifest_url)
                for format in playlist_formats:
                        format_id.append(m3u8_id)
                    format_index = format.get('index')
                    stream_name = build_stream_name()
                    # Bandwidth of live streams may differ over time thus making
                    # format_id unpredictable. So it's better to keep provided
                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                        format_id.append(str(format_index))
                        'format_id': '-'.join(format_id),
                        'format_index': format_index,
                        'url': manifest_url,
                        'manifest_url': m3u8_url,
                        'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                        'protocol': entry_protocol,
                        'preference': preference,
                    resolution = last_stream_inf.get('RESOLUTION')
                        mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                            f['width'] = int(mobj.group('width'))
                            f['height'] = int(mobj.group('height'))
                    # Unified Streaming Platform: recover audio/video bitrates
                    # encoded into the URL itself.
                        r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                        abr, vbr = mobj.groups()
                        abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                    codecs = parse_codecs(last_stream_inf.get('CODECS'))
                    audio_group_id = last_stream_inf.get('AUDIO')
                    # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                    # references a rendition group MUST have a CODECS attribute.
                    # However, this is not always respected, for example, [2]
                    # contains EXT-X-STREAM-INF tag which references AUDIO
                    # rendition group but does not have CODECS and despite
                    # referencing an audio group it represents a complete
                    # (with audio and video) format. So, for such cases we will
                    # ignore references to rendition groups and treat them
                    # as complete formats.
                    if audio_group_id and codecs and f.get('vcodec') != 'none':
                        audio_group = groups.get(audio_group_id)
                        if audio_group and audio_group[0].get('URI'):
                            # TODO: update acodec for audio only formats with
                            f['acodec'] = 'none'
                    # Emit an additional plain-HTTP variant when the playlist
                    # advertises a progressive download URI.
                    progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
                        del http_f['manifest_url']
                        'format_id': f['format_id'].replace('hls-', 'http-'),
                        'url': progressive_uri,
                        formats.append(http_f)
                # Reset per-variant state for the next #EXT-X-STREAM-INF.
                last_stream_inf = {}
    def _xpath_ns(path, namespace=None):
        # Qualify each component of an XPath with the given XML namespace,
        # ElementTree-style ('{namespace}tag'); '.' components pass through.
        # No `self` parameter — presumably decorated @staticmethod (decorator
        # not visible here); confirm against the full file.
        for c in path.split('/'):
            if not c or c == '.':
            out.append('{%s}%s' % (namespace, c))
        return '/'.join(out)
    def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
        # Download a SMIL document and parse only its formats
        # (see _extract_smil_info for the full info-dict variant).
        smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
        namespace = self._parse_smil_namespace(smil)
        return self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
    def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
        # Download a SMIL document and parse it into a full info dict
        # (formats plus title/thumbnails/subtitles, via _parse_smil).
        smil = self._download_smil(smil_url, video_id, fatal=fatal)
        return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2156 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2157 return self._download_xml(
2158 smil_url, video_id, 'Downloading SMIL file',
2159 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
    def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
        # Turn a parsed SMIL document into a full info dict: formats,
        # subtitles, plus title/description/upload_date/thumbnails metadata
        # harvested from <head><meta> and <image> elements.
        namespace = self._parse_smil_namespace(smil)
        formats = self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
        subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
        # Fall back to the SMIL file name (sans extension) as the video id.
        video_id = os.path.splitext(url_basename(smil_url))[0]
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            name = meta.attrib.get('name')
            content = meta.attrib.get('content')
            if not name or not content:
            if not title and name == 'title':
            elif not description and name in ('description', 'abstract'):
                description = content
            elif not upload_date and name == 'date':
                upload_date = unified_strdate(content)
            # Thumbnail candidates: every <image> element with a src.
            'id': image.get('type'),
            'url': image.get('src'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
            'title': title or video_id,
            'description': description,
            'upload_date': upload_date,
            'thumbnails': thumbnails,
            'subtitles': subtitles,
2201 def _parse_smil_namespace(self, smil):
2202 return self._search_regex(
2203 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        # Extract format dicts from the <video>/<audio> elements of a parsed
        # SMIL document. Dispatches per source: RTMP, HLS (m3u8), HDS (f4m),
        # DASH (mpd), MSS (ism) and plain HTTP.
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            # Base URL for relative media sources.
            b = meta.get('base') or meta.get('httpBase')
        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
        for medium in media:
            src = medium.get('src')
            # Skip empty and already-seen sources.
            if not src or src in srcs:
            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
            width = int_or_none(medium.get('width'))
            height = int_or_none(medium.get('height'))
            proto = medium.get('proto')
            ext = medium.get('ext')
            src_ext = determine_ext(src)
            streamer = medium.get('streamer') or base
            if proto == 'rtmp' or streamer.startswith('rtmp'):
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'filesize': filesize,
                # Optional caller hook to rewrite streamer/playpath pairs.
                if transform_rtmp_url:
                    streamer, src = transform_rtmp_url(streamer, src)
                    formats[-1].update({
            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
            src_url = src_url.strip()
            if proto == 'm3u8' or src_ext == 'm3u8':
                m3u8_formats = self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
                # Single-entry HLS: inherit quality metadata from this medium.
                if len(m3u8_formats) == 1:
                    m3u8_formats[0].update({
                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                formats.extend(m3u8_formats)
            elif src_ext == 'f4m':
                    'plugin': 'flowplayer-3.2.0.1',
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse_urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
            elif src_ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    src_url, video_id, mpd_id='dash', fatal=False))
            elif re.search(r'\.ism/[Mm]anifest', src_url):
                formats.extend(self._extract_ism_formats(
                    src_url, video_id, ism_id='mss', fatal=False))
            elif src_url.startswith('http') and self._is_valid_url(src, video_id):
                # Plain progressive HTTP download.
                'ext': ext or src_ext or 'flv',
                'format_id': 'http-%d' % (bitrate or http_count),
                'filesize': filesize,
    def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
        # Collect subtitle tracks from <textstream> elements, keyed by
        # language (falling back to subtitles_lang when none is declared).
        for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
            src = textstream.get('src')
            # Skip empty and duplicate sources.
            if not src or src in urls:
            ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
            lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
            subtitles.setdefault(lang, []).append({
    def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
        # Download an XSPF playlist and parse it into entries.
        # NOTE(review): 'xpsf' in the note string below looks like a typo for
        # 'xspf' — user-visible message only; confirm before changing.
        xspf = self._download_xml(
            xspf_url, playlist_id, 'Downloading xpsf playlist',
            'Unable to download xspf manifest', fatal=fatal)
        return self._parse_xspf(
            xspf, playlist_id, xspf_url=xspf_url,
            xspf_base_url=base_url(xspf_url))
    def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
        # Parse an XSPF document into playlist entries; understands the
        # StreamOne ('s1') extension attributes for per-location metadata.
            'xspf': 'http://xspf.org/ns/0/',
            's1': 'http://static.streamone.nl/player/ns/0',
        for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
                track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
            description = xpath_text(
                track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
            thumbnail = xpath_text(
                track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
            # <duration> is in milliseconds; convert to seconds.
            duration = float_or_none(
                xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
            # One format per <location>, resolved against the playlist base.
            for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
                format_url = urljoin(xspf_base_url, location.text)
                    'manifest_url': xspf_url,
                    'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
                    'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
                    'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
            self._sort_formats(formats)
                'description': description,
                'thumbnail': thumbnail,
                'duration': duration,
    def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
        # Download a DASH MPD manifest and parse it into format dicts.
        # Thin wrapper: the actual parsing happens in _parse_mpd_formats.
        res = self._download_xml_handle(
            note=note or 'Downloading MPD manifest',
            errnote=errnote or 'Failed to download MPD manifest',
            fatal=fatal, data=data, headers=headers, query=query)
        # Base URL is derived from the final (post-redirect) manifest URL.
        mpd_base_url = base_url(urlh.geturl())
        return self._parse_mpd_formats(
            mpd_doc, mpd_id, mpd_base_url, mpd_url)
    def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
        """
        Parse formats from MPD manifest.
        References:
         1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
            http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
         2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
        """
        # Live (dynamic) manifests are skipped unless explicitly enabled.
        if not self._downloader.params.get('dynamic_mpd'):
            if mpd_doc.get('type') == 'dynamic':

        namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)

            return self._xpath_ns(path, namespace)

        def is_drm_protected(element):
            return element.find(_add_ns('ContentProtection')) is not None

        def extract_multisegment_info(element, ms_parent_info):
            # Merge segment info from this element over its parent's
            # (Period -> AdaptationSet -> Representation inheritance).
            ms_info = ms_parent_info.copy()

            # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
            # common attributes and elements. We will only extract relevant
            def extract_common(source):
                segment_timeline = source.find(_add_ns('SegmentTimeline'))
                if segment_timeline is not None:
                    s_e = segment_timeline.findall(_add_ns('S'))
                        ms_info['total_number'] = 0
                            # @r is a repeat count; each S yields 1 + r segments.
                            r = int(s.get('r', 0))
                            ms_info['total_number'] += 1 + r
                            ms_info['s'].append({
                                't': int(s.get('t', 0)),
                                # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
                                'd': int(s.attrib['d']),
                start_number = source.get('startNumber')
                    ms_info['start_number'] = int(start_number)
                timescale = source.get('timescale')
                    ms_info['timescale'] = int(timescale)
                segment_duration = source.get('duration')
                if segment_duration:
                    ms_info['segment_duration'] = float(segment_duration)

            def extract_Initialization(source):
                initialization = source.find(_add_ns('Initialization'))
                if initialization is not None:
                    ms_info['initialization_url'] = initialization.attrib['sourceURL']

            segment_list = element.find(_add_ns('SegmentList'))
            if segment_list is not None:
                extract_common(segment_list)
                extract_Initialization(segment_list)
                segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
                    ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
            segment_template = element.find(_add_ns('SegmentTemplate'))
            if segment_template is not None:
                extract_common(segment_template)
                media = segment_template.get('media')
                    ms_info['media'] = media
                initialization = segment_template.get('initialization')
                    ms_info['initialization'] = initialization
                    extract_Initialization(segment_template)

        skip_unplayable = not self._downloader.params.get('allow_unplayable_formats')

        mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
        for period in mpd_doc.findall(_add_ns('Period')):
            period_duration = parse_duration(period.get('duration')) or mpd_duration
            period_ms_info = extract_multisegment_info(period, {
            for adaptation_set in period.findall(_add_ns('AdaptationSet')):
                if skip_unplayable and is_drm_protected(adaptation_set):
                adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
                for representation in adaptation_set.findall(_add_ns('Representation')):
                    if skip_unplayable and is_drm_protected(representation):
                    # Representation attributes override AdaptationSet ones.
                    representation_attrib = adaptation_set.attrib.copy()
                    representation_attrib.update(representation.attrib)
                    # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
                    mime_type = representation_attrib['mimeType']
                    content_type = mime_type.split('/')[0]
                    if content_type == 'text':
                        # TODO implement WebVTT downloading
                    elif content_type in ('video', 'audio'):
                        # Compose the effective base URL from nested <BaseURL>
                        # elements, innermost first, stopping at an absolute one.
                        for element in (representation, adaptation_set, period, mpd_doc):
                            base_url_e = element.find(_add_ns('BaseURL'))
                            if base_url_e is not None:
                                base_url = base_url_e.text + base_url
                                if re.match(r'^https?://', base_url):
                        if mpd_base_url and not re.match(r'^https?://', base_url):
                            if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
                            base_url = mpd_base_url + base_url
                        representation_id = representation_attrib.get('id')
                        lang = representation_attrib.get('lang')
                        url_el = representation.find(_add_ns('BaseURL'))
                        # YouTube-specific attribute carrying the file size.
                        filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
                        bandwidth = int_or_none(representation_attrib.get('bandwidth'))
                            'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
                            'manifest_url': mpd_url,
                            'ext': mimetype2ext(mime_type),
                            'width': int_or_none(representation_attrib.get('width')),
                            'height': int_or_none(representation_attrib.get('height')),
                            'tbr': float_or_none(bandwidth, 1000),
                            'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                            'fps': int_or_none(representation_attrib.get('frameRate')),
                            'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
                            'format_note': 'DASH %s' % content_type,
                            'filesize': filesize,
                            'container': mimetype2ext(mime_type) + '_dash',
                        f.update(parse_codecs(representation_attrib.get('codecs')))
                        representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)

                        def prepare_template(template_name, identifiers):
                            # Convert a DASH $...$ URL template into a Python
                            # %-format string usable with the % operator.
                            tmpl = representation_ms_info[template_name]
                            # First of, % characters outside $...$ templates
                            # must be escaped by doubling for proper processing
                            # by % operator string formatting used further (see
                            # https://github.com/ytdl-org/youtube-dl/issues/16867).
                                    in_template = not in_template
                                elif c == '%' and not in_template:
                            # Next, $...$ templates are translated to their
                            # %(...) counterparts to be used with % operator
                            t = t.replace('$RepresentationID$', representation_id)
                            t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
                            t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
                            # NOTE(review): str.replace returns a new string;
                            # this result is discarded, so '$$' is never
                            # unescaped — likely should be t = t.replace(...).
                            t.replace('$$', '$')

                        # @initialization is a regular template like @media one
                        # so it should be handled just the same way (see
                        # https://github.com/ytdl-org/youtube-dl/issues/11605)
                        if 'initialization' in representation_ms_info:
                            initialization_template = prepare_template(
                            # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
                            # $Time$ shall not be included for @initialization thus
                            # only $Bandwidth$ remains
                            representation_ms_info['initialization_url'] = initialization_template % {
                                'Bandwidth': bandwidth,

                        def location_key(location):
                            return 'url' if re.match(r'^https?://', location) else 'path'

                        if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:

                            media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
                            media_location_key = location_key(media_template)

                            # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
                            # can't be used at the same time
                            if '%(Number' in media_template and 's' not in representation_ms_info:
                                segment_duration = None
                                if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
                                    segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
                                    representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
                                representation_ms_info['fragments'] = [{
                                    media_location_key: media_template % {
                                        'Number': segment_number,
                                        'Bandwidth': bandwidth,
                                    'duration': segment_duration,
                                } for segment_number in range(
                                    representation_ms_info['start_number'],
                                    representation_ms_info['total_number'] + representation_ms_info['start_number'])]
                                # $Number*$ or $Time$ in media template with S list available
                                # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
                                # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
                                representation_ms_info['fragments'] = []
                                segment_number = representation_ms_info['start_number']

                                def add_segment_url():
                                    segment_url = media_template % {
                                        'Time': segment_time,
                                        'Bandwidth': bandwidth,
                                        'Number': segment_number,
                                    representation_ms_info['fragments'].append({
                                        media_location_key: segment_url,
                                        'duration': float_or_none(segment_d, representation_ms_info['timescale']),

                                for num, s in enumerate(representation_ms_info['s']):
                                    segment_time = s.get('t') or segment_time
                                    # Expand @r repeats into individual segments.
                                    for r in range(s.get('r', 0)):
                                        segment_time += segment_d
                                        segment_time += segment_d
                        elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
                            # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
                            # or any YouTube dashsegments video
                            timescale = representation_ms_info['timescale']
                            for s in representation_ms_info['s']:
                                duration = float_or_none(s['d'], timescale)
                                for r in range(s.get('r', 0) + 1):
                                    segment_uri = representation_ms_info['segment_urls'][segment_index]
                                        location_key(segment_uri): segment_uri,
                                        'duration': duration,
                            representation_ms_info['fragments'] = fragments
                        elif 'segment_urls' in representation_ms_info:
                            # Segment URLs with no SegmentTimeline
                            # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
                            # https://github.com/ytdl-org/youtube-dl/pull/14844
                            segment_duration = float_or_none(
                                representation_ms_info['segment_duration'],
                                representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
                            for segment_url in representation_ms_info['segment_urls']:
                                    location_key(segment_url): segment_url,
                                if segment_duration:
                                    fragment['duration'] = segment_duration
                                fragments.append(fragment)
                            representation_ms_info['fragments'] = fragments
                        # If there is a fragments key available then we correctly recognized fragmented media.
                        # Otherwise we will assume unfragmented media with direct access. Technically, such
                        # assumption is not necessarily correct since we may simply have no support for
                        # some forms of fragmented media renditions yet, but for now we'll use this fallback.
                        if 'fragments' in representation_ms_info:
                                # NB: mpd_url may be empty when MPD manifest is parsed from a string
                                'url': mpd_url or base_url,
                                'fragment_base_url': base_url,
                                'protocol': 'http_dash_segments',
                            if 'initialization_url' in representation_ms_info:
                                initialization_url = representation_ms_info['initialization_url']
                                if not f.get('url'):
                                    f['url'] = initialization_url
                                f['fragments'].append({location_key(initialization_url): initialization_url})
                            f['fragments'].extend(representation_ms_info['fragments'])
                            # Assuming direct URL to unfragmented media.
                        self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
    def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
        # Download a Smooth Streaming (ISM) manifest and parse it into
        # format dicts. Thin wrapper around _parse_ism_formats.
        res = self._download_xml_handle(
            note=note or 'Downloading ISM manifest',
            errnote=errnote or 'Failed to download ISM manifest',
            fatal=fatal, data=data, headers=headers, query=query)
        # Use the final (post-redirect) URL as the base for track URLs.
        return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
2683 def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
        Parse formats from ISM manifest.
        References:
         1. [MS-SSTR]: Smooth Streaming Protocol,
            https://msdn.microsoft.com/en-us/library/ff469518.aspx
2690 if ism_doc.get('IsLive') == 'TRUE':
2692 if (not self._downloader.params.get('allow_unplayable_formats')
2693 and ism_doc.find('Protection') is not None):
2696 duration = int(ism_doc.attrib['Duration'])
2697 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2700 for stream in ism_doc.findall('StreamIndex'):
2701 stream_type = stream.get('Type')
2702 if stream_type not in ('video', 'audio'):
2704 url_pattern = stream.attrib['Url']
2705 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2706 stream_name = stream.get('Name')
2707 for track in stream.findall('QualityLevel'):
2708 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
2709 # TODO: add support for WVC1 and WMAP
2710 if fourcc not in ('H264', 'AVC1', 'AACL'):
2711 self.report_warning('%s is not a supported codec' % fourcc)
2713 tbr = int(track.attrib['Bitrate']) // 1000
2714 # [1] does not mention Width and Height attributes. However,
2715 # they're often present while MaxWidth and MaxHeight are
2716 # missing, so should be used as fallbacks
2717 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2718 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2719 sampling_rate = int_or_none(track.get('SamplingRate'))
2721 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2722 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2728 stream_fragments = stream.findall('c')
2729 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2730 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2731 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2732 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2733 if not fragment_ctx['duration']:
2735 next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2737 next_fragment_time = duration
2738 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2739 for _ in range(fragment_repeat):
2741 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2742 'duration': fragment_ctx['duration'] / stream_timescale,
2744 fragment_ctx['time'] += fragment_ctx['duration']
2748 format_id.append(ism_id)
2750 format_id.append(stream_name)
2751 format_id.append(compat_str(tbr))
2754 'format_id': '-'.join(format_id),
2756 'manifest_url': ism_url,
2757 'ext': 'ismv' if stream_type == 'video' else 'isma',
2761 'asr': sampling_rate,
2762 'vcodec': 'none' if stream_type == 'audio' else fourcc,
2763 'acodec': 'none' if stream_type == 'video' else fourcc,
2765 'fragments': fragments,
2766 '_download_params': {
2767 'duration': duration,
2768 'timescale': stream_timescale,
2769 'width': width or 0,
2770 'height': height or 0,
2772 'codec_private_data': track.get('CodecPrivateData'),
2773 'sampling_rate': sampling_rate,
2774 'channels': int_or_none(track.get('Channels', 2)),
2775 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2776 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
        """Extract media entries from HTML5 <video>/<audio> (and amp-/dl8- variants) tags in webpage."""
        # NOTE(review): interior lines appear to be missing from this copy
        # (entries/media_info initialisation, several guards and dict-literal
        # heads) — verify against upstream before editing further.
        def absolute_url(item_url):
            # resolve item_url against the page URL
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            # split a MIME type string into ext + codec fields
            if not content_type:
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            mimetype, codecs = ctr.groups()
            f = parse_codecs(codecs)
            f['ext'] = mimetype2ext(mimetype)

        def _media_formats(src, cur_media_type, type_info={}):
            # return (is_plain_url, formats) for a single src attribute
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
            is_plain_url = False
            formats = self._extract_m3u8_formats(
                full_url, video_id, ext='mp4',
                entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                preference=preference, quality=quality, fatal=False)
            is_plain_url = False
            formats = self._extract_mpd_formats(
                full_url, video_id, mpd_id=mpd_id, fatal=False)
            'vcodec': 'none' if cur_media_type == 'audio' else None,
            return is_plain_url, formats

        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we wll include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
        media_tags = [(media_tag, media_tag_name, media_type, '')
                      for media_tag, media_tag_name, media_type
                      in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
        for media_tag, _, media_type, media_content in media_tags:
            media_attributes = extract_attributes(media_tag)
            src = strip_or_none(media_attributes.get('src'))
            _, formats = _media_formats(src, media_type)
            media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            for source_tag in re.findall(r'<source[^>]+>', media_content):
                s_attr = extract_attributes(source_tag)
                # data-video-src and data-src are non standard but seen
                # several times in the wild
                src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
                f = parse_content_type(s_attr.get('type'))
                is_plain_url, formats = _media_formats(src, media_type, f)
                # width, height, res, label and title attributes are
                # all not standard but seen several times in the wild
                for lbl in ('label', 'title')
                if str_or_none(s_attr.get(lbl))
                width = int_or_none(s_attr.get('width'))
                height = (int_or_none(s_attr.get('height'))
                          or int_or_none(s_attr.get('res')))
                if not width or not height:
                    # fall back to parsing the label, e.g. "1280x720"
                    resolution = parse_resolution(lbl)
                    width = width or resolution.get('width')
                    height = height or resolution.get('height')
                tbr = parse_bitrate(lbl)
                'format_id': s_attr.get('label') or s_attr.get('title'),
                f.update(formats[0])
                media_info['formats'].append(f)
                media_info['formats'].extend(formats)
            for track_tag in re.findall(r'<track[^>]+>', media_content):
                track_attributes = extract_attributes(track_tag)
                kind = track_attributes.get('kind')
                if not kind or kind in ('subtitles', 'captions'):
                    src = strip_or_none(track_attributes.get('src'))
                    lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                    media_info['subtitles'].setdefault(lang, []).append({
                        'url': absolute_url(src),
            for f in media_info['formats']:
                # some servers require a Referer for the media URLs
                f.setdefault('http_headers', {})['Referer'] = base_url
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
    def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
        """Extract HDS/HLS/progressive formats from an Akamai manifest URL."""
        # NOTE(review): interior lines appear to be missing from this copy
        # (the formats accumulator, host guards, http_f construction) —
        # verify against upstream before editing further.
        signed = 'hdnea=' in manifest_url
        # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
        manifest_url = re.sub(
            r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
            '', manifest_url).strip('?')
        hdcore_sign = 'hdcore=3.7.0'
        # convert the HLS URL shape (/i/ ... master.m3u8) to the HDS one
        f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
        hds_host = hosts.get('hds')
        f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
        if 'hdcore=' not in f4m_url:
            f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
        f4m_formats = self._extract_f4m_formats(
            f4m_url, video_id, f4m_id='hds', fatal=False)
        for entry in f4m_formats:
            entry.update({'extra_param_to_segment_url': hdcore_sign})
        formats.extend(f4m_formats)
        # and the reverse conversion for HLS
        m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
        hls_host = hosts.get('hls')
        m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
        m3u8_formats = self._extract_m3u8_formats(
            m3u8_url, video_id, 'mp4', 'm3u8_native',
            m3u8_id='hls', fatal=False)
        formats.extend(m3u8_formats)
        http_host = hosts.get('http')
        if http_host and m3u8_formats and not signed:
            REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
            qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
            qualities_length = len(qualities)
            if len(m3u8_formats) in (qualities_length, qualities_length + 1):
                for f in m3u8_formats:
                    if f['vcodec'] != 'none':
                        for protocol in ('http', 'https'):
                            del http_f['manifest_url']
                            REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
                            'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
                            'protocol': protocol,
                            formats.append(http_f)
    def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
        """Probe a Wowza server URL for HLS/HDS/DASH/SMIL/RTMP/RTSP formats."""
        # NOTE(review): interior lines appear to be missing from this copy
        # (the mobj = re.search head, the formats accumulator, guard bodies
        # and format dict heads) — verify against upstream before editing.
        query = compat_urlparse.urlparse(url).query
        # strip any trailing manifest file name from the URL
        url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
            r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
        url_base = mobj.group('url')
        http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)

        def manifest_url(manifest):
            # build a manifest URL preserving the original query string
            m_url = '%s/%s' % (http_base_url, manifest)
                m_url += '?%s' % query
        if 'm3u8' not in skip_protocols:
            formats.extend(self._extract_m3u8_formats(
                manifest_url('playlist.m3u8'), video_id, 'mp4',
                m3u8_entry_protocol, m3u8_id='hls', fatal=False))
        if 'f4m' not in skip_protocols:
            formats.extend(self._extract_f4m_formats(
                manifest_url('manifest.f4m'),
                video_id, f4m_id='hds', fatal=False))
        if 'dash' not in skip_protocols:
            formats.extend(self._extract_mpd_formats(
                manifest_url('manifest.mpd'),
                video_id, mpd_id='dash', fatal=False))
        if re.search(r'(?:/smil:|\.smil)', url_base):
            if 'smil' not in skip_protocols:
                rtmp_formats = self._extract_smil_formats(
                    manifest_url('jwplayer.smil'),
                    video_id, fatal=False)
                for rtmp_format in rtmp_formats:
                    # derive an RTSP variant from each RTMP format
                    rtsp_format = rtmp_format.copy()
                    rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
                    del rtsp_format['play_path']
                    del rtsp_format['ext']
                    rtsp_format.update({
                        'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
                        'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
                    formats.extend([rtmp_format, rtsp_format])
            for protocol in ('rtmp', 'rtsp'):
                if protocol not in skip_protocols:
                    'url': '%s:%s' % (protocol, url_base),
                    'format_id': protocol,
                    'protocol': protocol,
    def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
        """Search webpage for a jwplayer(...).setup(...) call and return the parsed options dict, if any."""
        # NOTE(review): the `mobj = re.search(` head, `if mobj:`/`try:` lines
        # and failure branches appear to be missing from this copy.
            r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
            jwplayer_data = self._parse_json(mobj.group('options'),
                                             transform_source=transform_source)
        except ExtractorError:
        # only a dict is a usable JWPlayer config
        if isinstance(jwplayer_data, dict):
            return jwplayer_data
3028 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3029 jwplayer_data = self._find_jwplayer_data(
3030 webpage, video_id, transform_source=js_to_json)
3031 return self._parse_jwplayer_data(
3032 jwplayer_data, video_id, *args, **kwargs)
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Turn a JWPlayer config dict into info-dict entries (single entry or playlist result)."""
        # NOTE(review): interior lines appear to be missing from this copy
        # (entries/subtitles initialisation, `continue` lines in the tracks
        # loop, the entry dict head) — verify against upstream before editing.
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}
        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]
        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]
            this_video_id = video_id or video_data['mediaid']
            formats = self._parse_jwplayer_formats(
                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    if not isinstance(track, dict):
                    track_kind = track.get('kind')
                    if not track_kind or not isinstance(track_kind, compat_str):
                    if track_kind.lower() not in ('captions', 'subtitles'):
                    track_url = urljoin(base_url, track.get('file'))
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)
                'id': this_video_id,
                'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
                'description': clean_html(video_data.get('description')),
                'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
            if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
                # a single YouTube URL is delegated to the YouTube extractor
                '_type': 'url_transparent',
                'url': formats[0]['url'],
            self._sort_formats(formats)
            entry['formats'] = formats
            entries.append(entry)
        if len(entries) == 1:
        return self.playlist_result(entries)
    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Build a formats list from a JWPlayer `sources` array."""
        # NOTE(review): interior lines appear to be missing from this copy
        # (urls/formats initialisation, `continue` lines, the a_format dict
        # head, the smil branch condition) — verify against upstream.
        for source in jwplayer_sources_data:
            if not isinstance(source, dict):
            source_url = urljoin(
                base_url, self._proto_relative_url(source.get('file')))
            # de-duplicate by resolved URL
            if not source_url or source_url in urls:
            urls.append(source_url)
            source_type = source.get('type') or ''
            ext = mimetype2ext(source_type) or determine_ext(source_url)
            if source_type == 'hls' or ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id=m3u8_id, fatal=False))
            elif source_type == 'dash' or ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    source_url, video_id, mpd_id=mpd_id, fatal=False))
                formats.extend(self._extract_smil_formats(
                    source_url, video_id, fatal=False))
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
            elif source_type.startswith('audio') or ext in (
                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                height = int_or_none(source.get('height'))
                    # Often no height is provided but there is a label in
                    # format like "1080p", "720p SD", or 1080.
                    height = int_or_none(self._search_regex(
                        r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
                        'height', default=None))
                'width': int_or_none(source.get('width')),
                'tbr': int_or_none(source.get('bitrate')),
                if source_url.startswith('rtmp'):
                    a_format['ext'] = 'flv'
                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                    # of jwplayer.flash.swf
                    rtmp_url_parts = re.split(
                        r'((?:mp4|mp3|flv):)', source_url, 1)
                    if len(rtmp_url_parts) == 3:
                        rtmp_url, prefix, play_path = rtmp_url_parts
                        'play_path': prefix + play_path,
                    a_format.update(rtmp_params)
                formats.append(a_format)
3166 def _live_title(self, name):
3167 """ Generate the title
for a live video
"""
3168 now = datetime.datetime.now()
3169 now_str = now.strftime('%Y-%m-%d %H:%M')
3170 return name + ' ' + now_str
    def _int(self, v, name, fatal=False, **kwargs):
        """Coerce v to int; on failure either raise (fatal=True) or warn, naming the field."""
        res = int_or_none(v, **kwargs)
        if 'get_attr' in kwargs:
            # debug aid: dump the requested attribute of the raw value
            print(getattr(v, kwargs['get_attr']))
        # NOTE(review): the guard lines (`if res is None:` / `if fatal:` /
        # `else:`) and the final `return res` appear to be missing from this
        # copy — verify against upstream before editing.
            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
            raise ExtractorError(msg)
            self._downloader.report_warning(msg)
    def _float(self, v, name, fatal=False, **kwargs):
        """Coerce v to float; on failure either raise (fatal=True) or warn, naming the field."""
        res = float_or_none(v, **kwargs)
        # NOTE(review): the guard lines (`if res is None:` / `if fatal:` /
        # `else:`) and the final `return res` appear to be missing from this
        # copy — verify against upstream before editing.
            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
            raise ExtractorError(msg)
            self._downloader.report_warning(msg)
3194 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3195 path='/', secure=False, discard=False, rest={}, **kwargs):
3196 cookie = compat_cookiejar_Cookie(
3197 0, name, value, port, port is not None, domain, True,
3198 domain.startswith('.'), path, True, secure, expire_time,
3199 discard, None, None, rest)
3200 self._downloader.cookiejar.set_cookie(cookie)
3202 def _get_cookies(self, url):
3203 """ Return a compat_cookies
.SimpleCookie
with the cookies
for the url
"""
3204 req = sanitized_Request(url)
3205 self._downloader.cookiejar.add_cookie_header(req)
3206 return compat_cookies.SimpleCookie(req.get_header('Cookie'))
    def _apply_first_set_cookie_header(self, url_handle, cookie):
        """
        Apply first Set-Cookie header instead of the last. Experimental.

        Some sites (e.g. [1-3]) may serve two cookies under the same name
        in Set-Cookie header and expect the first (old) one to be set rather
        than second (new). However, as of RFC6265 the newer one cookie
        should be set into cookie store what actually happens.
        We will workaround this issue by resetting the cookie to
        the first one manually.
        1. https://new.vk.com/
        2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
        3. https://learning.oreilly.com/
        """
        # NOTE(review): a `continue` line, the `if cookie_value:` guard and a
        # `break` appear to be missing from this copy — verify upstream.
        for header, cookies in url_handle.headers.items():
            if header.lower() != 'set-cookie':
            # normalize header bytes to a text string on Python 3
            if sys.version_info[0] >= 3:
                cookies = cookies.encode('iso-8859-1')
            cookies = cookies.decode('utf-8')
            cookie_value = re.search(
                r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
                value, domain = cookie_value.groups()
                # re-set the first occurrence, overriding the later one
                self._set_cookie(domain, cookie, value)
    def get_testcases(self, include_onlymatching=False):
        """Yield this extractor's test cases from _TEST/_TESTS, tagging each with the IE name."""
        t = getattr(self, '_TEST', None)
        # an extractor must define either _TEST or _TESTS, never both
            assert not hasattr(self, '_TESTS'), \
                '%s has _TEST and _TESTS' % type(self).__name__
            tests = getattr(self, '_TESTS', [])
        # NOTE(review): branch heads, the tests loop and `continue`/`yield`
        # lines appear to be missing from this copy — verify upstream.
            if not include_onlymatching and t.get('only_matching', False):
            t['name'] = type(self).__name__[:-len('IE')]
    def is_suitable(self, age_limit):
        """ Test whether the extractor is generally suitable for the given
        age limit (i.e. pornographic sites are not, all others usually are) """
        any_restricted = False
        for tc in self.get_testcases(include_onlymatching=False):
            if tc.get('playlist', []):
                # judge by the first playlist entry's metadata
                tc = tc['playlist'][0]
            is_restricted = age_restricted(
                tc.get('info_dict', {}).get('age_limit'), age_limit)
            if not is_restricted:
                # NOTE(review): the body of this guard (an early `return False`
                # upstream) appears to be missing from this copy.
            any_restricted = any_restricted or is_restricted
        return not any_restricted
    def extract_subtitles(self, *args, **kwargs):
        """Return subtitles when the user asked to write or list them; delegates to _get_subtitles."""
        if (self._downloader.params.get('writesubtitles', False)
                or self._downloader.params.get('listsubtitles')):
            return self._get_subtitles(*args, **kwargs)
        # NOTE(review): upstream returns {} on the fall-through path; that
        # line appears to be missing from this copy.
3270 def _get_subtitles(self, *args, **kwargs):
3271 raise NotImplementedError('This method must be implemented by subclasses')
3274 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3275 """ Merge subtitle items
for one language
. Items
with duplicated URLs
3276 will be dropped
. """
3277 list1_urls = set([item['url'] for item in subtitle_list1])
3278 ret = list(subtitle_list1)
3279 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
3283 def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
3284 """ Merge two subtitle dictionaries
, language by language
. """
3285 ret = dict(subtitle_dict1)
3286 for lang in subtitle_dict2:
3287 ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
    def extract_automatic_captions(self, *args, **kwargs):
        """Return automatic captions when the user asked to write or list subtitles."""
        if (self._downloader.params.get('writeautomaticsub', False)
                or self._downloader.params.get('listsubtitles')):
            return self._get_automatic_captions(*args, **kwargs)
        # NOTE(review): upstream returns {} on the fall-through path; that
        # line appears to be missing from this copy.
3296 def _get_automatic_captions(self, *args, **kwargs):
3297 raise NotImplementedError('This method must be implemented by subclasses')
3299 def mark_watched(self, *args, **kwargs):
3300 if (self._downloader.params.get('mark_watched', False)
3301 and (self._get_login_info()[0] is not None
3302 or self._downloader.params.get('cookiefile') is not None)):
3303 self._mark_watched(*args, **kwargs)
3305 def _mark_watched(self, *args, **kwargs):
3306 raise NotImplementedError('This method must be implemented by subclasses')
    def geo_verification_headers(self):
        """Build HTTP headers carrying the configured geo-verification proxy, if any."""
        # NOTE(review): the `headers = {}` initialisation and the final
        # `return headers` appear to be missing from this copy.
        geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
        if geo_verification_proxy:
            headers['Ytdl-request-proxy'] = geo_verification_proxy
3315 def _generic_id(self, url):
3316 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3318 def _generic_title(self, url):
3319 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """
    # NOTE(review): decorators (@classmethod on _make_valid_url/suitable,
    # @property on SEARCH_KEY) and several guard/branch lines appear to be
    # missing from this copy — verify against upstream before editing.

    def _make_valid_url(cls):
        # regex accepting '<key>:q', '<key>N:q' and '<key>all:q'
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        # NOTE(review): the `if mobj is None:` guard line appears missing here.
            raise ExtractorError('Invalid search query "%s"' % query)
        prefix = mobj.group('prefix')
        query = mobj.group('query')
        # empty prefix: single result; 'all': extractor maximum; number: n results
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        elif n > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError('This method must be implemented by subclasses')

    def SEARCH_KEY(self):
        return self._SEARCH_KEY