yt_dlp/extractor/common.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import base64
   5 import datetime
   6 import hashlib
   7 import json
   8 import netrc
   9 import os
  10 import random
  11 import re
  12 import socket
  13 import ssl
  14 import sys
  15 import time
  16 import math
  17
  18 from ..compat import (
  19     compat_cookiejar_Cookie,
  20     compat_cookies_SimpleCookie,
  21     compat_etree_Element,
  22     compat_etree_fromstring,
  23     compat_getpass,
  24     compat_integer_types,
  25     compat_http_client,
  26     compat_os_name,
  27     compat_str,
  28     compat_urllib_error,
  29     compat_urllib_parse_unquote,
  30     compat_urllib_parse_urlencode,
  31     compat_urllib_request,
  32     compat_urlparse,
  33     compat_xml_parse_error,
  34 )
  35 from ..downloader import FileDownloader
  36 from ..downloader.f4m import (
  37     get_base_url,
  38     remove_encrypted_media,
  39 )
  40 from ..utils import (
  41     NO_DEFAULT,
  42     age_restricted,
  43     base_url,
  44     bug_reports_message,
  45     clean_html,
  46     compiled_regex_type,
  47     determine_ext,
  48     determine_protocol,
  49     dict_get,
  50     error_to_compat_str,
  51     ExtractorError,
  52     extract_attributes,
  53     fix_xml_ampersands,
  54     float_or_none,
  55     GeoRestrictedError,
  56     GeoUtils,
  57     int_or_none,
  58     js_to_json,
  59     JSON_LD_RE,
  60     mimetype2ext,
  61     orderedSet,
  62     parse_bitrate,
  63     parse_codecs,
  64     parse_duration,
  65     parse_iso8601,
  66     parse_m3u8_attributes,
  67     parse_resolution,
  68     RegexNotFoundError,
  69     sanitized_Request,
  70     sanitize_filename,
  71     str_or_none,
  72     str_to_int,
  73     strip_or_none,
  74     unescapeHTML,
  75     unified_strdate,
  76     unified_timestamp,
  77     update_Request,
  78     update_url_query,
  79     urljoin,
  80     url_basename,
  81     url_or_none,
  82     xpath_element,
  83     xpath_text,
  84     xpath_with_ns,
  85 )
  86
  87
  88 class InfoExtractor(object):
  89     """Information Extractor class.
  90
  91     Information extractors are the classes that, given a URL, extract
  92     information about the video (or videos) the URL refers to. This
  93     information includes the real video URL, the video title, author and
  94     others. The information is stored in a dictionary which is then
  95     passed to the YoutubeDL. The YoutubeDL processes this
  96     information possibly downloading the video to the file system, among
  97     other possible outcomes.
  98
  99     The type field determines the type of the result.
 100     By far the most common value (and the default if _type is missing) is
 101     "video", which indicates a single video.
 102
 103     For a video, the dictionaries must include the following fields:
 104
 105     id:             Video identifier.
 106     title:          Video title, unescaped.
 107
 108     Additionally, it must contain either a formats entry or a url one:
 109
 110     formats:        A list of dictionaries for each format available, ordered
 111                     from worst to best quality.
 112
 113                     Potential fields:
 114                     * url        The mandatory URL representing the media:
 115                                    for plain file media - HTTP URL of this file,
 116                                    for RTMP - RTMP URL,
 117                                    for HLS - URL of the M3U8 media playlist,
 118                                    for HDS - URL of the F4M manifest,
 119                                    for DASH
 120                                      - HTTP URL to plain file media (in case of
 121                                        unfragmented media)
 122                                      - URL of the MPD manifest or base URL
 123                                        representing the media if MPD manifest
 124                                        is parsed from a string (in case of
 125                                        fragmented media)
 126                                    for MSS - URL of the ISM manifest.
 127                     * manifest_url
 128                                  The URL of the manifest file in case of
 129                                  fragmented media:
 130                                    for HLS - URL of the M3U8 master playlist,
 131                                    for HDS - URL of the F4M manifest,
 132                                    for DASH - URL of the MPD manifest,
 133                                    for MSS - URL of the ISM manifest.
 134                     * ext        Will be calculated from URL if missing
 135                     * format     A human-readable description of the format
 136                                  ("mp4 container with h264/opus").
 137                                  Calculated from the format_id, width, height.
 138                                  and format_note fields if missing.
 139                     * format_id  A short description of the format
 140                                  ("mp4_h264_opus" or "19").
 141                                 Technically optional, but strongly recommended.
 142                     * format_note Additional info about the format
 143                                  ("3D" or "DASH video")
 144                     * width      Width of the video, if known
 145                     * height     Height of the video, if known
 146                     * resolution Textual description of width and height
 147                     * tbr        Average bitrate of audio and video in KBit/s
 148                     * abr        Average audio bitrate in KBit/s
 149                     * acodec     Name of the audio codec in use
 150                     * asr        Audio sampling rate in Hertz
 151                     * vbr        Average video bitrate in KBit/s
 152                     * fps        Frame rate
 153                     * vcodec     Name of the video codec in use
 154                     * container  Name of the container format
 155                     * filesize   The number of bytes, if known in advance
 156                     * filesize_approx  An estimate for the number of bytes
 157                     * player_url SWF Player URL (used for rtmpdump).
 158                     * protocol   The protocol that will be used for the actual
 159                                  download, lower-case.
 160                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 161                                  "m3u8", "m3u8_native" or "http_dash_segments".
 162                     * fragment_base_url
 163                                  Base URL for fragments. Each fragment's path
 164                                  value (if present) will be relative to
 165                                  this URL.
 166                     * fragments  A list of fragments of a fragmented media.
 167                                  Each fragment entry must contain either an url
 168                                  or a path. If an url is present it should be
 169                                  considered by a client. Otherwise both path and
 170                                  fragment_base_url must be present. Here is
 171                                  the list of all potential fields:
 172                                  * "url" - fragment's URL
 173                                  * "path" - fragment's path relative to
 174                                             fragment_base_url
 175                                  * "duration" (optional, int or float)
 176                                  * "filesize" (optional, int)
 177                     * preference Order number of this format. If this field is
 178                                  present and not None, the formats get sorted
 179                                  by this field, regardless of all other values.
 180                                  -1 for default (order by other properties),
 181                                  -2 or smaller for less than default.
 182                                  < -1000 to hide the format (if there is
 183                                     another one which is strictly better)
 184                     * language   Language code, e.g. "de" or "en-US".
 185                     * language_preference  Is this in the language mentioned in
 186                                  the URL?
 187                                  10 if it's what the URL is about,
 188                                  -1 for default (don't know),
 189                                  -10 otherwise, other values reserved for now.
 190                     * quality    Order number of the video quality of this
 191                                  format, irrespective of the file format.
 192                                  -1 for default (order by other properties),
 193                                  -2 or smaller for less than default.
 194                     * source_preference  Order number for this video source
 195                                   (quality takes higher priority)
 196                                  -1 for default (order by other properties),
 197                                  -2 or smaller for less than default.
 198                     * http_headers  A dictionary of additional HTTP headers
 199                                  to add to the request.
 200                     * stretched_ratio  If given and not 1, indicates that the
 201                                  video's pixels are not square.
 202                                  width : height ratio as float.
 203                     * no_resume  The server does not support resuming the
 204                                  (HTTP or RTMP) download. Boolean.
 205                     * downloader_options  A dictionary of downloader options as
 206                                  described in FileDownloader
 207
 208     url:            Final video URL.
 209     ext:            Video filename extension.
 210     format:         The video format, defaults to ext (used for --get-format)
 211     player_url:     SWF Player URL (used for rtmpdump).
 212
 213     The following fields are optional:
 214
 215     alt_title:      A secondary title of the video.
 216     display_id      An alternative identifier for the video, not necessarily
 217                     unique, but available before title. Typically, id is
 218                     something like "4234987", title "Dancing naked mole rats",
 219                     and display_id "dancing-naked-mole-rats"
 220     thumbnails:     A list of dictionaries, with the following entries:
 221                         * "id" (optional, string) - Thumbnail format ID
 222                         * "url"
 223                         * "preference" (optional, int) - quality of the image
 224                         * "width" (optional, int)
 225                         * "height" (optional, int)
 226                         * "resolution" (optional, string "{width}x{height}",
 227                                         deprecated)
 228                         * "filesize" (optional, int)
 229     thumbnail:      Full URL to a video thumbnail image.
 230     description:    Full video description.
 231     uploader:       Full name of the video uploader.
 232     license:        License name the video is licensed under.
 233     creator:        The creator of the video.
 234     release_timestamp: UNIX timestamp of the moment the video was released.
 235     release_date:   The date (YYYYMMDD) when the video was released.
 236     timestamp:      UNIX timestamp of the moment the video was uploaded
 237     upload_date:    Video upload date (YYYYMMDD).
 238                     If not explicitly set, calculated from timestamp.
 239     uploader_id:    Nickname or id of the video uploader.
 240     uploader_url:   Full URL to a personal webpage of the video uploader.
 241     channel:        Full name of the channel the video is uploaded on.
 242                     Note that channel fields may or may not repeat uploader
 243                     fields. This depends on a particular extractor.
 244     channel_id:     Id of the channel.
 245     channel_url:    Full URL to a channel webpage.
 246     location:       Physical location where the video was filmed.
 247     subtitles:      The available subtitles as a dictionary in the format
 248                     {tag: subformats}. "tag" is usually a language code, and
 249                     "subformats" is a list sorted from lower to higher
 250                     preference, each element is a dictionary with the "ext"
 251                     entry and one of:
 252                         * "data": The subtitles file contents
 253                         * "url": A URL pointing to the subtitles file
 254                     "ext" will be calculated from URL if missing
 255     automatic_captions: Like 'subtitles'; contains automatically generated
 256                     captions instead of normal subtitles
 257     duration:       Length of the video in seconds, as an integer or float.
 258     view_count:     How many users have watched the video on the platform.
 259     like_count:     Number of positive ratings of the video
 260     dislike_count:  Number of negative ratings of the video
 261     repost_count:   Number of reposts of the video
 262     average_rating: Average rating give by users, the scale used depends on the webpage
 263     comment_count:  Number of comments on the video
 264     comments:       A list of comments, each with one or more of the following
 265                     properties (all but one of text or html optional):
 266                         * "author" - human-readable name of the comment author
 267                         * "author_id" - user ID of the comment author
 268                         * "author_thumbnail" - The thumbnail of the comment author
 269                         * "id" - Comment ID
 270                         * "html" - Comment as HTML
 271                         * "text" - Plain text of the comment
 272                         * "timestamp" - UNIX timestamp of comment
 273                         * "parent" - ID of the comment this one is replying to.
 274                                      Set to "root" to indicate that this is a
 275                                      comment to the original video.
 276                         * "like_count" - Number of positive ratings of the comment
 277                         * "dislike_count" - Number of negative ratings of the comment
 278                         * "is_favorited" - Whether the comment is marked as
 279                                            favorite by the video uploader
 280                         * "author_is_uploader" - Whether the comment is made by
 281                                                  the video uploader
 282     age_limit:      Age restriction for the video, as an integer (years)
 283     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 284                     should allow to get the same result again. (It will be set
 285                     by YoutubeDL if it's missing)
 286     categories:     A list of categories that the video falls in, for example
 287                     ["Sports", "Berlin"]
 288     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 289     is_live:        True, False, or None (=unknown). Whether this video is a
 290                     live stream that goes on instead of a fixed-length video.
 291     was_live:       True, False, or None (=unknown). Whether this video was
 292                     originally a live stream.
 293     start_time:     Time in seconds where the reproduction should start, as
 294                     specified in the URL.
 295     end_time:       Time in seconds where the reproduction should end, as
 296                     specified in the URL.
 297     chapters:       A list of dictionaries, with the following entries:
 298                         * "start_time" - The start time of the chapter in seconds
 299                         * "end_time" - The end time of the chapter in seconds
 300                         * "title" (optional, string)
 301     playable_in_embed: Whether this video is allowed to play in embedded
 302                     players on other sites. Can be True (=always allowed),
 303                     False (=never allowed), None (=unknown), or a string
 304                     specifying the criteria for embedability (Eg: 'whitelist')
 305     availability:   Under what condition the video is available. One of
 306                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 307                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 308                     to set it
 309     __post_extractor: A function to be called just before the metadata is
 310                     written to either disk, logger or console. The function
 311                     must return a dict which will be added to the info_dict.
 312                     This is usefull for additional information that is
 313                     time-consuming to extract. Note that the fields thus
 314                     extracted will not be available to output template and
 315                     match_filter. So, only "comments" and "comment_count" are
 316                     currently allowed to be extracted via this method.
 317
 318     The following fields should only be used when the video belongs to some logical
 319     chapter or section:
 320
 321     chapter:        Name or title of the chapter the video belongs to.
 322     chapter_number: Number of the chapter the video belongs to, as an integer.
 323     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 324
 325     The following fields should only be used when the video is an episode of some
 326     series, programme or podcast:
 327
 328     series:         Title of the series or programme the video episode belongs to.
 329     season:         Title of the season the video episode belongs to.
 330     season_number:  Number of the season the video episode belongs to, as an integer.
 331     season_id:      Id of the season the video episode belongs to, as a unicode string.
 332     episode:        Title of the video episode. Unlike mandatory video title field,
 333                     this field should denote the exact title of the video episode
 334                     without any kind of decoration.
 335     episode_number: Number of the video episode within a season, as an integer.
 336     episode_id:     Id of the video episode, as a unicode string.
 337
 338     The following fields should only be used when the media is a track or a part of
 339     a music album:
 340
 341     track:          Title of the track.
 342     track_number:   Number of the track within an album or a disc, as an integer.
 343     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 344                     as a unicode string.
 345     artist:         Artist(s) of the track.
 346     genre:          Genre(s) of the track.
 347     album:          Title of the album the track belongs to.
 348     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 349     album_artist:   List of all artists appeared on the album (e.g.
 350                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 351                     and compilations).
 352     disc_number:    Number of the disc or other physical medium the track belongs to,
 353                     as an integer.
 354     release_year:   Year (YYYY) when the album was released.
 355
 356     Unless mentioned otherwise, the fields should be Unicode strings.
 357
 358     Unless mentioned otherwise, None is equivalent to absence of information.
 359
 360
 361     _type "playlist" indicates multiple videos.
 362     There must be a key "entries", which is a list, an iterable, or a PagedList
 363     object, each element of which is a valid dictionary by this specification.
 364
 365     Additionally, playlists can have "id", "title", and any other relevent
 366     attributes with the same semantics as videos (see above).
 367
 368
 369     _type "multi_video" indicates that there are multiple videos that
 370     form a single show, for examples multiple acts of an opera or TV episode.
 371     It must have an entries key like a playlist and contain all the keys
 372     required for a video at the same time.
 373
 374
 375     _type "url" indicates that the video must be extracted from another
 376     location, possibly by a different extractor. Its only required key is:
 377     "url" - the next URL to extract.
 378     The key "ie_key" can be set to the class name (minus the trailing "IE",
 379     e.g. "Youtube") if the extractor class is known in advance.
 380     Additionally, the dictionary may have any properties of the resolved entity
 381     known in advance, for example "title" if the title of the referred video is
 382     known ahead of time.
 383
 384
 385     _type "url_transparent" entities have the same specification as "url", but
 386     indicate that the given additional information is more precise than the one
 387     associated with the resolved URL.
 388     This is useful when a site employs a video service that hosts the video and
 389     its technical metadata, but that video service does not embed a useful
 390     title, description etc.
 391
 392
 393     Subclasses of this one should re-define the _real_initialize() and
 394     _real_extract() methods and define a _VALID_URL regexp.
 395     Probably, they should also be added to the list of extractors.
 396
 397     _GEO_BYPASS attribute may be set to False in order to disable
 398     geo restriction bypass mechanisms for a particular extractor.
 399     Though it won't disable explicit geo restriction bypass based on
 400     country code provided with geo_bypass_country.
 401
 402     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 403     countries for this extractor. One of these countries will be used by
 404     geo restriction bypass mechanism right away in order to bypass
 405     geo restriction, of course, if the mechanism is not disabled.
 406
 407     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 408     IP blocks in CIDR notation for this extractor. One of these IP blocks
 409     will be used by geo restriction bypass mechanism similarly
 410     to _GEO_COUNTRIES.
 411
 412     Finally, the _WORKING attribute should be set to False for broken IEs
 413     in order to warn the users and skip the tests.
 414     """
 415
 416     _ready = False
 417     _downloader = None
 418     _x_forwarded_for_ip = None
 419     _GEO_BYPASS = True
 420     _GEO_COUNTRIES = None
 421     _GEO_IP_BLOCKS = None
 422     _WORKING = True
 423
 424     def __init__(self, downloader=None):
 425         """Constructor. Receives an optional downloader."""
 426         self._ready = False
 427         self._x_forwarded_for_ip = None
 428         self.set_downloader(downloader)
 429
 430     @classmethod
 431     def suitable(cls, url):
 432         """Receives a URL and returns True if suitable for this IE."""
 433
 434         # This does not use has/getattr intentionally - we want to know whether
 435         # we have cached the regexp for *this* class, whereas getattr would also
 436         # match the superclass
 437         if '_VALID_URL_RE' not in cls.__dict__:
 438             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 439         return cls._VALID_URL_RE.match(url) is not None
 440
 441     @classmethod
 442     def _match_id(cls, url):
 443         if '_VALID_URL_RE' not in cls.__dict__:
 444             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 445         m = cls._VALID_URL_RE.match(url)
 446         assert m
 447         return compat_str(m.group('id'))
 448
 449     @classmethod
 450     def working(cls):
 451         """Getter method for _WORKING."""
 452         return cls._WORKING
 453
 454     def initialize(self):
 455         """Initializes an instance (authentication, etc)."""
 456         self._initialize_geo_bypass({
 457             'countries': self._GEO_COUNTRIES,
 458             'ip_blocks': self._GEO_IP_BLOCKS,
 459         })
 460         if not self._ready:
 461             self._real_initialize()
 462             self._ready = True
 463
 464     def _initialize_geo_bypass(self, geo_bypass_context):
 465         """
 466         Initialize geo restriction bypass mechanism.
 467
 468         This method is used to initialize geo bypass mechanism based on faking
 469         X-Forwarded-For HTTP header. A random country from provided country list
 470         is selected and a random IP belonging to this country is generated. This
 471         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 472         HTTP requests.
 473
 474         This method will be used for initial geo bypass mechanism initialization
 475         during the instance initialization with _GEO_COUNTRIES and
 476         _GEO_IP_BLOCKS.
 477
 478         You may also manually call it from extractor's code if geo bypass
 479         information is not available beforehand (e.g. obtained during
 480         extraction) or due to some other reason. In this case you should pass
 481         this information in geo bypass context passed as first argument. It may
 482         contain following fields:
 483
 484         countries:  List of geo unrestricted countries (similar
 485                     to _GEO_COUNTRIES)
 486         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 487                     (similar to _GEO_IP_BLOCKS)
 488
 489         """
 490         if not self._x_forwarded_for_ip:
 491
 492             # Geo bypass mechanism is explicitly disabled by user
 493             if not self._downloader.params.get('geo_bypass', True):
 494                 return
 495
 496             if not geo_bypass_context:
 497                 geo_bypass_context = {}
 498
 499             # Backward compatibility: previously _initialize_geo_bypass
 500             # expected a list of countries, some 3rd party code may still use
 501             # it this way
 502             if isinstance(geo_bypass_context, (list, tuple)):
 503                 geo_bypass_context = {
 504                     'countries': geo_bypass_context,
 505                 }
 506
 507             # The whole point of geo bypass mechanism is to fake IP
 508             # as X-Forwarded-For HTTP header based on some IP block or
 509             # country code.
 510
 511             # Path 1: bypassing based on IP block in CIDR notation
 512
 513             # Explicit IP block specified by user, use it right away
 514             # regardless of whether extractor is geo bypassable or not
 515             ip_block = self._downloader.params.get('geo_bypass_ip_block', None)
 516
 517             # Otherwise use random IP block from geo bypass context but only
 518             # if extractor is known as geo bypassable
 519             if not ip_block:
 520                 ip_blocks = geo_bypass_context.get('ip_blocks')
 521                 if self._GEO_BYPASS and ip_blocks:
 522                     ip_block = random.choice(ip_blocks)
 523
 524             if ip_block:
 525                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 526                 if self._downloader.params.get('verbose', False):
 527                     self._downloader.to_screen(
 528                         '[debug] Using fake IP %s as X-Forwarded-For.'
 529                         % self._x_forwarded_for_ip)
 530                 return
 531
 532             # Path 2: bypassing based on country code
 533
 534             # Explicit country code specified by user, use it right away
 535             # regardless of whether extractor is geo bypassable or not
 536             country = self._downloader.params.get('geo_bypass_country', None)
 537
 538             # Otherwise use random country code from geo bypass context but
 539             # only if extractor is known as geo bypassable
 540             if not country:
 541                 countries = geo_bypass_context.get('countries')
 542                 if self._GEO_BYPASS and countries:
 543                     country = random.choice(countries)
 544
 545             if country:
 546                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 547                 if self._downloader.params.get('verbose', False):
 548                     self._downloader.to_screen(
 549                         '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
 550                         % (self._x_forwarded_for_ip, country.upper()))
 551
 552     def extract(self, url):
 553         """Extracts URL information and returns it in list of dicts."""
 554         try:
 555             for _ in range(2):
 556                 try:
 557                     self.initialize()
 558                     ie_result = self._real_extract(url)
 559                     if self._x_forwarded_for_ip:
 560                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 561                     return ie_result
 562                 except GeoRestrictedError as e:
 563                     if self.__maybe_fake_ip_and_retry(e.countries):
 564                         continue
 565                     raise
 566         except ExtractorError:
 567             raise
 568         except compat_http_client.IncompleteRead as e:
 569             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 570         except (KeyError, StopIteration) as e:
 571             raise ExtractorError('An extractor error has occurred.', cause=e)
 572
 573     def __maybe_fake_ip_and_retry(self, countries):
 574         if (not self._downloader.params.get('geo_bypass_country', None)
 575                 and self._GEO_BYPASS
 576                 and self._downloader.params.get('geo_bypass', True)
 577                 and not self._x_forwarded_for_ip
 578                 and countries):
 579             country_code = random.choice(countries)
 580             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 581             if self._x_forwarded_for_ip:
 582                 self.report_warning(
 583                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 584                     % (self._x_forwarded_for_ip, country_code.upper()))
 585                 return True
 586         return False
 587
 588     def set_downloader(self, downloader):
 589         """Sets the downloader for this IE."""
 590         self._downloader = downloader
 591
 592     def _real_initialize(self):
 593         """Real initialization process. Redefine in subclasses."""
 594         pass
 595
 596     def _real_extract(self, url):
 597         """Real extraction process. Redefine in subclasses."""
 598         pass
 599
 600     @classmethod
 601     def ie_key(cls):
 602         """A string for getting the InfoExtractor with get_info_extractor"""
 603         return compat_str(cls.__name__[:-2])
 604
 605     @property
 606     def IE_NAME(self):
 607         return compat_str(type(self).__name__[:-2])
 608
 609     @staticmethod
 610     def __can_accept_status_code(err, expected_status):
 611         assert isinstance(err, compat_urllib_error.HTTPError)
 612         if expected_status is None:
 613             return False
 614         if isinstance(expected_status, compat_integer_types):
 615             return err.code == expected_status
 616         elif isinstance(expected_status, (list, tuple)):
 617             return err.code in expected_status
 618         elif callable(expected_status):
 619             return expected_status(err.code) is True
 620         else:
 621             assert False
 622
 623     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
 624         """
 625         Return the response handle.
 626
 627         See _download_webpage docstring for arguments specification.
 628         """
 629         if not self._downloader._first_webpage_request:
 630             sleep_interval = float_or_none(self._downloader.params.get('sleep_interval_requests')) or 0
 631             if sleep_interval > 0:
 632                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 633                 time.sleep(sleep_interval)
 634         else:
 635             self._downloader._first_webpage_request = False
 636
 637         if note is None:
 638             self.report_download_webpage(video_id)
 639         elif note is not False:
 640             if video_id is None:
 641                 self.to_screen('%s' % (note,))
 642             else:
 643                 self.to_screen('%s: %s' % (video_id, note))
 644
 645         # Some sites check X-Forwarded-For HTTP header in order to figure out
 646         # the origin of the client behind proxy. This allows bypassing geo
 647         # restriction by faking this header's value to IP that belongs to some
 648         # geo unrestricted country. We will do so once we encounter any
 649         # geo restriction error.
 650         if self._x_forwarded_for_ip:
 651             if 'X-Forwarded-For' not in headers:
 652                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 653
 654         if isinstance(url_or_request, compat_urllib_request.Request):
 655             url_or_request = update_Request(
 656                 url_or_request, data=data, headers=headers, query=query)
 657         else:
 658             if query:
 659                 url_or_request = update_url_query(url_or_request, query)
 660             if data is not None or headers:
 661                 url_or_request = sanitized_Request(url_or_request, data, headers)
 662         exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
 663         if hasattr(ssl, 'CertificateError'):
 664             exceptions.append(ssl.CertificateError)
 665         try:
 666             return self._downloader.urlopen(url_or_request)
 667         except tuple(exceptions) as err:
 668             if isinstance(err, compat_urllib_error.HTTPError):
 669                 if self.__can_accept_status_code(err, expected_status):
 670                     # Retain reference to error to prevent file object from
 671                     # being closed before it can be read. Works around the
 672                     # effects of <https://bugs.python.org/issue15002>
 673                     # introduced in Python 3.4.1.
 674                     err.fp._error = err
 675                     return err.fp
 676
 677             if errnote is False:
 678                 return False
 679             if errnote is None:
 680                 errnote = 'Unable to download webpage'
 681
 682             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 683             if fatal:
 684                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 685             else:
 686                 self._downloader.report_warning(errmsg)
 687                 return False
 688
 689     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
 690         """
 691         Return a tuple (page content as string, URL handle).
 692
 693         See _download_webpage docstring for arguments specification.
 694         """
 695         # Strip hashes from the URL (#1038)
 696         if isinstance(url_or_request, (compat_str, str)):
 697             url_or_request = url_or_request.partition('#')[0]
 698
 699         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 700         if urlh is False:
 701             assert not fatal
 702             return False
 703         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 704         return (content, urlh)
 705
 706     @staticmethod
 707     def _guess_encoding_from_content(content_type, webpage_bytes):
 708         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 709         if m:
 710             encoding = m.group(1)
 711         else:
 712             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 713                           webpage_bytes[:1024])
 714             if m:
 715                 encoding = m.group(1).decode('ascii')
 716             elif webpage_bytes.startswith(b'\xff\xfe'):
 717                 encoding = 'utf-16'
 718             else:
 719                 encoding = 'utf-8'
 720
 721         return encoding
 722
 723     def __check_blocked(self, content):
 724         first_block = content[:512]
 725         if ('<title>Access to this site is blocked</title>' in content
 726                 and 'Websense' in first_block):
 727             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 728             blocked_iframe = self._html_search_regex(
 729                 r'<iframe src="([^"]+)"', content,
 730                 'Websense information URL', default=None)
 731             if blocked_iframe:
 732                 msg += ' Visit %s for more details' % blocked_iframe
 733             raise ExtractorError(msg, expected=True)
 734         if '<title>The URL you requested has been blocked</title>' in first_block:
 735             msg = (
 736                 'Access to this webpage has been blocked by Indian censorship. '
 737                 'Use a VPN or proxy server (with --proxy) to route around it.')
 738             block_msg = self._html_search_regex(
 739                 r'</h1><p>(.*?)</p>',
 740                 content, 'block message', default=None)
 741             if block_msg:
 742                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 743             raise ExtractorError(msg, expected=True)
 744         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 745                 and 'blocklist.rkn.gov.ru' in content):
 746             raise ExtractorError(
 747                 'Access to this webpage has been blocked by decision of the Russian government. '
 748                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 749                 expected=True)
 750
 751     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 752         content_type = urlh.headers.get('Content-Type', '')
 753         webpage_bytes = urlh.read()
 754         if prefix is not None:
 755             webpage_bytes = prefix + webpage_bytes
 756         if not encoding:
 757             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 758         if self._downloader.params.get('dump_intermediate_pages', False):
 759             self.to_screen('Dumping request to ' + urlh.geturl())
 760             dump = base64.b64encode(webpage_bytes).decode('ascii')
 761             self._downloader.to_screen(dump)
 762         if self._downloader.params.get('write_pages', False):
 763             basen = '%s_%s' % (video_id, urlh.geturl())
 764             if len(basen) > 240:
 765                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 766                 basen = basen[:240 - len(h)] + h
 767             raw_filename = basen + '.dump'
 768             filename = sanitize_filename(raw_filename, restricted=True)
 769             self.to_screen('Saving request to ' + filename)
 770             # Working around MAX_PATH limitation on Windows (see
 771             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 772             if compat_os_name == 'nt':
 773                 absfilepath = os.path.abspath(filename)
 774                 if len(absfilepath) > 259:
 775                     filename = '\\\\?\\' + absfilepath
 776             with open(filename, 'wb') as outf:
 777                 outf.write(webpage_bytes)
 778
 779         try:
 780             content = webpage_bytes.decode(encoding, 'replace')
 781         except LookupError:
 782             content = webpage_bytes.decode('utf-8', 'replace')
 783
 784         self.__check_blocked(content)
 785
 786         return content
 787
 788     def _download_webpage(
 789             self, url_or_request, video_id, note=None, errnote=None,
 790             fatal=True, tries=1, timeout=5, encoding=None, data=None,
 791             headers={}, query={}, expected_status=None):
 792         """
 793         Return the data of the page as a string.
 794
 795         Arguments:
 796         url_or_request -- plain text URL as a string or
 797             a compat_urllib_request.Requestobject
 798         video_id -- Video/playlist/item identifier (string)
 799
 800         Keyword arguments:
 801         note -- note printed before downloading (string)
 802         errnote -- note printed in case of an error (string)
 803         fatal -- flag denoting whether error should be considered fatal,
 804             i.e. whether it should cause ExtractionError to be raised,
 805             otherwise a warning will be reported and extraction continued
 806         tries -- number of tries
 807         timeout -- sleep interval between tries
 808         encoding -- encoding for a page content decoding, guessed automatically
 809             when not explicitly specified
 810         data -- POST data (bytes)
 811         headers -- HTTP headers (dict)
 812         query -- URL query (dict)
 813         expected_status -- allows to accept failed HTTP requests (non 2xx
 814             status code) by explicitly specifying a set of accepted status
 815             codes. Can be any of the following entities:
 816                 - an integer type specifying an exact failed status code to
 817                   accept
 818                 - a list or a tuple of integer types specifying a list of
 819                   failed status codes to accept
 820                 - a callable accepting an actual failed status code and
 821                   returning True if it should be accepted
 822             Note that this argument does not affect success status codes (2xx)
 823             which are always accepted.
 824         """
 825
 826         success = False
 827         try_count = 0
 828         while success is False:
 829             try:
 830                 res = self._download_webpage_handle(
 831                     url_or_request, video_id, note, errnote, fatal,
 832                     encoding=encoding, data=data, headers=headers, query=query,
 833                     expected_status=expected_status)
 834                 success = True
 835             except compat_http_client.IncompleteRead as e:
 836                 try_count += 1
 837                 if try_count >= tries:
 838                     raise e
 839                 self._sleep(timeout, video_id)
 840         if res is False:
 841             return res
 842         else:
 843             content, _ = res
 844             return content
 845
 846     def _download_xml_handle(
 847             self, url_or_request, video_id, note='Downloading XML',
 848             errnote='Unable to download XML', transform_source=None,
 849             fatal=True, encoding=None, data=None, headers={}, query={},
 850             expected_status=None):
 851         """
 852         Return a tuple (xml as an compat_etree_Element, URL handle).
 853
 854         See _download_webpage docstring for arguments specification.
 855         """
 856         res = self._download_webpage_handle(
 857             url_or_request, video_id, note, errnote, fatal=fatal,
 858             encoding=encoding, data=data, headers=headers, query=query,
 859             expected_status=expected_status)
 860         if res is False:
 861             return res
 862         xml_string, urlh = res
 863         return self._parse_xml(
 864             xml_string, video_id, transform_source=transform_source,
 865             fatal=fatal), urlh
 866
 867     def _download_xml(
 868             self, url_or_request, video_id,
 869             note='Downloading XML', errnote='Unable to download XML',
 870             transform_source=None, fatal=True, encoding=None,
 871             data=None, headers={}, query={}, expected_status=None):
 872         """
 873         Return the xml as an compat_etree_Element.
 874
 875         See _download_webpage docstring for arguments specification.
 876         """
 877         res = self._download_xml_handle(
 878             url_or_request, video_id, note=note, errnote=errnote,
 879             transform_source=transform_source, fatal=fatal, encoding=encoding,
 880             data=data, headers=headers, query=query,
 881             expected_status=expected_status)
 882         return res if res is False else res[0]
 883
 884     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 885         if transform_source:
 886             xml_string = transform_source(xml_string)
 887         try:
 888             return compat_etree_fromstring(xml_string.encode('utf-8'))
 889         except compat_xml_parse_error as ve:
 890             errmsg = '%s: Failed to parse XML ' % video_id
 891             if fatal:
 892                 raise ExtractorError(errmsg, cause=ve)
 893             else:
 894                 self.report_warning(errmsg + str(ve))
 895
 896     def _download_json_handle(
 897             self, url_or_request, video_id, note='Downloading JSON metadata',
 898             errnote='Unable to download JSON metadata', transform_source=None,
 899             fatal=True, encoding=None, data=None, headers={}, query={},
 900             expected_status=None):
 901         """
 902         Return a tuple (JSON object, URL handle).
 903
 904         See _download_webpage docstring for arguments specification.
 905         """
 906         res = self._download_webpage_handle(
 907             url_or_request, video_id, note, errnote, fatal=fatal,
 908             encoding=encoding, data=data, headers=headers, query=query,
 909             expected_status=expected_status)
 910         if res is False:
 911             return res
 912         json_string, urlh = res
 913         return self._parse_json(
 914             json_string, video_id, transform_source=transform_source,
 915             fatal=fatal), urlh
 916
 917     def _download_json(
 918             self, url_or_request, video_id, note='Downloading JSON metadata',
 919             errnote='Unable to download JSON metadata', transform_source=None,
 920             fatal=True, encoding=None, data=None, headers={}, query={},
 921             expected_status=None):
 922         """
 923         Return the JSON object as a dict.
 924
 925         See _download_webpage docstring for arguments specification.
 926         """
 927         res = self._download_json_handle(
 928             url_or_request, video_id, note=note, errnote=errnote,
 929             transform_source=transform_source, fatal=fatal, encoding=encoding,
 930             data=data, headers=headers, query=query,
 931             expected_status=expected_status)
 932         return res if res is False else res[0]
 933
 934     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 935         if transform_source:
 936             json_string = transform_source(json_string)
 937         try:
 938             return json.loads(json_string)
 939         except ValueError as ve:
 940             errmsg = '%s: Failed to parse JSON ' % video_id
 941             if fatal:
 942                 raise ExtractorError(errmsg, cause=ve)
 943             else:
 944                 self.report_warning(errmsg + str(ve))
 945
 946     def report_warning(self, msg, video_id=None):
 947         idstr = '' if video_id is None else '%s: ' % video_id
 948         self._downloader.report_warning(
 949             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 950
 951     def to_screen(self, msg):
 952         """Print msg to screen, prefixing it with '[ie_name]'"""
 953         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 954
 955     def report_extraction(self, id_or_name):
 956         """Report information extraction."""
 957         self.to_screen('%s: Extracting information' % id_or_name)
 958
 959     def report_download_webpage(self, video_id):
 960         """Report webpage download."""
 961         self.to_screen('%s: Downloading webpage' % video_id)
 962
 963     def report_age_confirmation(self):
 964         """Report attempt to confirm age."""
 965         self.to_screen('Confirming age')
 966
 967     def report_login(self):
 968         """Report attempt to log in."""
 969         self.to_screen('Logging in')
 970
 971     @staticmethod
 972     def raise_login_required(msg='This video is only available for registered users'):
 973         raise ExtractorError(
 974             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 975             expected=True)
 976
 977     @staticmethod
 978     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
 979         raise GeoRestrictedError(msg, countries=countries)
 980
 981     # Methods for following #608
 982     @staticmethod
 983     def url_result(url, ie=None, video_id=None, video_title=None):
 984         """Returns a URL that points to a page that should be processed"""
 985         # TODO: ie should be the class used for getting the info
 986         video_info = {'_type': 'url',
 987                       'url': url,
 988                       'ie_key': ie}
 989         if video_id is not None:
 990             video_info['id'] = video_id
 991         if video_title is not None:
 992             video_info['title'] = video_title
 993         return video_info
 994
 995     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
 996         urls = orderedSet(
 997             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
 998             for m in matches)
 999         return self.playlist_result(
1000             urls, playlist_id=playlist_id, playlist_title=playlist_title)
1001
1002     @staticmethod
1003     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
1004         """Returns a playlist"""
1005         video_info = {'_type': 'playlist',
1006                       'entries': entries}
1007         video_info.update(kwargs)
1008         if playlist_id:
1009             video_info['id'] = playlist_id
1010         if playlist_title:
1011             video_info['title'] = playlist_title
1012         if playlist_description is not None:
1013             video_info['description'] = playlist_description
1014         return video_info
1015
1016     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1017         """
1018         Perform a regex search on the given string, using a single or a list of
1019         patterns returning the first matching group.
1020         In case of failure return a default value or raise a WARNING or a
1021         RegexNotFoundError, depending on fatal, specifying the field name.
1022         """
1023         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
1024             mobj = re.search(pattern, string, flags)
1025         else:
1026             for p in pattern:
1027                 mobj = re.search(p, string, flags)
1028                 if mobj:
1029                     break
1030
1031         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
1032             _name = '\033[0;34m%s\033[0m' % name
1033         else:
1034             _name = name
1035
1036         if mobj:
1037             if group is None:
1038                 # return the first matching group
1039                 return next(g for g in mobj.groups() if g is not None)
1040             else:
1041                 return mobj.group(group)
1042         elif default is not NO_DEFAULT:
1043             return default
1044         elif fatal:
1045             raise RegexNotFoundError('Unable to extract %s' % _name)
1046         else:
1047             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
1048             return None
1049
1050     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1051         """
1052         Like _search_regex, but strips HTML tags and unescapes entities.
1053         """
1054         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1055         if res:
1056             return clean_html(res).strip()
1057         else:
1058             return res
1059
1060     def _get_netrc_login_info(self, netrc_machine=None):
1061         username = None
1062         password = None
1063         netrc_machine = netrc_machine or self._NETRC_MACHINE
1064
1065         if self._downloader.params.get('usenetrc', False):
1066             try:
1067                 info = netrc.netrc().authenticators(netrc_machine)
1068                 if info is not None:
1069                     username = info[0]
1070                     password = info[2]
1071                 else:
1072                     raise netrc.NetrcParseError(
1073                         'No authenticators for %s' % netrc_machine)
1074             except (IOError, netrc.NetrcParseError) as err:
1075                 self._downloader.report_warning(
1076                     'parsing .netrc: %s' % error_to_compat_str(err))
1077
1078         return username, password
1079
1080     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1081         """
1082         Get the login info as (username, password)
1083         First look for the manually specified credentials using username_option
1084         and password_option as keys in params dictionary. If no such credentials
1085         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1086         value.
1087         If there's no info available, return (None, None)
1088         """
1089         if self._downloader is None:
1090             return (None, None)
1091
1092         downloader_params = self._downloader.params
1093
1094         # Attempt to use provided username and password or .netrc data
1095         if downloader_params.get(username_option) is not None:
1096             username = downloader_params[username_option]
1097             password = downloader_params[password_option]
1098         else:
1099             username, password = self._get_netrc_login_info(netrc_machine)
1100
1101         return username, password
1102
1103     def _get_tfa_info(self, note='two-factor verification code'):
1104         """
1105         Get the two-factor authentication info
1106         TODO - asking the user will be required for sms/phone verify
1107         currently just uses the command line option
1108         If there's no info available, return None
1109         """
1110         if self._downloader is None:
1111             return None
1112         downloader_params = self._downloader.params
1113
1114         if downloader_params.get('twofactor') is not None:
1115             return downloader_params['twofactor']
1116
1117         return compat_getpass('Type %s and press [Return]: ' % note)
1118
1119     # Helper functions for extracting OpenGraph info
1120     @staticmethod
1121     def _og_regexes(prop):
1122         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1123         property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
1124                        % {'prop': re.escape(prop)})
1125         template = r'<meta[^>]+?%s[^>]+?%s'
1126         return [
1127             template % (property_re, content_re),
1128             template % (content_re, property_re),
1129         ]
1130
1131     @staticmethod
1132     def _meta_regex(prop):
1133         return r'''(?isx)<meta
1134                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1135                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1136
1137     def _og_search_property(self, prop, html, name=None, **kargs):
1138         if not isinstance(prop, (list, tuple)):
1139             prop = [prop]
1140         if name is None:
1141             name = 'OpenGraph %s' % prop[0]
1142         og_regexes = []
1143         for p in prop:
1144             og_regexes.extend(self._og_regexes(p))
1145         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1146         if escaped is None:
1147             return None
1148         return unescapeHTML(escaped)
1149
1150     def _og_search_thumbnail(self, html, **kargs):
1151         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1152
1153     def _og_search_description(self, html, **kargs):
1154         return self._og_search_property('description', html, fatal=False, **kargs)
1155
1156     def _og_search_title(self, html, **kargs):
1157         return self._og_search_property('title', html, **kargs)
1158
1159     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1160         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1161         if secure:
1162             regexes = self._og_regexes('video:secure_url') + regexes
1163         return self._html_search_regex(regexes, html, name, **kargs)
1164
1165     def _og_search_url(self, html, **kargs):
1166         return self._og_search_property('url', html, **kargs)
1167
1168     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1169         if not isinstance(name, (list, tuple)):
1170             name = [name]
1171         if display_name is None:
1172             display_name = name[0]
1173         return self._html_search_regex(
1174             [self._meta_regex(n) for n in name],
1175             html, display_name, fatal=fatal, group='content', **kwargs)
1176
1177     def _dc_search_uploader(self, html):
1178         return self._html_search_meta('dc.creator', html, 'uploader')
1179
1180     def _rta_search(self, html):
1181         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1182         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1183                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1184                      html):
1185             return 18
1186         return 0
1187
1188     def _media_rating_search(self, html):
1189         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1190         rating = self._html_search_meta('rating', html)
1191
1192         if not rating:
1193             return None
1194
1195         RATING_TABLE = {
1196             'safe for kids': 0,
1197             'general': 8,
1198             '14 years': 14,
1199             'mature': 17,
1200             'restricted': 19,
1201         }
1202         return RATING_TABLE.get(rating.lower())
1203
1204     def _family_friendly_search(self, html):
1205         # See http://schema.org/VideoObject
1206         family_friendly = self._html_search_meta(
1207             'isFamilyFriendly', html, default=None)
1208
1209         if not family_friendly:
1210             return None
1211
1212         RATING_TABLE = {
1213             '1': 0,
1214             'true': 0,
1215             '0': 18,
1216             'false': 18,
1217         }
1218         return RATING_TABLE.get(family_friendly.lower())
1219
1220     def _twitter_search_player(self, html):
1221         return self._html_search_meta('twitter:player', html,
1222                                       'twitter card player')
1223
1224     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1225         json_ld_list = list(re.finditer(JSON_LD_RE, html))
1226         default = kwargs.get('default', NO_DEFAULT)
1227         # JSON-LD may be malformed and thus `fatal` should be respected.
1228         # At the same time `default` may be passed that assumes `fatal=False`
1229         # for _search_regex. Let's simulate the same behavior here as well.
1230         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
1231         json_ld = []
1232         for mobj in json_ld_list:
1233             json_ld_item = self._parse_json(
1234                 mobj.group('json_ld'), video_id, fatal=fatal)
1235             if not json_ld_item:
1236                 continue
1237             if isinstance(json_ld_item, dict):
1238                 json_ld.append(json_ld_item)
1239             elif isinstance(json_ld_item, (list, tuple)):
1240                 json_ld.extend(json_ld_item)
1241         if json_ld:
1242             json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1243         if json_ld:
1244             return json_ld
1245         if default is not NO_DEFAULT:
1246             return default
1247         elif fatal:
1248             raise RegexNotFoundError('Unable to extract JSON-LD')
1249         else:
1250             self._downloader.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1251             return {}
1252
1253     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1254         if isinstance(json_ld, compat_str):
1255             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1256         if not json_ld:
1257             return {}
1258         info = {}
1259         if not isinstance(json_ld, (list, tuple, dict)):
1260             return info
1261         if isinstance(json_ld, dict):
1262             json_ld = [json_ld]
1263
1264         INTERACTION_TYPE_MAP = {
1265             'CommentAction': 'comment',
1266             'AgreeAction': 'like',
1267             'DisagreeAction': 'dislike',
1268             'LikeAction': 'like',
1269             'DislikeAction': 'dislike',
1270             'ListenAction': 'view',
1271             'WatchAction': 'view',
1272             'ViewAction': 'view',
1273         }
1274
1275         def extract_interaction_type(e):
1276             interaction_type = e.get('interactionType')
1277             if isinstance(interaction_type, dict):
1278                 interaction_type = interaction_type.get('@type')
1279             return str_or_none(interaction_type)
1280
1281         def extract_interaction_statistic(e):
1282             interaction_statistic = e.get('interactionStatistic')
1283             if isinstance(interaction_statistic, dict):
1284                 interaction_statistic = [interaction_statistic]
1285             if not isinstance(interaction_statistic, list):
1286                 return
1287             for is_e in interaction_statistic:
1288                 if not isinstance(is_e, dict):
1289                     continue
1290                 if is_e.get('@type') != 'InteractionCounter':
1291                     continue
1292                 interaction_type = extract_interaction_type(is_e)
1293                 if not interaction_type:
1294                     continue
1295                 # For interaction count some sites provide string instead of
1296                 # an integer (as per spec) with non digit characters (e.g. ",")
1297                 # so extracting count with more relaxed str_to_int
1298                 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1299                 if interaction_count is None:
1300                     continue
1301                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1302                 if not count_kind:
1303                     continue
1304                 count_key = '%s_count' % count_kind
1305                 if info.get(count_key) is not None:
1306                     continue
1307                 info[count_key] = interaction_count
1308
1309         def extract_video_object(e):
1310             assert e['@type'] == 'VideoObject'
1311             author = e.get('author')
1312             info.update({
1313                 'url': url_or_none(e.get('contentUrl')),
1314                 'title': unescapeHTML(e.get('name')),
1315                 'description': unescapeHTML(e.get('description')),
1316                 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
1317                 'duration': parse_duration(e.get('duration')),
1318                 'timestamp': unified_timestamp(e.get('uploadDate')),
1319                 # author can be an instance of 'Organization' or 'Person' types.
1320                 # both types can have 'name' property(inherited from 'Thing' type). [1]
1321                 # however some websites are using 'Text' type instead.
1322                 # 1. https://schema.org/VideoObject
1323                 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
1324                 'filesize': float_or_none(e.get('contentSize')),
1325                 'tbr': int_or_none(e.get('bitrate')),
1326                 'width': int_or_none(e.get('width')),
1327                 'height': int_or_none(e.get('height')),
1328                 'view_count': int_or_none(e.get('interactionCount')),
1329             })
1330             extract_interaction_statistic(e)
1331
1332         for e in json_ld:
1333             if '@context' in e:
1334                 item_type = e.get('@type')
1335                 if expected_type is not None and expected_type != item_type:
1336                     continue
1337                 if item_type in ('TVEpisode', 'Episode'):
1338                     episode_name = unescapeHTML(e.get('name'))
1339                     info.update({
1340                         'episode': episode_name,
1341                         'episode_number': int_or_none(e.get('episodeNumber')),
1342                         'description': unescapeHTML(e.get('description')),
1343                     })
1344                     if not info.get('title') and episode_name:
1345                         info['title'] = episode_name
1346                     part_of_season = e.get('partOfSeason')
1347                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1348                         info.update({
1349                             'season': unescapeHTML(part_of_season.get('name')),
1350                             'season_number': int_or_none(part_of_season.get('seasonNumber')),
1351                         })
1352                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1353                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1354                         info['series'] = unescapeHTML(part_of_series.get('name'))
1355                 elif item_type == 'Movie':
1356                     info.update({
1357                         'title': unescapeHTML(e.get('name')),
1358                         'description': unescapeHTML(e.get('description')),
1359                         'duration': parse_duration(e.get('duration')),
1360                         'timestamp': unified_timestamp(e.get('dateCreated')),
1361                     })
1362                 elif item_type in ('Article', 'NewsArticle'):
1363                     info.update({
1364                         'timestamp': parse_iso8601(e.get('datePublished')),
1365                         'title': unescapeHTML(e.get('headline')),
1366                         'description': unescapeHTML(e.get('articleBody')),
1367                     })
1368                 elif item_type == 'VideoObject':
1369                     extract_video_object(e)
1370                     if expected_type is None:
1371                         continue
1372                     else:
1373                         break
1374                 video = e.get('video')
1375                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1376                     extract_video_object(video)
1377                 if expected_type is None:
1378                     continue
1379                 else:
1380                     break
1381         return dict((k, v) for k, v in info.items() if v is not None)
1382
1383     @staticmethod
1384     def _hidden_inputs(html):
1385         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1386         hidden_inputs = {}
1387         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1388             attrs = extract_attributes(input)
1389             if not input:
1390                 continue
1391             if attrs.get('type') not in ('hidden', 'submit'):
1392                 continue
1393             name = attrs.get('name') or attrs.get('id')
1394             value = attrs.get('value')
1395             if name and value is not None:
1396                 hidden_inputs[name] = value
1397         return hidden_inputs
1398
1399     def _form_hidden_inputs(self, form_id, html):
1400         form = self._search_regex(
1401             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1402             html, '%s form' % form_id, group='form')
1403         return self._hidden_inputs(form)
1404
1405     class FormatSort:
1406         regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
1407
1408         default = ('hidden', 'hasvid', 'ie_pref', 'lang', 'quality',
1409                    'res', 'fps', 'codec:vp9.2', 'size', 'br', 'asr',
1410                    'proto', 'ext', 'has_audio', 'source', 'format_id')  # These must not be aliases
1411
1412         settings = {
1413             'vcodec': {'type': 'ordered', 'regex': True,
1414                        'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1415             'acodec': {'type': 'ordered', 'regex': True,
1416                        'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']},
1417             'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1418                       'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', 'm3u8', '.*dash', '', 'mms|rtsp', 'none', 'f4']},
1419             'vext': {'type': 'ordered', 'field': 'video_ext',
1420                      'order': ('mp4', 'webm', 'flv', '', 'none'),
1421                      'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1422             'aext': {'type': 'ordered', 'field': 'audio_ext',
1423                      'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1424                      'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1425             'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1426             'ie_pref': {'priority': True, 'type': 'extractor'},
1427             'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1428             'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1429             'lang': {'priority': True, 'convert': 'ignore', 'field': 'language_preference'},
1430             'quality': {'convert': 'float_none'},
1431             'filesize': {'convert': 'bytes'},
1432             'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1433             'id': {'convert': 'string', 'field': 'format_id'},
1434             'height': {'convert': 'float_none'},
1435             'width': {'convert': 'float_none'},
1436             'fps': {'convert': 'float_none'},
1437             'tbr': {'convert': 'float_none'},
1438             'vbr': {'convert': 'float_none'},
1439             'abr': {'convert': 'float_none'},
1440             'asr': {'convert': 'float_none'},
1441             'source': {'convert': 'ignore', 'field': 'source_preference'},
1442
1443             'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1444             'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1445             'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1446             'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1447             'res': {'type': 'multiple', 'field': ('height', 'width'), 'function': min},
1448
1449             # Most of these exist only for compatibility reasons
1450             'dimension': {'type': 'alias', 'field': 'res'},
1451             'resolution': {'type': 'alias', 'field': 'res'},
1452             'extension': {'type': 'alias', 'field': 'ext'},
1453             'bitrate': {'type': 'alias', 'field': 'br'},
1454             'total_bitrate': {'type': 'alias', 'field': 'tbr'},
1455             'video_bitrate': {'type': 'alias', 'field': 'vbr'},
1456             'audio_bitrate': {'type': 'alias', 'field': 'abr'},
1457             'framerate': {'type': 'alias', 'field': 'fps'},
1458             'language_preference': {'type': 'alias', 'field': 'lang'},  # not named as 'language' because such a field exists
1459             'protocol': {'type': 'alias', 'field': 'proto'},
1460             'source_preference': {'type': 'alias', 'field': 'source'},
1461             'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1462             'filesize_estimate': {'type': 'alias', 'field': 'size'},
1463             'samplerate': {'type': 'alias', 'field': 'asr'},
1464             'video_ext': {'type': 'alias', 'field': 'vext'},
1465             'audio_ext': {'type': 'alias', 'field': 'aext'},
1466             'video_codec': {'type': 'alias', 'field': 'vcodec'},
1467             'audio_codec': {'type': 'alias', 'field': 'acodec'},
1468             'video': {'type': 'alias', 'field': 'hasvid'},
1469             'has_video': {'type': 'alias', 'field': 'hasvid'},
1470             'audio': {'type': 'alias', 'field': 'hasaud'},
1471             'has_audio': {'type': 'alias', 'field': 'hasaud'},
1472             'extractor': {'type': 'alias', 'field': 'ie_pref'},
1473             'preference': {'type': 'alias', 'field': 'ie_pref'},
1474             'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
1475             'format_id': {'type': 'alias', 'field': 'id'},
1476         }
1477
1478         _order = []
1479
1480         def _get_field_setting(self, field, key):
1481             if field not in self.settings:
1482                 self.settings[field] = {}
1483             propObj = self.settings[field]
1484             if key not in propObj:
1485                 type = propObj.get('type')
1486                 if key == 'field':
1487                     default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1488                 elif key == 'convert':
1489                     default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1490                 else:
1491                     default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,), 'function': max}.get(key, None)
1492                 propObj[key] = default
1493             return propObj[key]
1494
1495         def _resolve_field_value(self, field, value, convertNone=False):
1496             if value is None:
1497                 if not convertNone:
1498                     return None
1499             else:
1500                 value = value.lower()
1501             conversion = self._get_field_setting(field, 'convert')
1502             if conversion == 'ignore':
1503                 return None
1504             if conversion == 'string':
1505                 return value
1506             elif conversion == 'float_none':
1507                 return float_or_none(value)
1508             elif conversion == 'bytes':
1509                 return FileDownloader.parse_bytes(value)
1510             elif conversion == 'order':
1511                 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1512                 use_regex = self._get_field_setting(field, 'regex')
1513                 list_length = len(order_list)
1514                 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1515                 if use_regex and value is not None:
1516                     for i, regex in enumerate(order_list):
1517                         if regex and re.match(regex, value):
1518                             return list_length - i
1519                     return list_length - empty_pos  # not in list
1520                 else:  # not regex or  value = None
1521                     return list_length - (order_list.index(value) if value in order_list else empty_pos)
1522             else:
1523                 if value.isnumeric():
1524                     return float(value)
1525                 else:
1526                     self.settings[field]['convert'] = 'string'
1527                     return value
1528
1529         def evaluate_params(self, params, sort_extractor):
1530             self._use_free_order = params.get('prefer_free_formats', False)
1531             self._sort_user = params.get('format_sort', [])
1532             self._sort_extractor = sort_extractor
1533
1534             def add_item(field, reverse, closest, limit_text):
1535                 field = field.lower()
1536                 if field in self._order:
1537                     return
1538                 self._order.append(field)
1539                 limit = self._resolve_field_value(field, limit_text)
1540                 data = {
1541                     'reverse': reverse,
1542                     'closest': False if limit is None else closest,
1543                     'limit_text': limit_text,
1544                     'limit': limit}
1545                 if field in self.settings:
1546                     self.settings[field].update(data)
1547                 else:
1548                     self.settings[field] = data
1549
1550             sort_list = (
1551                 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1552                 + (tuple() if params.get('format_sort_force', False)
1553                    else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1554                 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1555
1556             for item in sort_list:
1557                 match = re.match(self.regex, item)
1558                 if match is None:
1559                     raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1560                 field = match.group('field')
1561                 if field is None:
1562                     continue
1563                 if self._get_field_setting(field, 'type') == 'alias':
1564                     field = self._get_field_setting(field, 'field')
1565                 reverse = match.group('reverse') is not None
1566                 closest = match.group('separator') == '~'
1567                 limit_text = match.group('limit')
1568
1569                 has_limit = limit_text is not None
1570                 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1571                 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1572
1573                 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1574                 limits = limit_text.split(":") if has_multiple_limits else (limit_text,) if has_limit else tuple()
1575                 limit_count = len(limits)
1576                 for (i, f) in enumerate(fields):
1577                     add_item(f, reverse, closest,
1578                              limits[i] if i < limit_count
1579                              else limits[0] if has_limit and not has_multiple_limits
1580                              else None)
1581
1582         def print_verbose_info(self, to_screen):
1583             to_screen('[debug] Sort order given by user: %s' % ','.join(self._sort_user))
1584             if self._sort_extractor:
1585                 to_screen('[debug] Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1586             to_screen('[debug] Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1587                 '+' if self._get_field_setting(field, 'reverse') else '', field,
1588                 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1589                               self._get_field_setting(field, 'limit_text'),
1590                               self._get_field_setting(field, 'limit'))
1591                 if self._get_field_setting(field, 'limit_text') is not None else '')
1592                 for field in self._order if self._get_field_setting(field, 'visible')]))
1593
1594         def _calculate_field_preference_from_value(self, format, field, type, value):
1595             reverse = self._get_field_setting(field, 'reverse')
1596             closest = self._get_field_setting(field, 'closest')
1597             limit = self._get_field_setting(field, 'limit')
1598
1599             if type == 'extractor':
1600                 maximum = self._get_field_setting(field, 'max')
1601                 if value is None or (maximum is not None and value >= maximum):
1602                     value = -1
1603             elif type == 'boolean':
1604                 in_list = self._get_field_setting(field, 'in_list')
1605                 not_in_list = self._get_field_setting(field, 'not_in_list')
1606                 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1607             elif type == 'ordered':
1608                 value = self._resolve_field_value(field, value, True)
1609
1610             # try to convert to number
1611             val_num = float_or_none(value)
1612             is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1613             if is_num:
1614                 value = val_num
1615
1616             return ((-10, 0) if value is None
1617                     else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
1618                     else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1619                     else (0, value, 0) if not reverse and (limit is None or value <= limit)
1620                     else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1621                     else (-1, value, 0))
1622
1623         def _calculate_field_preference(self, format, field):
1624             type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
1625             get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1626             if type == 'multiple':
1627                 type = 'field'  # Only 'field' is allowed in multiple for now
1628                 actual_fields = self._get_field_setting(field, 'field')
1629
1630                 def wrapped_function(values):
1631                     values = tuple(filter(lambda x: x is not None, values))
1632                     return (self._get_field_setting(field, 'function')(*values) if len(values) > 1
1633                             else values[0] if values
1634                             else None)
1635
1636                 value = wrapped_function((get_value(f) for f in actual_fields))
1637             else:
1638                 value = get_value(field)
1639             return self._calculate_field_preference_from_value(format, field, type, value)
1640
1641         def calculate_preference(self, format):
1642             # Determine missing protocol
1643             if not format.get('protocol'):
1644                 format['protocol'] = determine_protocol(format)
1645
1646             # Determine missing ext
1647             if not format.get('ext') and 'url' in format:
1648                 format['ext'] = determine_ext(format['url'])
1649             if format.get('vcodec') == 'none':
1650                 format['audio_ext'] = format['ext']
1651                 format['video_ext'] = 'none'
1652             else:
1653                 format['video_ext'] = format['ext']
1654                 format['audio_ext'] = 'none'
1655             # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
1656             #    format['preference'] = -1000
1657
1658             # Determine missing bitrates
1659             if format.get('tbr') is None:
1660                 if format.get('vbr') is not None and format.get('abr') is not None:
1661                     format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1662             else:
1663                 if format.get('vcodec') != "none" and format.get('vbr') is None:
1664                     format['vbr'] = format.get('tbr') - format.get('abr', 0)
1665                 if format.get('acodec') != "none" and format.get('abr') is None:
1666                     format['abr'] = format.get('tbr') - format.get('vbr', 0)
1667
1668             return tuple(self._calculate_field_preference(format, field) for field in self._order)
1669
1670     def _sort_formats(self, formats, field_preference=[]):
1671         if not formats:
1672             raise ExtractorError('No video formats found')
1673         format_sort = self.FormatSort()  # params and to_screen are taken from the downloader
1674         format_sort.evaluate_params(self._downloader.params, field_preference)
1675         if self._downloader.params.get('verbose', False):
1676             format_sort.print_verbose_info(self._downloader.to_screen)
1677         formats.sort(key=lambda f: format_sort.calculate_preference(f))
1678
1679     def _check_formats(self, formats, video_id):
1680         if formats:
1681             formats[:] = filter(
1682                 lambda f: self._is_valid_url(
1683                     f['url'], video_id,
1684                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1685                 formats)
1686
1687     @staticmethod
1688     def _remove_duplicate_formats(formats):
1689         format_urls = set()
1690         unique_formats = []
1691         for f in formats:
1692             if f['url'] not in format_urls:
1693                 format_urls.add(f['url'])
1694                 unique_formats.append(f)
1695         formats[:] = unique_formats
1696
1697     def _is_valid_url(self, url, video_id, item='video', headers={}):
1698         url = self._proto_relative_url(url, scheme='http:')
1699         # For now assume non HTTP(S) URLs always valid
1700         if not (url.startswith('http://') or url.startswith('https://')):
1701             return True
1702         try:
1703             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1704             return True
1705         except ExtractorError as e:
1706             self.to_screen(
1707                 '%s: %s URL is invalid, skipping: %s'
1708                 % (video_id, item, error_to_compat_str(e.cause)))
1709             return False
1710
1711     def http_scheme(self):
1712         """ Either "http:" or "https:", depending on the user's preferences """
1713         return (
1714             'http:'
1715             if self._downloader.params.get('prefer_insecure', False)
1716             else 'https:')
1717
1718     def _proto_relative_url(self, url, scheme=None):
1719         if url is None:
1720             return url
1721         if url.startswith('//'):
1722             if scheme is None:
1723                 scheme = self.http_scheme()
1724             return scheme + url
1725         else:
1726             return url
1727
1728     def _sleep(self, timeout, video_id, msg_template=None):
1729         if msg_template is None:
1730             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1731         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1732         self.to_screen(msg)
1733         time.sleep(timeout)
1734
1735     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1736                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1737                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1738         manifest = self._download_xml(
1739             manifest_url, video_id, 'Downloading f4m manifest',
1740             'Unable to download f4m manifest',
1741             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1742             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1743             transform_source=transform_source,
1744             fatal=fatal, data=data, headers=headers, query=query)
1745
1746         if manifest is False:
1747             return []
1748
1749         return self._parse_f4m_formats(
1750             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1751             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1752
1753     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1754                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1755                            fatal=True, m3u8_id=None):
1756         if not isinstance(manifest, compat_etree_Element) and not fatal:
1757             return []
1758
1759         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1760         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1761         if akamai_pv is not None and ';' in akamai_pv.text:
1762             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1763             if playerVerificationChallenge.strip() != '':
1764                 return []
1765
1766         formats = []
1767         manifest_version = '1.0'
1768         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1769         if not media_nodes:
1770             manifest_version = '2.0'
1771             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1772         # Remove unsupported DRM protected media from final formats
1773         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1774         media_nodes = remove_encrypted_media(media_nodes)
1775         if not media_nodes:
1776             return formats
1777
1778         manifest_base_url = get_base_url(manifest)
1779
1780         bootstrap_info = xpath_element(
1781             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1782             'bootstrap info', default=None)
1783
1784         vcodec = None
1785         mime_type = xpath_text(
1786             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1787             'base URL', default=None)
1788         if mime_type and mime_type.startswith('audio/'):
1789             vcodec = 'none'
1790
1791         for i, media_el in enumerate(media_nodes):
1792             tbr = int_or_none(media_el.attrib.get('bitrate'))
1793             width = int_or_none(media_el.attrib.get('width'))
1794             height = int_or_none(media_el.attrib.get('height'))
1795             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1796             # If <bootstrapInfo> is present, the specified f4m is a
1797             # stream-level manifest, and only set-level manifests may refer to
1798             # external resources.  See section 11.4 and section 4 of F4M spec
1799             if bootstrap_info is None:
1800                 media_url = None
1801                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1802                 if manifest_version == '2.0':
1803                     media_url = media_el.attrib.get('href')
1804                 if media_url is None:
1805                     media_url = media_el.attrib.get('url')
1806                 if not media_url:
1807                     continue
1808                 manifest_url = (
1809                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1810                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1811                 # If media_url is itself a f4m manifest do the recursive extraction
1812                 # since bitrates in parent manifest (this one) and media_url manifest
1813                 # may differ leading to inability to resolve the format by requested
1814                 # bitrate in f4m downloader
1815                 ext = determine_ext(manifest_url)
1816                 if ext == 'f4m':
1817                     f4m_formats = self._extract_f4m_formats(
1818                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1819                         transform_source=transform_source, fatal=fatal)
1820                     # Sometimes stream-level manifest contains single media entry that
1821                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1822                     # At the same time parent's media entry in set-level manifest may
1823                     # contain it. We will copy it from parent in such cases.
1824                     if len(f4m_formats) == 1:
1825                         f = f4m_formats[0]
1826                         f.update({
1827                             'tbr': f.get('tbr') or tbr,
1828                             'width': f.get('width') or width,
1829                             'height': f.get('height') or height,
1830                             'format_id': f.get('format_id') if not tbr else format_id,
1831                             'vcodec': vcodec,
1832                         })
1833                     formats.extend(f4m_formats)
1834                     continue
1835                 elif ext == 'm3u8':
1836                     formats.extend(self._extract_m3u8_formats(
1837                         manifest_url, video_id, 'mp4', preference=preference,
1838                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1839                     continue
1840             formats.append({
1841                 'format_id': format_id,
1842                 'url': manifest_url,
1843                 'manifest_url': manifest_url,
1844                 'ext': 'flv' if bootstrap_info is not None else None,
1845                 'protocol': 'f4m',
1846                 'tbr': tbr,
1847                 'width': width,
1848                 'height': height,
1849                 'vcodec': vcodec,
1850                 'preference': preference,
1851                 'quality': quality,
1852             })
1853         return formats
1854
1855     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1856         return {
1857             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1858             'url': m3u8_url,
1859             'ext': ext,
1860             'protocol': 'm3u8',
1861             'preference': preference - 100 if preference else -100,
1862             'quality': quality,
1863             'resolution': 'multiple',
1864             'format_note': 'Quality selection URL',
1865         }
1866
1867     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1868                               entry_protocol='m3u8', preference=None, quality=None,
1869                               m3u8_id=None, note=None, errnote=None,
1870                               fatal=True, live=False, data=None, headers={},
1871                               query={}):
1872         res = self._download_webpage_handle(
1873             m3u8_url, video_id,
1874             note=note or 'Downloading m3u8 information',
1875             errnote=errnote or 'Failed to download m3u8 information',
1876             fatal=fatal, data=data, headers=headers, query=query)
1877
1878         if res is False:
1879             return []
1880
1881         m3u8_doc, urlh = res
1882         m3u8_url = urlh.geturl()
1883
1884         return self._parse_m3u8_formats(
1885             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1886             preference=preference, quality=quality, m3u8_id=m3u8_id,
1887             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
1888             headers=headers, query=query, video_id=video_id)
1889
1890     def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
1891                             entry_protocol='m3u8', preference=None, quality=None,
1892                             m3u8_id=None, live=False, note=None, errnote=None,
1893                             fatal=True, data=None, headers={}, query={}, video_id=None):
1894         if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
1895             return []
1896
1897         if (not self._downloader.params.get('allow_unplayable_formats')
1898                 and re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc)):  # Apple FairPlay
1899             return []
1900
1901         formats = []
1902
1903         format_url = lambda u: (
1904             u
1905             if re.match(r'^https?://', u)
1906             else compat_urlparse.urljoin(m3u8_url, u))
1907
1908         split_discontinuity = self._downloader.params.get('hls_split_discontinuity', False)
1909
1910         # References:
1911         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
1912         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
1913         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
1914
1915         # We should try extracting formats only from master playlists [1, 4.3.4],
1916         # i.e. playlists that describe available qualities. On the other hand
1917         # media playlists [1, 4.3.3] should be returned as is since they contain
1918         # just the media without qualities renditions.
1919         # Fortunately, master playlist can be easily distinguished from media
1920         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
1921         # master playlist tags MUST NOT appear in a media playlist and vice versa.
1922         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
1923         # media playlist and MUST NOT appear in master playlist thus we can
1924         # clearly detect media playlist with this criterion.
1925
1926         def _extract_m3u8_playlist_formats(format_url=None, m3u8_doc=None, video_id=None,
1927                                            fatal=True, data=None, headers={}):
1928             if not m3u8_doc:
1929                 if not format_url:
1930                     return []
1931                 res = self._download_webpage_handle(
1932                     format_url, video_id,
1933                     note=False,
1934                     errnote='Failed to download m3u8 playlist information',
1935                     fatal=fatal, data=data, headers=headers)
1936
1937                 if res is False:
1938                     return []
1939
1940                 m3u8_doc, urlh = res
1941                 format_url = urlh.geturl()
1942
1943             playlist_formats = []
1944             i = (
1945                 0
1946                 if split_discontinuity
1947                 else None)
1948             format_info = {
1949                 'index': i,
1950                 'key_data': None,
1951                 'files': [],
1952             }
1953             for line in m3u8_doc.splitlines():
1954                 if not line.startswith('#'):
1955                     format_info['files'].append(line)
1956                 elif split_discontinuity and line.startswith('#EXT-X-DISCONTINUITY'):
1957                     i += 1
1958                     playlist_formats.append(format_info)
1959                     format_info = {
1960                         'index': i,
1961                         'url': format_url,
1962                         'files': [],
1963                     }
1964             playlist_formats.append(format_info)
1965             return playlist_formats
1966
1967         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
1968
1969             playlist_formats = _extract_m3u8_playlist_formats(m3u8_doc=m3u8_doc)
1970
1971             for format in playlist_formats:
1972                 format_id = []
1973                 if m3u8_id:
1974                     format_id.append(m3u8_id)
1975                 format_index = format.get('index')
1976                 if format_index:
1977                     format_id.append(str(format_index))
1978                 f = {
1979                     'format_id': '-'.join(format_id),
1980                     'format_index': format_index,
1981                     'url': m3u8_url,
1982                     'ext': ext,
1983                     'protocol': entry_protocol,
1984                     'preference': preference,
1985                     'quality': quality,
1986                 }
1987                 formats.append(f)
1988
1989             return formats
1990
1991         groups = {}
1992         last_stream_inf = {}
1993
1994         def extract_media(x_media_line):
1995             media = parse_m3u8_attributes(x_media_line)
1996             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
1997             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
1998             if not (media_type and group_id and name):
1999                 return
2000             groups.setdefault(group_id, []).append(media)
2001             if media_type not in ('VIDEO', 'AUDIO'):
2002                 return
2003             media_url = media.get('URI')
2004             if media_url:
2005                 manifest_url = format_url(media_url)
2006                 format_id = []
2007                 playlist_formats = _extract_m3u8_playlist_formats(manifest_url, video_id=video_id,
2008                                                                   fatal=fatal, data=data, headers=headers)
2009
2010                 for format in playlist_formats:
2011                     format_index = format.get('index')
2012                     for v in (m3u8_id, group_id, name):
2013                         if v:
2014                             format_id.append(v)
2015                     if format_index:
2016                         format_id.append(str(format_index))
2017                     f = {
2018                         'format_id': '-'.join(format_id),
2019                         'format_index': format_index,
2020                         'url': manifest_url,
2021                         'manifest_url': m3u8_url,
2022                         'language': media.get('LANGUAGE'),
2023                         'ext': ext,
2024                         'protocol': entry_protocol,
2025                         'preference': preference,
2026                         'quality': quality,
2027                     }
2028                     if media_type == 'AUDIO':
2029                         f['vcodec'] = 'none'
2030                     formats.append(f)
2031
2032         def build_stream_name():
2033             # Despite specification does not mention NAME attribute for
2034             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2035             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2036             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2037             stream_name = last_stream_inf.get('NAME')
2038             if stream_name:
2039                 return stream_name
2040             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2041             # from corresponding rendition group
2042             stream_group_id = last_stream_inf.get('VIDEO')
2043             if not stream_group_id:
2044                 return
2045             stream_group = groups.get(stream_group_id)
2046             if not stream_group:
2047                 return stream_group_id
2048             rendition = stream_group[0]
2049             return rendition.get('NAME') or stream_group_id
2050
2051         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2052         # chance to detect video only formats when EXT-X-STREAM-INF tags
2053         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2054         for line in m3u8_doc.splitlines():
2055             if line.startswith('#EXT-X-MEDIA:'):
2056                 extract_media(line)
2057
2058         for line in m3u8_doc.splitlines():
2059             if line.startswith('#EXT-X-STREAM-INF:'):
2060                 last_stream_inf = parse_m3u8_attributes(line)
2061             elif line.startswith('#') or not line.strip():
2062                 continue
2063             else:
2064                 tbr = float_or_none(
2065                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2066                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2067                 manifest_url = format_url(line.strip())
2068
2069                 playlist_formats = _extract_m3u8_playlist_formats(manifest_url, video_id=video_id,
2070                                                                   fatal=fatal, data=data, headers=headers)
2071
2072                 for frmt in playlist_formats:
2073                     format_id = []
2074                     if m3u8_id:
2075                         format_id.append(m3u8_id)
2076                     format_index = frmt.get('index')
2077                     stream_name = build_stream_name()
2078                     # Bandwidth of live streams may differ over time thus making
2079                     # format_id unpredictable. So it's better to keep provided
2080                     # format_id intact.
2081                     if not live:
2082                         format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
2083                     if format_index:
2084                         format_id.append(str(format_index))
2085                     f = {
2086                         'format_id': '-'.join(format_id),
2087                         'format_index': format_index,
2088                         'url': manifest_url,
2089                         'manifest_url': m3u8_url,
2090                         'tbr': tbr,
2091                         'ext': ext,
2092                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2093                         'protocol': entry_protocol,
2094                         'preference': preference,
2095                         'quality': quality,
2096                     }
2097                     resolution = last_stream_inf.get('RESOLUTION')
2098                     if resolution:
2099                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2100                         if mobj:
2101                             f['width'] = int(mobj.group('width'))
2102                             f['height'] = int(mobj.group('height'))
2103                     # Unified Streaming Platform
2104                     mobj = re.search(
2105                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2106                     if mobj:
2107                         abr, vbr = mobj.groups()
2108                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2109                         f.update({
2110                             'vbr': vbr,
2111                             'abr': abr,
2112                         })
2113                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2114                     f.update(codecs)
2115                     audio_group_id = last_stream_inf.get('AUDIO')
2116                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2117                     # references a rendition group MUST have a CODECS attribute.
2118                     # However, this is not always respected, for example, [2]
2119                     # contains EXT-X-STREAM-INF tag which references AUDIO
2120                     # rendition group but does not have CODECS and despite
2121                     # referencing an audio group it represents a complete
2122                     # (with audio and video) format. So, for such cases we will
2123                     # ignore references to rendition groups and treat them
2124                     # as complete formats.
2125                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2126                         audio_group = groups.get(audio_group_id)
2127                         if audio_group and audio_group[0].get('URI'):
2128                             # TODO: update acodec for audio only formats with
2129                             # the same GROUP-ID
2130                             f['acodec'] = 'none'
2131                     if not f.get('ext'):
2132                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2133                     formats.append(f)
2134
2135                     # for DailyMotion
2136                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2137                     if progressive_uri:
2138                         http_f = f.copy()
2139                         del http_f['manifest_url']
2140                         http_f.update({
2141                             'format_id': f['format_id'].replace('hls-', 'http-'),
2142                             'protocol': 'http',
2143                             'url': progressive_uri,
2144                         })
2145                         formats.append(http_f)
2146
2147                 last_stream_inf = {}
2148         return formats
2149
2150     @staticmethod
2151     def _xpath_ns(path, namespace=None):
2152         if not namespace:
2153             return path
2154         out = []
2155         for c in path.split('/'):
2156             if not c or c == '.':
2157                 out.append(c)
2158             else:
2159                 out.append('{%s}%s' % (namespace, c))
2160         return '/'.join(out)
2161
2162     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2163         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2164
2165         if smil is False:
2166             assert not fatal
2167             return []
2168
2169         namespace = self._parse_smil_namespace(smil)
2170
2171         return self._parse_smil_formats(
2172             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2173
2174     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2175         smil = self._download_smil(smil_url, video_id, fatal=fatal)
2176         if smil is False:
2177             return {}
2178         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2179
2180     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2181         return self._download_xml(
2182             smil_url, video_id, 'Downloading SMIL file',
2183             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2184
2185     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2186         namespace = self._parse_smil_namespace(smil)
2187
2188         formats = self._parse_smil_formats(
2189             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2190         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2191
2192         video_id = os.path.splitext(url_basename(smil_url))[0]
2193         title = None
2194         description = None
2195         upload_date = None
2196         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2197             name = meta.attrib.get('name')
2198             content = meta.attrib.get('content')
2199             if not name or not content:
2200                 continue
2201             if not title and name == 'title':
2202                 title = content
2203             elif not description and name in ('description', 'abstract'):
2204                 description = content
2205             elif not upload_date and name == 'date':
2206                 upload_date = unified_strdate(content)
2207
2208         thumbnails = [{
2209             'id': image.get('type'),
2210             'url': image.get('src'),
2211             'width': int_or_none(image.get('width')),
2212             'height': int_or_none(image.get('height')),
2213         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2214
2215         return {
2216             'id': video_id,
2217             'title': title or video_id,
2218             'description': description,
2219             'upload_date': upload_date,
2220             'thumbnails': thumbnails,
2221             'formats': formats,
2222             'subtitles': subtitles,
2223         }
2224
2225     def _parse_smil_namespace(self, smil):
2226         return self._search_regex(
2227             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2228
2229     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2230         base = smil_url
2231         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2232             b = meta.get('base') or meta.get('httpBase')
2233             if b:
2234                 base = b
2235                 break
2236
2237         formats = []
2238         rtmp_count = 0
2239         http_count = 0
2240         m3u8_count = 0
2241
2242         srcs = []
2243         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2244         for medium in media:
2245             src = medium.get('src')
2246             if not src or src in srcs:
2247                 continue
2248             srcs.append(src)
2249
2250             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2251             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2252             width = int_or_none(medium.get('width'))
2253             height = int_or_none(medium.get('height'))
2254             proto = medium.get('proto')
2255             ext = medium.get('ext')
2256             src_ext = determine_ext(src)
2257             streamer = medium.get('streamer') or base
2258
2259             if proto == 'rtmp' or streamer.startswith('rtmp'):
2260                 rtmp_count += 1
2261                 formats.append({
2262                     'url': streamer,
2263                     'play_path': src,
2264                     'ext': 'flv',
2265                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2266                     'tbr': bitrate,
2267                     'filesize': filesize,
2268                     'width': width,
2269                     'height': height,
2270                 })
2271                 if transform_rtmp_url:
2272                     streamer, src = transform_rtmp_url(streamer, src)
2273                     formats[-1].update({
2274                         'url': streamer,
2275                         'play_path': src,
2276                     })
2277                 continue
2278
2279             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2280             src_url = src_url.strip()
2281
2282             if proto == 'm3u8' or src_ext == 'm3u8':
2283                 m3u8_formats = self._extract_m3u8_formats(
2284                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2285                 if len(m3u8_formats) == 1:
2286                     m3u8_count += 1
2287                     m3u8_formats[0].update({
2288                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2289                         'tbr': bitrate,
2290                         'width': width,
2291                         'height': height,
2292                     })
2293                 formats.extend(m3u8_formats)
2294             elif src_ext == 'f4m':
2295                 f4m_url = src_url
2296                 if not f4m_params:
2297                     f4m_params = {
2298                         'hdcore': '3.2.0',
2299                         'plugin': 'flowplayer-3.2.0.1',
2300                     }
2301                 f4m_url += '&' if '?' in f4m_url else '?'
2302                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2303                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2304             elif src_ext == 'mpd':
2305                 formats.extend(self._extract_mpd_formats(
2306                     src_url, video_id, mpd_id='dash', fatal=False))
2307             elif re.search(r'\.ism/[Mm]anifest', src_url):
2308                 formats.extend(self._extract_ism_formats(
2309                     src_url, video_id, ism_id='mss', fatal=False))
2310             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2311                 http_count += 1
2312                 formats.append({
2313                     'url': src_url,
2314                     'ext': ext or src_ext or 'flv',
2315                     'format_id': 'http-%d' % (bitrate or http_count),
2316                     'tbr': bitrate,
2317                     'filesize': filesize,
2318                     'width': width,
2319                     'height': height,
2320                 })
2321
2322         return formats
2323
2324     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2325         urls = []
2326         subtitles = {}
2327         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2328             src = textstream.get('src')
2329             if not src or src in urls:
2330                 continue
2331             urls.append(src)
2332             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2333             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2334             subtitles.setdefault(lang, []).append({
2335                 'url': src,
2336                 'ext': ext,
2337             })
2338         return subtitles
2339
2340     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2341         xspf = self._download_xml(
2342             xspf_url, playlist_id, 'Downloading xpsf playlist',
2343             'Unable to download xspf manifest', fatal=fatal)
2344         if xspf is False:
2345             return []
2346         return self._parse_xspf(
2347             xspf, playlist_id, xspf_url=xspf_url,
2348             xspf_base_url=base_url(xspf_url))
2349
2350     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2351         NS_MAP = {
2352             'xspf': 'http://xspf.org/ns/0/',
2353             's1': 'http://static.streamone.nl/player/ns/0',
2354         }
2355
2356         entries = []
2357         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2358             title = xpath_text(
2359                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2360             description = xpath_text(
2361                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2362             thumbnail = xpath_text(
2363                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2364             duration = float_or_none(
2365                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2366
2367             formats = []
2368             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2369                 format_url = urljoin(xspf_base_url, location.text)
2370                 if not format_url:
2371                     continue
2372                 formats.append({
2373                     'url': format_url,
2374                     'manifest_url': xspf_url,
2375                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2376                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2377                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2378                 })
2379             self._sort_formats(formats)
2380
2381             entries.append({
2382                 'id': playlist_id,
2383                 'title': title,
2384                 'description': description,
2385                 'thumbnail': thumbnail,
2386                 'duration': duration,
2387                 'formats': formats,
2388             })
2389         return entries
2390
2391     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2392         res = self._download_xml_handle(
2393             mpd_url, video_id,
2394             note=note or 'Downloading MPD manifest',
2395             errnote=errnote or 'Failed to download MPD manifest',
2396             fatal=fatal, data=data, headers=headers, query=query)
2397         if res is False:
2398             return []
2399         mpd_doc, urlh = res
2400         if mpd_doc is None:
2401             return []
2402         mpd_base_url = base_url(urlh.geturl())
2403
2404         return self._parse_mpd_formats(
2405             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2406
2407     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2408         """
2409         Parse formats from MPD manifest.
2410         References:
2411          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2412             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2413          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2414         """
2415         if not self._downloader.params.get('dynamic_mpd', True):
2416             if mpd_doc.get('type') == 'dynamic':
2417                 return []
2418
2419         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2420
2421         def _add_ns(path):
2422             return self._xpath_ns(path, namespace)
2423
2424         def is_drm_protected(element):
2425             return element.find(_add_ns('ContentProtection')) is not None
2426
2427         def extract_multisegment_info(element, ms_parent_info):
2428             ms_info = ms_parent_info.copy()
2429
2430             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2431             # common attributes and elements.  We will only extract relevant
2432             # for us.
2433             def extract_common(source):
2434                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2435                 if segment_timeline is not None:
2436                     s_e = segment_timeline.findall(_add_ns('S'))
2437                     if s_e:
2438                         ms_info['total_number'] = 0
2439                         ms_info['s'] = []
2440                         for s in s_e:
2441                             r = int(s.get('r', 0))
2442                             ms_info['total_number'] += 1 + r
2443                             ms_info['s'].append({
2444                                 't': int(s.get('t', 0)),
2445                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2446                                 'd': int(s.attrib['d']),
2447                                 'r': r,
2448                             })
2449                 start_number = source.get('startNumber')
2450                 if start_number:
2451                     ms_info['start_number'] = int(start_number)
2452                 timescale = source.get('timescale')
2453                 if timescale:
2454                     ms_info['timescale'] = int(timescale)
2455                 segment_duration = source.get('duration')
2456                 if segment_duration:
2457                     ms_info['segment_duration'] = float(segment_duration)
2458
2459             def extract_Initialization(source):
2460                 initialization = source.find(_add_ns('Initialization'))
2461                 if initialization is not None:
2462                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2463
2464             segment_list = element.find(_add_ns('SegmentList'))
2465             if segment_list is not None:
2466                 extract_common(segment_list)
2467                 extract_Initialization(segment_list)
2468                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2469                 if segment_urls_e:
2470                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2471             else:
2472                 segment_template = element.find(_add_ns('SegmentTemplate'))
2473                 if segment_template is not None:
2474                     extract_common(segment_template)
2475                     media = segment_template.get('media')
2476                     if media:
2477                         ms_info['media'] = media
2478                     initialization = segment_template.get('initialization')
2479                     if initialization:
2480                         ms_info['initialization'] = initialization
2481                     else:
2482                         extract_Initialization(segment_template)
2483             return ms_info
2484
2485         skip_unplayable = not self._downloader.params.get('allow_unplayable_formats')
2486
2487         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2488         formats = []
2489         for period in mpd_doc.findall(_add_ns('Period')):
2490             period_duration = parse_duration(period.get('duration')) or mpd_duration
2491             period_ms_info = extract_multisegment_info(period, {
2492                 'start_number': 1,
2493                 'timescale': 1,
2494             })
2495             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2496                 if skip_unplayable and is_drm_protected(adaptation_set):
2497                     continue
2498                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2499                 for representation in adaptation_set.findall(_add_ns('Representation')):
2500                     if skip_unplayable and is_drm_protected(representation):
2501                         continue
2502                     representation_attrib = adaptation_set.attrib.copy()
2503                     representation_attrib.update(representation.attrib)
2504                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2505                     mime_type = representation_attrib['mimeType']
2506                     content_type = mime_type.split('/')[0]
2507                     if content_type == 'text':
2508                         # TODO implement WebVTT downloading
2509                         pass
2510                     elif content_type in ('video', 'audio'):
2511                         base_url = ''
2512                         for element in (representation, adaptation_set, period, mpd_doc):
2513                             base_url_e = element.find(_add_ns('BaseURL'))
2514                             if base_url_e is not None:
2515                                 base_url = base_url_e.text + base_url
2516                                 if re.match(r'^https?://', base_url):
2517                                     break
2518                         if mpd_base_url and not re.match(r'^https?://', base_url):
2519                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
2520                                 mpd_base_url += '/'
2521                             base_url = mpd_base_url + base_url
2522                         representation_id = representation_attrib.get('id')
2523                         lang = representation_attrib.get('lang')
2524                         url_el = representation.find(_add_ns('BaseURL'))
2525                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2526                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2527                         f = {
2528                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
2529                             'manifest_url': mpd_url,
2530                             'ext': mimetype2ext(mime_type),
2531                             'width': int_or_none(representation_attrib.get('width')),
2532                             'height': int_or_none(representation_attrib.get('height')),
2533                             'tbr': float_or_none(bandwidth, 1000),
2534                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2535                             'fps': int_or_none(representation_attrib.get('frameRate')),
2536                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2537                             'format_note': 'DASH %s' % content_type,
2538                             'filesize': filesize,
2539                             'container': mimetype2ext(mime_type) + '_dash',
2540                         }
2541                         f.update(parse_codecs(representation_attrib.get('codecs')))
2542                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2543
2544                         def prepare_template(template_name, identifiers):
2545                             tmpl = representation_ms_info[template_name]
2546                             # First of, % characters outside $...$ templates
2547                             # must be escaped by doubling for proper processing
2548                             # by % operator string formatting used further (see
2549                             # https://github.com/ytdl-org/youtube-dl/issues/16867).
2550                             t = ''
2551                             in_template = False
2552                             for c in tmpl:
2553                                 t += c
2554                                 if c == '$':
2555                                     in_template = not in_template
2556                                 elif c == '%' and not in_template:
2557                                     t += c
2558                             # Next, $...$ templates are translated to their
2559                             # %(...) counterparts to be used with % operator
2560                             t = t.replace('$RepresentationID$', representation_id)
2561                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2562                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2563                             t.replace('$$', '$')
2564                             return t
2565
2566                         # @initialization is a regular template like @media one
2567                         # so it should be handled just the same way (see
2568                         # https://github.com/ytdl-org/youtube-dl/issues/11605)
2569                         if 'initialization' in representation_ms_info:
2570                             initialization_template = prepare_template(
2571                                 'initialization',
2572                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2573                                 # $Time$ shall not be included for @initialization thus
2574                                 # only $Bandwidth$ remains
2575                                 ('Bandwidth', ))
2576                             representation_ms_info['initialization_url'] = initialization_template % {
2577                                 'Bandwidth': bandwidth,
2578                             }
2579
2580                         def location_key(location):
2581                             return 'url' if re.match(r'^https?://', location) else 'path'
2582
2583                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2584
2585                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2586                             media_location_key = location_key(media_template)
2587
2588                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2589                             # can't be used at the same time
2590                             if '%(Number' in media_template and 's' not in representation_ms_info:
2591                                 segment_duration = None
2592                                 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2593                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2594                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2595                                 representation_ms_info['fragments'] = [{
2596                                     media_location_key: media_template % {
2597                                         'Number': segment_number,
2598                                         'Bandwidth': bandwidth,
2599                                     },
2600                                     'duration': segment_duration,
2601                                 } for segment_number in range(
2602                                     representation_ms_info['start_number'],
2603                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2604                             else:
2605                                 # $Number*$ or $Time$ in media template with S list available
2606                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2607                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2608                                 representation_ms_info['fragments'] = []
2609                                 segment_time = 0
2610                                 segment_d = None
2611                                 segment_number = representation_ms_info['start_number']
2612
2613                                 def add_segment_url():
2614                                     segment_url = media_template % {
2615                                         'Time': segment_time,
2616                                         'Bandwidth': bandwidth,
2617                                         'Number': segment_number,
2618                                     }
2619                                     representation_ms_info['fragments'].append({
2620                                         media_location_key: segment_url,
2621                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2622                                     })
2623
2624                                 for num, s in enumerate(representation_ms_info['s']):
2625                                     segment_time = s.get('t') or segment_time
2626                                     segment_d = s['d']
2627                                     add_segment_url()
2628                                     segment_number += 1
2629                                     for r in range(s.get('r', 0)):
2630                                         segment_time += segment_d
2631                                         add_segment_url()
2632                                         segment_number += 1
2633                                     segment_time += segment_d
2634                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2635                             # No media template
2636                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2637                             # or any YouTube dashsegments video
2638                             fragments = []
2639                             segment_index = 0
2640                             timescale = representation_ms_info['timescale']
2641                             for s in representation_ms_info['s']:
2642                                 duration = float_or_none(s['d'], timescale)
2643                                 for r in range(s.get('r', 0) + 1):
2644                                     segment_uri = representation_ms_info['segment_urls'][segment_index]
2645                                     fragments.append({
2646                                         location_key(segment_uri): segment_uri,
2647                                         'duration': duration,
2648                                     })
2649                                     segment_index += 1
2650                             representation_ms_info['fragments'] = fragments
2651                         elif 'segment_urls' in representation_ms_info:
2652                             # Segment URLs with no SegmentTimeline
2653                             # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2654                             # https://github.com/ytdl-org/youtube-dl/pull/14844
2655                             fragments = []
2656                             segment_duration = float_or_none(
2657                                 representation_ms_info['segment_duration'],
2658                                 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2659                             for segment_url in representation_ms_info['segment_urls']:
2660                                 fragment = {
2661                                     location_key(segment_url): segment_url,
2662                                 }
2663                                 if segment_duration:
2664                                     fragment['duration'] = segment_duration
2665                                 fragments.append(fragment)
2666                             representation_ms_info['fragments'] = fragments
2667                         # If there is a fragments key available then we correctly recognized fragmented media.
2668                         # Otherwise we will assume unfragmented media with direct access. Technically, such
2669                         # assumption is not necessarily correct since we may simply have no support for
2670                         # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2671                         if 'fragments' in representation_ms_info:
2672                             f.update({
2673                                 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2674                                 'url': mpd_url or base_url,
2675                                 'fragment_base_url': base_url,
2676                                 'fragments': [],
2677                                 'protocol': 'http_dash_segments',
2678                             })
2679                             if 'initialization_url' in representation_ms_info:
2680                                 initialization_url = representation_ms_info['initialization_url']
2681                                 if not f.get('url'):
2682                                     f['url'] = initialization_url
2683                                 f['fragments'].append({location_key(initialization_url): initialization_url})
2684                             f['fragments'].extend(representation_ms_info['fragments'])
2685                         else:
2686                             # Assuming direct URL to unfragmented media.
2687                             f['url'] = base_url
2688                         formats.append(f)
2689                     else:
2690                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2691         return formats
2692
2693     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2694         res = self._download_xml_handle(
2695             ism_url, video_id,
2696             note=note or 'Downloading ISM manifest',
2697             errnote=errnote or 'Failed to download ISM manifest',
2698             fatal=fatal, data=data, headers=headers, query=query)
2699         if res is False:
2700             return []
2701         ism_doc, urlh = res
2702         if ism_doc is None:
2703             return []
2704
2705         return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
2706
2707     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
2708         """
2709         Parse formats from ISM manifest.
2710         References:
2711          1. [MS-SSTR]: Smooth Streaming Protocol,
2712             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2713         """
2714         if ism_doc.get('IsLive') == 'TRUE':
2715             return []
2716         if (not self._downloader.params.get('allow_unplayable_formats')
2717                 and ism_doc.find('Protection') is not None):
2718             return []
2719
2720         duration = int(ism_doc.attrib['Duration'])
2721         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2722
2723         formats = []
2724         for stream in ism_doc.findall('StreamIndex'):
2725             stream_type = stream.get('Type')
2726             if stream_type not in ('video', 'audio'):
2727                 continue
2728             url_pattern = stream.attrib['Url']
2729             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2730             stream_name = stream.get('Name')
2731             for track in stream.findall('QualityLevel'):
2732                 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
2733                 # TODO: add support for WVC1 and WMAP
2734                 if fourcc not in ('H264', 'AVC1', 'AACL'):
2735                     self.report_warning('%s is not a supported codec' % fourcc)
2736                     continue
2737                 tbr = int(track.attrib['Bitrate']) // 1000
2738                 # [1] does not mention Width and Height attributes. However,
2739                 # they're often present while MaxWidth and MaxHeight are
2740                 # missing, so should be used as fallbacks
2741                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2742                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2743                 sampling_rate = int_or_none(track.get('SamplingRate'))
2744
2745                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2746                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2747
2748                 fragments = []
2749                 fragment_ctx = {
2750                     'time': 0,
2751                 }
2752                 stream_fragments = stream.findall('c')
2753                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2754                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2755                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2756                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2757                     if not fragment_ctx['duration']:
2758                         try:
2759                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2760                         except IndexError:
2761                             next_fragment_time = duration
2762                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2763                     for _ in range(fragment_repeat):
2764                         fragments.append({
2765                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2766                             'duration': fragment_ctx['duration'] / stream_timescale,
2767                         })
2768                         fragment_ctx['time'] += fragment_ctx['duration']
2769
2770                 format_id = []
2771                 if ism_id:
2772                     format_id.append(ism_id)
2773                 if stream_name:
2774                     format_id.append(stream_name)
2775                 format_id.append(compat_str(tbr))
2776
2777                 formats.append({
2778                     'format_id': '-'.join(format_id),
2779                     'url': ism_url,
2780                     'manifest_url': ism_url,
2781                     'ext': 'ismv' if stream_type == 'video' else 'isma',
2782                     'width': width,
2783                     'height': height,
2784                     'tbr': tbr,
2785                     'asr': sampling_rate,
2786                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
2787                     'acodec': 'none' if stream_type == 'video' else fourcc,
2788                     'protocol': 'ism',
2789                     'fragments': fragments,
2790                     '_download_params': {
2791                         'duration': duration,
2792                         'timescale': stream_timescale,
2793                         'width': width or 0,
2794                         'height': height or 0,
2795                         'fourcc': fourcc,
2796                         'codec_private_data': track.get('CodecPrivateData'),
2797                         'sampling_rate': sampling_rate,
2798                         'channels': int_or_none(track.get('Channels', 2)),
2799                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2800                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2801                     },
2802                 })
2803         return formats
2804
2805     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
2806         def absolute_url(item_url):
2807             return urljoin(base_url, item_url)
2808
2809         def parse_content_type(content_type):
2810             if not content_type:
2811                 return {}
2812             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2813             if ctr:
2814                 mimetype, codecs = ctr.groups()
2815                 f = parse_codecs(codecs)
2816                 f['ext'] = mimetype2ext(mimetype)
2817                 return f
2818             return {}
2819
2820         def _media_formats(src, cur_media_type, type_info={}):
2821             full_url = absolute_url(src)
2822             ext = type_info.get('ext') or determine_ext(full_url)
2823             if ext == 'm3u8':
2824                 is_plain_url = False
2825                 formats = self._extract_m3u8_formats(
2826                     full_url, video_id, ext='mp4',
2827                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2828                     preference=preference, quality=quality, fatal=False)
2829             elif ext == 'mpd':
2830                 is_plain_url = False
2831                 formats = self._extract_mpd_formats(
2832                     full_url, video_id, mpd_id=mpd_id, fatal=False)
2833             else:
2834                 is_plain_url = True
2835                 formats = [{
2836                     'url': full_url,
2837                     'vcodec': 'none' if cur_media_type == 'audio' else None,
2838                 }]
2839             return is_plain_url, formats
2840
2841         entries = []
2842         # amp-video and amp-audio are very similar to their HTML5 counterparts
2843         # so we wll include them right here (see
2844         # https://www.ampproject.org/docs/reference/components/amp-video)
2845         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
2846         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
2847         media_tags = [(media_tag, media_tag_name, media_type, '')
2848                       for media_tag, media_tag_name, media_type
2849                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
2850         media_tags.extend(re.findall(
2851             # We only allow video|audio followed by a whitespace or '>'.
2852             # Allowing more characters may end up in significant slow down (see
2853             # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
2854             # http://www.porntrex.com/maps/videositemap.xml).
2855             r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
2856         for media_tag, _, media_type, media_content in media_tags:
2857             media_info = {
2858                 'formats': [],
2859                 'subtitles': {},
2860             }
2861             media_attributes = extract_attributes(media_tag)
2862             src = strip_or_none(media_attributes.get('src'))
2863             if src:
2864                 _, formats = _media_formats(src, media_type)
2865                 media_info['formats'].extend(formats)
2866             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
2867             if media_content:
2868                 for source_tag in re.findall(r'<source[^>]+>', media_content):
2869                     s_attr = extract_attributes(source_tag)
2870                     # data-video-src and data-src are non standard but seen
2871                     # several times in the wild
2872                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
2873                     if not src:
2874                         continue
2875                     f = parse_content_type(s_attr.get('type'))
2876                     is_plain_url, formats = _media_formats(src, media_type, f)
2877                     if is_plain_url:
2878                         # width, height, res, label and title attributes are
2879                         # all not standard but seen several times in the wild
2880                         labels = [
2881                             s_attr.get(lbl)
2882                             for lbl in ('label', 'title')
2883                             if str_or_none(s_attr.get(lbl))
2884                         ]
2885                         width = int_or_none(s_attr.get('width'))
2886                         height = (int_or_none(s_attr.get('height'))
2887                                   or int_or_none(s_attr.get('res')))
2888                         if not width or not height:
2889                             for lbl in labels:
2890                                 resolution = parse_resolution(lbl)
2891                                 if not resolution:
2892                                     continue
2893                                 width = width or resolution.get('width')
2894                                 height = height or resolution.get('height')
2895                         for lbl in labels:
2896                             tbr = parse_bitrate(lbl)
2897                             if tbr:
2898                                 break
2899                         else:
2900                             tbr = None
2901                         f.update({
2902                             'width': width,
2903                             'height': height,
2904                             'tbr': tbr,
2905                             'format_id': s_attr.get('label') or s_attr.get('title'),
2906                         })
2907                         f.update(formats[0])
2908                         media_info['formats'].append(f)
2909                     else:
2910                         media_info['formats'].extend(formats)
2911                 for track_tag in re.findall(r'<track[^>]+>', media_content):
2912                     track_attributes = extract_attributes(track_tag)
2913                     kind = track_attributes.get('kind')
2914                     if not kind or kind in ('subtitles', 'captions'):
2915                         src = strip_or_none(track_attributes.get('src'))
2916                         if not src:
2917                             continue
2918                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
2919                         media_info['subtitles'].setdefault(lang, []).append({
2920                             'url': absolute_url(src),
2921                         })
2922             for f in media_info['formats']:
2923                 f.setdefault('http_headers', {})['Referer'] = base_url
2924             if media_info['formats'] or media_info['subtitles']:
2925                 entries.append(media_info)
2926         return entries
2927
2928     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2929         signed = 'hdnea=' in manifest_url
2930         if not signed:
2931             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
2932             manifest_url = re.sub(
2933                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
2934                 '', manifest_url).strip('?')
2935
2936         formats = []
2937
2938         hdcore_sign = 'hdcore=3.7.0'
2939         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2940         hds_host = hosts.get('hds')
2941         if hds_host:
2942             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2943         if 'hdcore=' not in f4m_url:
2944             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2945         f4m_formats = self._extract_f4m_formats(
2946             f4m_url, video_id, f4m_id='hds', fatal=False)
2947         for entry in f4m_formats:
2948             entry.update({'extra_param_to_segment_url': hdcore_sign})
2949         formats.extend(f4m_formats)
2950
2951         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2952         hls_host = hosts.get('hls')
2953         if hls_host:
2954             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2955         m3u8_formats = self._extract_m3u8_formats(
2956             m3u8_url, video_id, 'mp4', 'm3u8_native',
2957             m3u8_id='hls', fatal=False)
2958         formats.extend(m3u8_formats)
2959
2960         http_host = hosts.get('http')
2961         if http_host and m3u8_formats and not signed:
2962             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
2963             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
2964             qualities_length = len(qualities)
2965             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
2966                 i = 0
2967                 for f in m3u8_formats:
2968                     if f['vcodec'] != 'none':
2969                         for protocol in ('http', 'https'):
2970                             http_f = f.copy()
2971                             del http_f['manifest_url']
2972                             http_url = re.sub(
2973                                 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
2974                             http_f.update({
2975                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
2976                                 'url': http_url,
2977                                 'protocol': protocol,
2978                             })
2979                             formats.append(http_f)
2980                         i += 1
2981
2982         return formats
2983
2984     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2985         query = compat_urlparse.urlparse(url).query
2986         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2987         mobj = re.search(
2988             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
2989         url_base = mobj.group('url')
2990         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
2991         formats = []
2992
2993         def manifest_url(manifest):
2994             m_url = '%s/%s' % (http_base_url, manifest)
2995             if query:
2996                 m_url += '?%s' % query
2997             return m_url
2998
2999         if 'm3u8' not in skip_protocols:
3000             formats.extend(self._extract_m3u8_formats(
3001                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3002                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3003         if 'f4m' not in skip_protocols:
3004             formats.extend(self._extract_f4m_formats(
3005                 manifest_url('manifest.f4m'),
3006                 video_id, f4m_id='hds', fatal=False))
3007         if 'dash' not in skip_protocols:
3008             formats.extend(self._extract_mpd_formats(
3009                 manifest_url('manifest.mpd'),
3010                 video_id, mpd_id='dash', fatal=False))
3011         if re.search(r'(?:/smil:|\.smil)', url_base):
3012             if 'smil' not in skip_protocols:
3013                 rtmp_formats = self._extract_smil_formats(
3014                     manifest_url('jwplayer.smil'),
3015                     video_id, fatal=False)
3016                 for rtmp_format in rtmp_formats:
3017                     rtsp_format = rtmp_format.copy()
3018                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3019                     del rtsp_format['play_path']
3020                     del rtsp_format['ext']
3021                     rtsp_format.update({
3022                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3023                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3024                         'protocol': 'rtsp',
3025                     })
3026                     formats.extend([rtmp_format, rtsp_format])
3027         else:
3028             for protocol in ('rtmp', 'rtsp'):
3029                 if protocol not in skip_protocols:
3030                     formats.append({
3031                         'url': '%s:%s' % (protocol, url_base),
3032                         'format_id': protocol,
3033                         'protocol': protocol,
3034                     })
3035         return formats
3036
3037     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3038         mobj = re.search(
3039             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3040             webpage)
3041         if mobj:
3042             try:
3043                 jwplayer_data = self._parse_json(mobj.group('options'),
3044                                                  video_id=video_id,
3045                                                  transform_source=transform_source)
3046             except ExtractorError:
3047                 pass
3048             else:
3049                 if isinstance(jwplayer_data, dict):
3050                     return jwplayer_data
3051
3052     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3053         jwplayer_data = self._find_jwplayer_data(
3054             webpage, video_id, transform_source=js_to_json)
3055         return self._parse_jwplayer_data(
3056             jwplayer_data, video_id, *args, **kwargs)
3057
3058     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3059                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3060         # JWPlayer backward compatibility: flattened playlists
3061         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3062         if 'playlist' not in jwplayer_data:
3063             jwplayer_data = {'playlist': [jwplayer_data]}
3064
3065         entries = []
3066
3067         # JWPlayer backward compatibility: single playlist item
3068         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3069         if not isinstance(jwplayer_data['playlist'], list):
3070             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3071
3072         for video_data in jwplayer_data['playlist']:
3073             # JWPlayer backward compatibility: flattened sources
3074             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3075             if 'sources' not in video_data:
3076                 video_data['sources'] = [video_data]
3077
3078             this_video_id = video_id or video_data['mediaid']
3079
3080             formats = self._parse_jwplayer_formats(
3081                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3082                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3083
3084             subtitles = {}
3085             tracks = video_data.get('tracks')
3086             if tracks and isinstance(tracks, list):
3087                 for track in tracks:
3088                     if not isinstance(track, dict):
3089                         continue
3090                     track_kind = track.get('kind')
3091                     if not track_kind or not isinstance(track_kind, compat_str):
3092                         continue
3093                     if track_kind.lower() not in ('captions', 'subtitles'):
3094                         continue
3095                     track_url = urljoin(base_url, track.get('file'))
3096                     if not track_url:
3097                         continue
3098                     subtitles.setdefault(track.get('label') or 'en', []).append({
3099                         'url': self._proto_relative_url(track_url)
3100                     })
3101
3102             entry = {
3103                 'id': this_video_id,
3104                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3105                 'description': clean_html(video_data.get('description')),
3106                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3107                 'timestamp': int_or_none(video_data.get('pubdate')),
3108                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3109                 'subtitles': subtitles,
3110             }
3111             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3112             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3113                 entry.update({
3114                     '_type': 'url_transparent',
3115                     'url': formats[0]['url'],
3116                 })
3117             else:
3118                 self._sort_formats(formats)
3119                 entry['formats'] = formats
3120             entries.append(entry)
3121         if len(entries) == 1:
3122             return entries[0]
3123         else:
3124             return self.playlist_result(entries)
3125
3126     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3127                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3128         urls = []
3129         formats = []
3130         for source in jwplayer_sources_data:
3131             if not isinstance(source, dict):
3132                 continue
3133             source_url = urljoin(
3134                 base_url, self._proto_relative_url(source.get('file')))
3135             if not source_url or source_url in urls:
3136                 continue
3137             urls.append(source_url)
3138             source_type = source.get('type') or ''
3139             ext = mimetype2ext(source_type) or determine_ext(source_url)
3140             if source_type == 'hls' or ext == 'm3u8':
3141                 formats.extend(self._extract_m3u8_formats(
3142                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3143                     m3u8_id=m3u8_id, fatal=False))
3144             elif source_type == 'dash' or ext == 'mpd':
3145                 formats.extend(self._extract_mpd_formats(
3146                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3147             elif ext == 'smil':
3148                 formats.extend(self._extract_smil_formats(
3149                     source_url, video_id, fatal=False))
3150             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3151             elif source_type.startswith('audio') or ext in (
3152                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3153                 formats.append({
3154                     'url': source_url,
3155                     'vcodec': 'none',
3156                     'ext': ext,
3157                 })
3158             else:
3159                 height = int_or_none(source.get('height'))
3160                 if height is None:
3161                     # Often no height is provided but there is a label in
3162                     # format like "1080p", "720p SD", or 1080.
3163                     height = int_or_none(self._search_regex(
3164                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3165                         'height', default=None))
3166                 a_format = {
3167                     'url': source_url,
3168                     'width': int_or_none(source.get('width')),
3169                     'height': height,
3170                     'tbr': int_or_none(source.get('bitrate')),
3171                     'ext': ext,
3172                 }
3173                 if source_url.startswith('rtmp'):
3174                     a_format['ext'] = 'flv'
3175                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3176                     # of jwplayer.flash.swf
3177                     rtmp_url_parts = re.split(
3178                         r'((?:mp4|mp3|flv):)', source_url, 1)
3179                     if len(rtmp_url_parts) == 3:
3180                         rtmp_url, prefix, play_path = rtmp_url_parts
3181                         a_format.update({
3182                             'url': rtmp_url,
3183                             'play_path': prefix + play_path,
3184                         })
3185                     if rtmp_params:
3186                         a_format.update(rtmp_params)
3187                 formats.append(a_format)
3188         return formats
3189
3190     def _live_title(self, name):
3191         """ Generate the title for a live video """
3192         now = datetime.datetime.now()
3193         now_str = now.strftime('%Y-%m-%d %H:%M')
3194         return name + ' ' + now_str
3195
3196     def _int(self, v, name, fatal=False, **kwargs):
3197         res = int_or_none(v, **kwargs)
3198         if 'get_attr' in kwargs:
3199             print(getattr(v, kwargs['get_attr']))
3200         if res is None:
3201             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3202             if fatal:
3203                 raise ExtractorError(msg)
3204             else:
3205                 self._downloader.report_warning(msg)
3206         return res
3207
3208     def _float(self, v, name, fatal=False, **kwargs):
3209         res = float_or_none(v, **kwargs)
3210         if res is None:
3211             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3212             if fatal:
3213                 raise ExtractorError(msg)
3214             else:
3215                 self._downloader.report_warning(msg)
3216         return res
3217
3218     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3219                     path='/', secure=False, discard=False, rest={}, **kwargs):
3220         cookie = compat_cookiejar_Cookie(
3221             0, name, value, port, port is not None, domain, True,
3222             domain.startswith('.'), path, True, secure, expire_time,
3223             discard, None, None, rest)
3224         self._downloader.cookiejar.set_cookie(cookie)
3225
3226     def _get_cookies(self, url):
3227         """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3228         req = sanitized_Request(url)
3229         self._downloader.cookiejar.add_cookie_header(req)
3230         return compat_cookies_SimpleCookie(req.get_header('Cookie'))
3231
3232     def _apply_first_set_cookie_header(self, url_handle, cookie):
3233         """
3234         Apply first Set-Cookie header instead of the last. Experimental.
3235
3236         Some sites (e.g. [1-3]) may serve two cookies under the same name
3237         in Set-Cookie header and expect the first (old) one to be set rather
3238         than second (new). However, as of RFC6265 the newer one cookie
3239         should be set into cookie store what actually happens.
3240         We will workaround this issue by resetting the cookie to
3241         the first one manually.
3242         1. https://new.vk.com/
3243         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3244         3. https://learning.oreilly.com/
3245         """
3246         for header, cookies in url_handle.headers.items():
3247             if header.lower() != 'set-cookie':
3248                 continue
3249             if sys.version_info[0] >= 3:
3250                 cookies = cookies.encode('iso-8859-1')
3251             cookies = cookies.decode('utf-8')
3252             cookie_value = re.search(
3253                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3254             if cookie_value:
3255                 value, domain = cookie_value.groups()
3256                 self._set_cookie(domain, cookie, value)
3257                 break
3258
3259     def get_testcases(self, include_onlymatching=False):
3260         t = getattr(self, '_TEST', None)
3261         if t:
3262             assert not hasattr(self, '_TESTS'), \
3263                 '%s has _TEST and _TESTS' % type(self).__name__
3264             tests = [t]
3265         else:
3266             tests = getattr(self, '_TESTS', [])
3267         for t in tests:
3268             if not include_onlymatching and t.get('only_matching', False):
3269                 continue
3270             t['name'] = type(self).__name__[:-len('IE')]
3271             yield t
3272
3273     def is_suitable(self, age_limit):
3274         """ Test whether the extractor is generally suitable for the given
3275         age limit (i.e. pornographic sites are not, all others usually are) """
3276
3277         any_restricted = False
3278         for tc in self.get_testcases(include_onlymatching=False):
3279             if tc.get('playlist', []):
3280                 tc = tc['playlist'][0]
3281             is_restricted = age_restricted(
3282                 tc.get('info_dict', {}).get('age_limit'), age_limit)
3283             if not is_restricted:
3284                 return True
3285             any_restricted = any_restricted or is_restricted
3286         return not any_restricted
3287
3288     def extract_subtitles(self, *args, **kwargs):
3289         if (self._downloader.params.get('writesubtitles', False)
3290                 or self._downloader.params.get('listsubtitles')):
3291             return self._get_subtitles(*args, **kwargs)
3292         return {}
3293
3294     def _get_subtitles(self, *args, **kwargs):
3295         raise NotImplementedError('This method must be implemented by subclasses')
3296
3297     @staticmethod
3298     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3299         """ Merge subtitle items for one language. Items with duplicated URLs
3300         will be dropped. """
3301         list1_urls = set([item['url'] for item in subtitle_list1])
3302         ret = list(subtitle_list1)
3303         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
3304         return ret
3305
3306     @classmethod
3307     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
3308         """ Merge two subtitle dictionaries, language by language. """
3309         ret = dict(subtitle_dict1)
3310         for lang in subtitle_dict2:
3311             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
3312         return ret
3313
3314     def extract_automatic_captions(self, *args, **kwargs):
3315         if (self._downloader.params.get('writeautomaticsub', False)
3316                 or self._downloader.params.get('listsubtitles')):
3317             return self._get_automatic_captions(*args, **kwargs)
3318         return {}
3319
3320     def _get_automatic_captions(self, *args, **kwargs):
3321         raise NotImplementedError('This method must be implemented by subclasses')
3322
3323     def mark_watched(self, *args, **kwargs):
3324         if (self._downloader.params.get('mark_watched', False)
3325                 and (self._get_login_info()[0] is not None
3326                      or self._downloader.params.get('cookiefile') is not None)):
3327             self._mark_watched(*args, **kwargs)
3328
3329     def _mark_watched(self, *args, **kwargs):
3330         raise NotImplementedError('This method must be implemented by subclasses')
3331
3332     def geo_verification_headers(self):
3333         headers = {}
3334         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
3335         if geo_verification_proxy:
3336             headers['Ytdl-request-proxy'] = geo_verification_proxy
3337         return headers
3338
3339     def _generic_id(self, url):
3340         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3341
3342     def _generic_title(self, url):
3343         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3344
3345     @staticmethod
3346     def _availability(is_private, needs_premium, needs_subscription, needs_auth, is_unlisted):
3347         all_known = all(map(
3348             lambda x: x is not None,
3349             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3350         return (
3351             'private' if is_private
3352             else 'premium_only' if needs_premium
3353             else 'subscriber_only' if needs_subscription
3354             else 'needs_auth' if needs_auth
3355             else 'unlisted' if is_unlisted
3356             else 'public' if all_known
3357             else None)
3358
3359
3360 class SearchInfoExtractor(InfoExtractor):
3361     """
3362     Base class for paged search queries extractors.
3363     They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
3364     Instances should define _SEARCH_KEY and _MAX_RESULTS.
3365     """
3366
3367     @classmethod
3368     def _make_valid_url(cls):
3369         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3370
3371     @classmethod
3372     def suitable(cls, url):
3373         return re.match(cls._make_valid_url(), url) is not None
3374
3375     def _real_extract(self, query):
3376         mobj = re.match(self._make_valid_url(), query)
3377         if mobj is None:
3378             raise ExtractorError('Invalid search query "%s"' % query)
3379
3380         prefix = mobj.group('prefix')
3381         query = mobj.group('query')
3382         if prefix == '':
3383             return self._get_n_results(query, 1)
3384         elif prefix == 'all':
3385             return self._get_n_results(query, self._MAX_RESULTS)
3386         else:
3387             n = int(prefix)
3388             if n <= 0:
3389                 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
3390             elif n > self._MAX_RESULTS:
3391                 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3392                 n = self._MAX_RESULTS
3393             return self._get_n_results(query, n)
3394
3395     def _get_n_results(self, query, n):
3396         """Get a specified number of results for a query"""
3397         raise NotImplementedError('This method must be implemented by subclasses')
3398
3399     @property
3400     def SEARCH_KEY(self):
3401         return self._SEARCH_KEY