# coding: utf-8
from __future__ import unicode_literals

import base64
import datetime
import hashlib
import json
import math
import netrc
import os
import random
import re
import socket
import ssl
import sys
import time

from ..compat import (
    compat_cookiejar_Cookie,
    compat_cookies_SimpleCookie,
    compat_etree_Element,
    compat_etree_fromstring,
    compat_getpass,
    compat_http_client,
    compat_integer_types,
    compat_os_name,
    compat_str,
    compat_urllib_error,
    compat_urllib_parse_unquote,
    compat_urllib_parse_urlencode,
    compat_urllib_request,
    compat_urlparse,
    compat_xml_parse_error,
)
from ..downloader import FileDownloader
from ..downloader.f4m import (
    get_base_url,
    remove_encrypted_media,
)
from ..utils import (
    NO_DEFAULT,
    age_restricted,
    base_url,
    bug_reports_message,
    clean_html,
    compiled_regex_type,
    determine_ext,
    determine_protocol,
    dict_get,
    error_to_compat_str,
    ExtractorError,
    extract_attributes,
    fix_xml_ampersands,
    float_or_none,
    GeoRestrictedError,
    GeoUtils,
    int_or_none,
    js_to_json,
    JSON_LD_RE,
    mimetype2ext,
    orderedSet,
    parse_bitrate,
    parse_codecs,
    parse_duration,
    parse_iso8601,
    parse_m3u8_attributes,
    parse_resolution,
    RegexNotFoundError,
    sanitized_Request,
    sanitize_filename,
    str_or_none,
    str_to_int,
    strip_or_none,
    unescapeHTML,
    unified_strdate,
    unified_timestamp,
    update_Request,
    update_url_query,
    urljoin,
    url_basename,
    url_or_none,
    xpath_element,
    xpath_text,
    xpath_with_ns,
)


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the YoutubeDL. The YoutubeDL processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The type field determines the type of the result.
    By far the most common value (and the default if _type is missing) is
    "video", which indicates a single video.

    For a video, the dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        The mandatory URL representing the media:
                                   for plain file media - HTTP URL of this file,
                                   for RTMP - RTMP URL,
                                   for HLS - URL of the M3U8 media playlist,
                                   for HDS - URL of the F4M manifest,
                                   for DASH
                                     - HTTP URL to plain file media (in case of
                                       unfragmented media)
                                     - URL of the MPD manifest or base URL
                                       representing the media if MPD manifest
                                       is parsed from a string (in case of
                                       fragmented media)
                                   for MSS - URL of the ISM manifest.
                    * manifest_url
                                 The URL of the manifest file in case of
                                 fragmented media:
                                   for HLS - URL of the M3U8 master playlist,
                                   for HDS - URL of the F4M manifest,
                                   for DASH - URL of the MPD manifest,
                                   for MSS - URL of the ISM manifest.
                    * ext        Will be calculated from URL if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * fps        Frame rate
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp", "rtmpe",
                                 "m3u8", "m3u8_native" or "http_dash_segments".
                    * fragment_base_url
                                 Base URL for fragments. Each fragment's path
                                 value (if present) will be relative to
                                 this URL.
                    * fragments  A list of fragments of a fragmented media.
                                 Each fragment entry must contain either a URL
                                 or a path. If a URL is present, it should be
                                 considered by a client. Otherwise both path and
                                 fragment_base_url must be present. Here is
                                 the list of all potential fields:
                                 * "url" - fragment's URL
                                 * "path" - fragment's path relative to
                                            fragment_base_url
                                 * "duration" (optional, int or float)
                                 * "filesize" (optional, int)
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                                 < -1000 to hide the format (if there is
                                    another one which is strictly better)
                    * language   Language code, e.g. "de" or "en-US".
                    * language_preference  Is this in the language mentioned in
                                 the URL?
                                 10 if it's what the URL is about,
                                 -1 for default (don't know),
                                 -10 otherwise, other values reserved for now.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * source_preference  Order number for this video source
                                 (quality takes higher priority)
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * http_headers  A dictionary of additional HTTP headers
                                 to add to the request.
                    * stretched_ratio  If given and not 1, indicates that the
                                 video's pixels are not square.
                                 width : height ratio as float.
                    * no_resume  The server does not support resuming the
                                 (HTTP or RTMP) download. Boolean.
                    * downloader_options  A dictionary of downloader options as
                                 described in FileDownloader

    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    alt_title:      A secondary title of the video.
    display_id:     An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "id" (optional, string) - Thumbnail format ID
                        * "url"
                        * "preference" (optional, int) - quality of the image
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
                        * "filesize" (optional, int)
    thumbnail:      Full URL to a video thumbnail image.
    description:    Full video description.
    uploader:       Full name of the video uploader.
    license:        License name the video is licensed under.
    creator:        The creator of the video.
    release_timestamp: UNIX timestamp of the moment the video was released.
    release_date:   The date (YYYYMMDD) when the video was released.
    timestamp:      UNIX timestamp of the moment the video was uploaded.
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp.
    uploader_id:    Nickname or id of the video uploader.
    uploader_url:   Full URL to a personal webpage of the video uploader.
    channel:        Full name of the channel the video is uploaded on.
                    Note that channel fields may or may not repeat uploader
                    fields. This depends on a particular extractor.
    channel_id:     Id of the channel.
    channel_url:    Full URL to a channel webpage.
    location:       Physical location where the video was filmed.
    subtitles:      The available subtitles as a dictionary in the format
                    {tag: subformats}. "tag" is usually a language code, and
                    "subformats" is a list sorted from lower to higher
                    preference, each element is a dictionary with the "ext"
                    entry and one of:
                        * "data": The subtitles file contents
                        * "url": A URL pointing to the subtitles file
                    "ext" will be calculated from URL if missing
    automatic_captions: Like 'subtitles'; contains automatically generated
                    captions instead of normal subtitles
    duration:       Length of the video in seconds, as an integer or float.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
    comment_count:  Number of comments on the video
    comments:       A list of comments, each with one or more of the following
                    properties (all but one of text or html optional):
                        * "author" - human-readable name of the comment author
                        * "author_id" - user ID of the comment author
                        * "author_thumbnail" - The thumbnail of the comment author
                        * "id" - Comment ID
                        * "html" - Comment as HTML
                        * "text" - Plain text of the comment
                        * "timestamp" - UNIX timestamp of comment
                        * "parent" - ID of the comment this one is replying to.
                                     Set to "root" to indicate that this is a
                                     comment to the original video.
                        * "like_count" - Number of positive ratings of the comment
                        * "dislike_count" - Number of negative ratings of the comment
                        * "is_favorited" - Whether the comment is marked as
                                           favorite by the video uploader
                        * "author_is_uploader" - Whether the comment is made by
                                                 the video uploader
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage; if given to yt-dlp, it
                    should allow getting the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]
    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.
    was_live:       True, False, or None (=unknown). Whether this video was
                    originally a live stream.
    start_time:     Time in seconds where the reproduction should start, as
                    specified in the URL.
    end_time:       Time in seconds where the reproduction should end, as
                    specified in the URL.
    chapters:       A list of dictionaries, with the following entries:
                        * "start_time" - The start time of the chapter in seconds
                        * "end_time" - The end time of the chapter in seconds
                        * "title" (optional, string)
    playable_in_embed: Whether this video is allowed to play in embedded
                    players on other sites. Can be True (=always allowed),
                    False (=never allowed), None (=unknown), or a string
                    specifying the criteria for embedability (e.g. 'whitelist')
    availability:   Under what condition the video is available. One of
                    'private', 'premium_only', 'subscriber_only', 'needs_auth',
                    'unlisted' or 'public'. Use 'InfoExtractor._availability'
                    to set it
    __post_extractor: A function to be called just before the metadata is
                    written to either disk, logger or console. The function
                    must return a dict which will be added to the info_dict.
                    This is useful for additional information that is
                    time-consuming to extract. Note that the fields thus
                    extracted will not be available to the output template and
                    match_filter. So, only "comments" and "comment_count" are
                    currently allowed to be extracted via this method.

    The following fields should only be used when the video belongs to some logical
    chapter or section:

    chapter:        Name or title of the chapter the video belongs to.
    chapter_number: Number of the chapter the video belongs to, as an integer.
    chapter_id:     Id of the chapter the video belongs to, as a unicode string.

    The following fields should only be used when the video is an episode of some
    series, programme or podcast:

    series:         Title of the series or programme the video episode belongs to.
    season:         Title of the season the video episode belongs to.
    season_number:  Number of the season the video episode belongs to, as an integer.
    season_id:      Id of the season the video episode belongs to, as a unicode string.
    episode:        Title of the video episode. Unlike the mandatory video title field,
                    this field should denote the exact title of the video episode
                    without any kind of decoration.
    episode_number: Number of the video episode within a season, as an integer.
    episode_id:     Id of the video episode, as a unicode string.

    The following fields should only be used when the media is a track or a part of
    a music album:

    track:          Title of the track.
    track_number:   Number of the track within an album or a disc, as an integer.
    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
                    as a unicode string.
    artist:         Artist(s) of the track.
    genre:          Genre(s) of the track.
    album:          Title of the album the track belongs to.
    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
    album_artist:   List of all artists appearing on the album (e.g.
                    "Ash Borer / Fell Voices" or "Various Artists", useful for splits
                    and compilations).
    disc_number:    Number of the disc or other physical medium the track belongs to,
                    as an integer.
    release_year:   Year (YYYY) when the album was released.

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.
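
    For instance, a minimal single-video result might look like this
    (illustrative, made-up values):

        {
            'id': '12345',
            'title': 'Example video title',
            'url': 'https://media.example.com/video.mp4',
            'ext': 'mp4',
        }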


    _type "playlist" indicates multiple videos.
    There must be a key "entries", which is a list, an iterable, or a PagedList
    object, each element of which is a valid dictionary by this specification.

    Additionally, playlists can have "id", "title", and any other relevant
    attributes with the same semantics as videos (see above).


    _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
    It must have an entries key like a playlist and contain all the keys
    required for a video at the same time.


    _type "url" indicates that the video must be extracted from another
    location, possibly by a different extractor. Its only required key is:
    "url" - the next URL to extract.
    The key "ie_key" can be set to the class name (minus the trailing "IE",
    e.g. "Youtube") if the extractor class is known in advance.
    Additionally, the dictionary may have any properties of the resolved entity
    known in advance, for example "title" if the title of the referred video is
    known ahead of time.
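
    For example (illustrative values):

        {
            '_type': 'url',
            'url': 'https://video.example.com/watch/12345',
            'ie_key': 'Example',
        }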


    _type "url_transparent" entities have the same specification as "url", but
    indicate that the given additional information is more precise than the one
    associated with the resolved URL.
    This is useful when a site employs a video service that hosts the video and
    its technical metadata, but that video service does not embed a useful
    title, description etc.


    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.
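
    A hypothetical minimal subclass (the site, URL pattern, and helper calls
    below are illustrative only) could look like:

        class ExampleIE(InfoExtractor):
            _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>[0-9]+)'

            def _real_extract(self, url):
                video_id = self._match_id(url)
                webpage = self._download_webpage(url, video_id)
                return {
                    'id': video_id,
                    'title': self._og_search_title(webpage),
                    'url': self._og_search_video_url(webpage),
                }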

    _GEO_BYPASS attribute may be set to False in order to disable
    geo restriction bypass mechanisms for a particular extractor.
    Though it won't disable explicit geo restriction bypass based on
    country code provided with geo_bypass_country.

    _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
    countries for this extractor. One of these countries will be used by
    geo restriction bypass mechanism right away in order to bypass
    geo restriction, of course, if the mechanism is not disabled.

    _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
    IP blocks in CIDR notation for this extractor. One of these IP blocks
    will be used by geo restriction bypass mechanism similarly
    to _GEO_COUNTRIES.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _x_forwarded_for_ip = None
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self._x_forwarded_for_ip = None
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def _match_id(cls, url):
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        m = cls._VALID_URL_RE.match(url)
        assert m
        return compat_str(m.group('id'))

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._initialize_geo_bypass({
            'countries': self._GEO_COUNTRIES,
            'ip_blocks': self._GEO_IP_BLOCKS,
        })
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def _initialize_geo_bypass(self, geo_bypass_context):
        """
        Initialize geo restriction bypass mechanism.

        This method is used to initialize the geo bypass mechanism based on
        faking the X-Forwarded-For HTTP header. A random country from the
        provided country list is selected and a random IP belonging to this
        country is generated. This IP will be passed as the X-Forwarded-For
        HTTP header in all subsequent HTTP requests.

        This method will be used for initial geo bypass mechanism initialization
        during the instance initialization with _GEO_COUNTRIES and
        _GEO_IP_BLOCKS.

        You may also manually call it from the extractor's code if geo bypass
        information is not available beforehand (e.g. obtained during
        extraction) or due to some other reason. In this case you should pass
        this information in the geo bypass context passed as the first
        argument. It may contain the following fields:

        countries:  List of geo unrestricted countries (similar
                    to _GEO_COUNTRIES)
        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                    (similar to _GEO_IP_BLOCKS)

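        Example (made-up country codes and IP block):

            self._initialize_geo_bypass({
                'countries': ['DE', 'FR'],
                'ip_blocks': ['203.0.113.0/24'],
            })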
        """
        if not self._x_forwarded_for_ip:

            # Geo bypass mechanism is explicitly disabled by user
            if not self._downloader.params.get('geo_bypass', True):
                return

            if not geo_bypass_context:
                geo_bypass_context = {}

            # Backward compatibility: previously _initialize_geo_bypass
            # expected a list of countries, some 3rd party code may still use
            # it this way
            if isinstance(geo_bypass_context, (list, tuple)):
                geo_bypass_context = {
                    'countries': geo_bypass_context,
                }

            # The whole point of geo bypass mechanism is to fake IP
            # as X-Forwarded-For HTTP header based on some IP block or
            # country code.

            # Path 1: bypassing based on IP block in CIDR notation

            # Explicit IP block specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            ip_block = self._downloader.params.get('geo_bypass_ip_block', None)

            # Otherwise use random IP block from geo bypass context but only
            # if extractor is known as geo bypassable
            if not ip_block:
                ip_blocks = geo_bypass_context.get('ip_blocks')
                if self._GEO_BYPASS and ip_blocks:
                    ip_block = random.choice(ip_blocks)

            if ip_block:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
                if self._downloader.params.get('verbose', False):
                    self._downloader.to_screen(
                        '[debug] Using fake IP %s as X-Forwarded-For.'
                        % self._x_forwarded_for_ip)
                return

            # Path 2: bypassing based on country code

            # Explicit country code specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            country = self._downloader.params.get('geo_bypass_country', None)

            # Otherwise use random country code from geo bypass context but
            # only if extractor is known as geo bypassable
            if not country:
                countries = geo_bypass_context.get('countries')
                if self._GEO_BYPASS and countries:
                    country = random.choice(countries)

            if country:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
                if self._downloader.params.get('verbose', False):
                    self._downloader.to_screen(
                        '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
                        % (self._x_forwarded_for_ip, country.upper()))

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        try:
            for _ in range(2):
                try:
                    self.initialize()
                    ie_result = self._real_extract(url)
                    if self._x_forwarded_for_ip:
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except ExtractorError:
            raise
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True)
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e)

    def __maybe_fake_ip_and_retry(self, countries):
        if (not self._downloader.params.get('geo_bypass_country', None)
                and self._GEO_BYPASS
                and self._downloader.params.get('geo_bypass', True)
                and not self._x_forwarded_for_ip
                and countries):
            country_code = random.choice(countries)
            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
            if self._x_forwarded_for_ip:
                self.report_warning(
                    'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
                    % (self._x_forwarded_for_ip, country_code.upper()))
                return True
        return False

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return compat_str(cls.__name__[:-2])

    @property
    def IE_NAME(self):
        return compat_str(type(self).__name__[:-2])

    @staticmethod
    def __can_accept_status_code(err, expected_status):
        assert isinstance(err, compat_urllib_error.HTTPError)
        if expected_status is None:
            return False
        if isinstance(expected_status, compat_integer_types):
            return err.code == expected_status
        elif isinstance(expected_status, (list, tuple)):
            return err.code in expected_status
        elif callable(expected_status):
            return expected_status(err.code) is True
        else:
            assert False

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
        """
        Return the response handle.

        See _download_webpage docstring for arguments specification.
        """
        if not self._downloader._first_webpage_request:
            sleep_interval = float_or_none(self._downloader.params.get('sleep_interval_requests')) or 0
            if sleep_interval > 0:
                self.to_screen('Sleeping %s seconds ...' % sleep_interval)
                time.sleep(sleep_interval)
        else:
            self._downloader._first_webpage_request = False

        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen('%s' % (note,))
            else:
                self.to_screen('%s: %s' % (video_id, note))

        # Some sites check the X-Forwarded-For HTTP header in order to figure
        # out the origin of the client behind a proxy. This allows bypassing
        # geo restriction by faking this header's value to an IP that belongs
        # to some geo unrestricted country. We will do so once we encounter
        # any geo restriction error.
        if self._x_forwarded_for_ip:
            if 'X-Forwarded-For' not in headers:
                headers['X-Forwarded-For'] = self._x_forwarded_for_ip

        if isinstance(url_or_request, compat_urllib_request.Request):
            url_or_request = update_Request(
                url_or_request, data=data, headers=headers, query=query)
        else:
            if query:
                url_or_request = update_url_query(url_or_request, query)
            if data is not None or headers:
                url_or_request = sanitized_Request(url_or_request, data, headers)
        exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
        if hasattr(ssl, 'CertificateError'):
            exceptions.append(ssl.CertificateError)
        try:
            return self._downloader.urlopen(url_or_request)
        except tuple(exceptions) as err:
            if isinstance(err, compat_urllib_error.HTTPError):
                if self.__can_accept_status_code(err, expected_status):
                    # Retain reference to error to prevent file object from
                    # being closed before it can be read. Works around the
                    # effects of <https://bugs.python.org/issue15002>
                    # introduced in Python 3.4.1.
                    err.fp._error = err
                    return err.fp

            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
        """
        Return a tuple (page content as string, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
        if urlh is False:
            assert not fatal
            return False
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
        return (content, urlh)

    @staticmethod
    def _guess_encoding_from_content(content_type, webpage_bytes):
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'

        return encoding

    def __check_blocked(self, content):
        first_block = content[:512]
        if ('<title>Access to this site is blocked</title>' in content
                and 'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
                and 'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)

    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            basen = '%s_%s' % (video_id, urlh.geturl())
            if len(basen) > 240:
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content

    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=5, encoding=None, data=None,
            headers={}, query={}, expected_status=None):
        """
        Return the data of the page as a string.

        Arguments:
        url_or_request -- plain text URL as a string or
            a compat_urllib_request.Request object
        video_id -- Video/playlist/item identifier (string)

        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractorError to be raised,
            otherwise a warning will be reported and extraction continued
        tries -- number of tries
        timeout -- sleep interval between tries
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows accepting failed HTTP requests (non 2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                  accept
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx),
            which are always accepted.
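
            For instance (hypothetical call), a site that returns a useful
            JSON body with a 404 status could be handled with:

                page = self._download_webpage(url, video_id, expected_status=404)

            or, accepting any 4xx status via a callable:

                page = self._download_webpage(
                    url, video_id, expected_status=lambda x: 400 <= x < 500)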
        """

        success = False
        try_count = 0
        while success is False:
            try:
                res = self._download_webpage_handle(
                    url_or_request, video_id, note, errnote, fatal,
                    encoding=encoding, data=data, headers=headers, query=query,
                    expected_status=expected_status)
                success = True
            except compat_http_client.IncompleteRead as e:
                try_count += 1
                if try_count >= tries:
                    raise e
                self._sleep(timeout, video_id)
        if res is False:
            return res
        else:
            content, _ = res
            return content

    def _download_xml_handle(
            self, url_or_request, video_id, note='Downloading XML',
            errnote='Unable to download XML', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (xml as a compat_etree_Element, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        xml_string, urlh = res
        return self._parse_xml(
            xml_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_xml(
            self, url_or_request, video_id,
            note='Downloading XML', errnote='Unable to download XML',
            transform_source=None, fatal=True, encoding=None,
            data=None, headers={}, query={}, expected_status=None):
        """
        Return the xml as a compat_etree_Element.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_xml_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            xml_string = transform_source(xml_string)
        try:
            return compat_etree_fromstring(xml_string.encode('utf-8'))
        except compat_xml_parse_error as ve:
            errmsg = '%s: Failed to parse XML ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

    def _download_json_handle(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (JSON object, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        json_string, urlh = res
        return self._parse_json(
            json_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_json(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return the JSON object as a dict.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_json_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            json_string = transform_source(json_string)
        try:
            return json.loads(json_string)
        except ValueError as ve:
            errmsg = '%s: Failed to parse JSON ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

    def report_warning(self, msg, video_id=None):
        idstr = '' if video_id is None else '%s: ' % video_id
        self._downloader.report_warning(
            '[%s] %s%s' % (self.IE_NAME, idstr, msg))

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen('%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen('%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')

    def raise_login_required(
            self, msg='This video is only available for registered users', metadata_available=False):
        if metadata_available and self._downloader.params.get('ignore_no_formats_error'):
            self.report_warning(msg)
            return
        raise ExtractorError(
            '%s. Use --cookies, --username and --password or --netrc to provide account credentials' % msg,
            expected=True)

    def raise_geo_restricted(
            self, msg='This video is not available from your location due to geo restriction',
            countries=None, metadata_available=False):
        if metadata_available and self._downloader.params.get('ignore_no_formats_error'):
            self.report_warning(msg)
        else:
            raise GeoRestrictedError(msg, countries=countries)

    def raise_no_formats(self, msg, expected=False, video_id=None):
        if expected and self._downloader.params.get('ignore_no_formats_error'):
            self.report_warning(msg, video_id)
        else:
            raise ExtractorError(msg, expected=expected, video_id=video_id)

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None, video_title=None):
        """Returns a URL that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        if video_title is not None:
            video_info['title'] = video_title
        return video_info

    def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
        urls = orderedSet(
            self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
            for m in matches)
        return self.playlist_result(
            urls, playlist_id=playlist_id, playlist_title=playlist_title)

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        video_info.update(kwargs)
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        if playlist_description is not None:
            video_info['description'] = playlist_description
        return video_info

    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single pattern or
        a list of patterns, returning the first matching group.
        In case of failure, return a default value, report a warning, or raise
        a RegexNotFoundError, depending on fatal and default, specifying the
        field name.
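
        Example (hypothetical pattern and webpage variable):

            title = self._search_regex(
                r'<h1 class="video-title">([^<]+)</h1>', webpage, 'title',
                default=None)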
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
            _name = '\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            if group is None:
                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
            else:
                return mobj.group(group)
        elif default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract %s' % _name)
        else:
            self.report_warning('unable to extract %s' % _name + bug_reports_message())
            return None

    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_netrc_login_info(self, netrc_machine=None):
        username = None
        password = None
        netrc_machine = netrc_machine or self._NETRC_MACHINE

        if self._downloader.params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(netrc_machine)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError(
                        'No authenticators for %s' % netrc_machine)
            except (IOError, netrc.NetrcParseError) as err:
                self.report_warning(
                    'parsing .netrc: %s' % error_to_compat_str(err))

        return username, password

    def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
        """
        Get the login info as (username, password).
        First look for the manually specified credentials using username_option
        and password_option as keys in the params dictionary. If no such
        credentials are available, look in the netrc file using the
        netrc_machine or _NETRC_MACHINE value.
        If there's no info available, return (None, None).
        """
        if self._downloader is None:
            return (None, None)

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get(username_option) is not None:
            username = downloader_params[username_option]
            password = downloader_params[password_option]
        else:
            username, password = self._get_netrc_login_info(netrc_machine)

        return username, password

    def _get_tfa_info(self, note='two-factor verification code'):
        """
        Get the two-factor authentication info.
        TODO - asking the user will be required for sms/phone verify;
        currently this just uses the command line option.
        If there's no info available, return None.
        """
        if self._downloader is None:
            return None
        downloader_params = self._downloader.params

        if downloader_params.get('twofactor') is not None:
            return downloader_params['twofactor']

        return compat_getpass('Type %s and press [Return]: ' % note)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
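        # Builds patterns matching OpenGraph tags such as (illustrative)
        # <meta property="og:title" content="..."> - the two templates below
        # cover both orders of the name/property and content attributes.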
        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
        property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
                       % {'prop': re.escape(prop)})
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    @staticmethod
    def _meta_regex(prop):
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        if not isinstance(prop, (list, tuple)):
            prop = [prop]
        if name is None:
            name = 'OpenGraph %s' % prop[0]
        og_regexes = []
        for p in prop:
            og_regexes.extend(self._og_regexes(p))
        escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _og_search_url(self, html, **kargs):
        return self._og_search_property('url', html, **kargs)

    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        if not isinstance(name, (list, tuple)):
            name = [name]
        if display_name is None:
            display_name = name[0]
        return self._html_search_regex(
            [self._meta_regex(n) for n in name],
            html, display_name, fatal=fatal, group='content', **kwargs)

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
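        # i.e. pages carrying the label
        # <meta name="rating" content="RTA-5042-1996-1400-1577-RTA">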
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r' content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower())

    def _family_friendly_search(self, html):
        # See http://schema.org/VideoObject
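        # e.g. (illustrative) <meta itemprop="isFamilyFriendly" content="true">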
        family_friendly = self._html_search_meta(
            'isFamilyFriendly', html, default=None)

        if not family_friendly:
            return None

        RATING_TABLE = {
            '1': 0,
            'true': 0,
            '0': 18,
            'false': 18,
        }
        return RATING_TABLE.get(family_friendly.lower())

    def _twitter_search_player(self, html):
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')

    def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
        json_ld_list = list(re.finditer(JSON_LD_RE, html))
        default = kwargs.get('default', NO_DEFAULT)
        # JSON-LD may be malformed and thus `fatal` should be respected.
        # At the same time `default` may be passed that assumes `fatal=False`
        # for _search_regex. Let's simulate the same behavior here as well.
        fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
        json_ld = []
        for mobj in json_ld_list:
            json_ld_item = self._parse_json(
                mobj.group('json_ld'), video_id, fatal=fatal)
            if not json_ld_item:
                continue
            if isinstance(json_ld_item, dict):
                json_ld.append(json_ld_item)
            elif isinstance(json_ld_item, (list, tuple)):
                json_ld.extend(json_ld_item)
        if json_ld:
            json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
        if json_ld:
            return json_ld
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract JSON-LD')
        else:
            self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
            return {}

    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            json_ld = [json_ld]

        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def extract_interaction_type(e):
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            interaction_statistic = e.get('interactionStatistic')
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not isinstance(is_e, dict):
                    continue
                if is_e.get('@type') != 'InteractionCounter':
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For the interaction count some sites provide a string instead
                # of an integer (as per spec), with non-digit characters
                # (e.g. ","), so extract the count with the more relaxed
                # str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_video_object(e):
            assert e['@type'] == 'VideoObject'
            author = e.get('author')
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of 'Organization' or 'Person' types.
                # both types can have 'name' property (inherited from 'Thing' type). [1]
                # however some websites are using 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
                'filesize': float_or_none(e.get('contentSize')),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })
            extract_interaction_statistic(e)

        for e in json_ld:
            if '@context' in e:
                item_type = e.get('@type')
                if expected_type is not None and expected_type != item_type:
                    continue
                if item_type in ('TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type == 'Movie':
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif item_type in ('Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody')),
                    })
                elif item_type == 'VideoObject':
                    extract_video_object(e)
                    if expected_type is None:
                        continue
                    else:
                        break
                video = e.get('video')
                if isinstance(video, dict) and video.get('@type') == 'VideoObject':
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break
        return dict((k, v) for k, v in info.items() if v is not None)

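    # For instance (illustrative form field), an HTML document containing
    # <input type="hidden" name="csrf_token" value="abc123"> yields
    # {'csrf_token': 'abc123'} from _hidden_inputs().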
    @staticmethod
    def _hidden_inputs(html):
        html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
        hidden_inputs = {}
        for input in re.findall(r'(?i)(<input[^>]+>)', html):
            attrs = extract_attributes(input)
            if not attrs:
                continue
            if attrs.get('type') not in ('hidden', 'submit'):
                continue
            name = attrs.get('name') or attrs.get('id')
            value = attrs.get('value')
            if name and value is not None:
                hidden_inputs[name] = value
        return hidden_inputs

    def _form_hidden_inputs(self, form_id, html):
        form = self._search_regex(
            r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
            html, '%s form' % form_id, group='form')
        return self._hidden_inputs(form)

    class FormatSort:
        regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
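        # The regex above parses a single field token of a format sort string
        # into its reverse marker, field name and optional limit; e.g. the
        # (hypothetical) token '+res:480' gives reverse='+', field='res',
        # separator=':' and limit='480'.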

        default = ('hidden', 'hasvid', 'ie_pref', 'lang', 'quality',
                   'res', 'fps', 'codec:vp9.2', 'size', 'br', 'asr',
                   'proto', 'ext', 'has_audio', 'source', 'format_id')  # These must not be aliases

        settings = {
            'vcodec': {'type': 'ordered', 'regex': True,
                       'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
            'acodec': {'type': 'ordered', 'regex': True,
                       'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']},
            'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                      'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', 'm3u8', '.*dash', '', 'mms|rtsp', 'none', 'f4']},
            'vext': {'type': 'ordered', 'field': 'video_ext',
                     'order': ('mp4', 'webm', 'flv', '', 'none'),
                     'order_free': ('webm', 'mp4', 'flv', '', 'none')},
            'aext': {'type': 'ordered', 'field': 'audio_ext',
                     'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
                     'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
            'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
            'ie_pref': {'priority': True, 'type': 'extractor'},
            'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'lang': {'priority': True, 'convert': 'ignore', 'field': 'language_preference'},
            'quality': {'convert': 'float_none'},
            'filesize': {'convert': 'bytes'},
            'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
            'id': {'convert': 'string', 'field': 'format_id'},
            'height': {'convert': 'float_none'},
            'width': {'convert': 'float_none'},
            'fps': {'convert': 'float_none'},
            'tbr': {'convert': 'float_none'},
            'vbr': {'convert': 'float_none'},
            'abr': {'convert': 'float_none'},
            'asr': {'convert': 'float_none'},
            'source': {'convert': 'ignore', 'field': 'source_preference'},

            'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
            'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
            'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
            'ext': {'type': 'combined', 'field': ('vext', 'aext')},
            'res': {'type': 'multiple', 'field': ('height', 'width'), 'function': min},

            # Most of these exist only for compatibility reasons
            'dimension': {'type': 'alias', 'field': 'res'},
            'resolution': {'type': 'alias', 'field': 'res'},
            'extension': {'type': 'alias', 'field': 'ext'},
            'bitrate': {'type': 'alias', 'field': 'br'},
            'total_bitrate': {'type': 'alias', 'field': 'tbr'},
            'video_bitrate': {'type': 'alias', 'field': 'vbr'},
            'audio_bitrate': {'type': 'alias', 'field': 'abr'},
            'framerate': {'type': 'alias', 'field': 'fps'},
            'language_preference': {'type': 'alias', 'field': 'lang'},  # not named as 'language' because such a field exists
            'protocol': {'type': 'alias', 'field': 'proto'},
            'source_preference': {'type': 'alias', 'field': 'source'},
            'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
            'filesize_estimate': {'type': 'alias', 'field': 'size'},
            'samplerate': {'type': 'alias', 'field': 'asr'},
            'video_ext': {'type': 'alias', 'field': 'vext'},
            'audio_ext': {'type': 'alias', 'field': 'aext'},
            'video_codec': {'type': 'alias', 'field': 'vcodec'},
            'audio_codec': {'type': 'alias', 'field': 'acodec'},
            'video': {'type': 'alias', 'field': 'hasvid'},
            'has_video': {'type': 'alias', 'field': 'hasvid'},
            'audio': {'type': 'alias', 'field': 'hasaud'},
            'has_audio': {'type': 'alias', 'field': 'hasaud'},
            'extractor': {'type': 'alias', 'field': 'ie_pref'},
            'preference': {'type': 'alias', 'field': 'ie_pref'},
            'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
            'format_id': {'type': 'alias', 'field': 'id'},
        }

        _order = []

        def _get_field_setting(self, field, key):
            if field not in self.settings:
                self.settings[field] = {}
            propObj = self.settings[field]
            if key not in propObj:
                type = propObj.get('type')
                if key == 'field':
                    default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
                elif key == 'convert':
                    default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
                else:
                    default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,), 'function': max}.get(key, None)
                propObj[key] = default
            return propObj[key]

        def _resolve_field_value(self, field, value, convertNone=False):
            if value is None:
                if not convertNone:
                    return None
            else:
                value = value.lower()
            conversion = self._get_field_setting(field, 'convert')
            if conversion == 'ignore':
                return None
            if conversion == 'string':
                return value
            elif conversion == 'float_none':
                return float_or_none(value)
            elif conversion == 'bytes':
                return FileDownloader.parse_bytes(value)
            elif conversion == 'order':
                order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
                use_regex = self._get_field_setting(field, 'regex')
                list_length = len(order_list)
                empty_pos = order_list.index('') if '' in order_list else list_length + 1
                if use_regex and value is not None:
                    for i, regex in enumerate(order_list):
                        if regex and re.match(regex, value):
                            return list_length - i
1531 return list_length - empty_pos # not in list
1532 else:  # not regex, or value is None
1533 return list_length - (order_list.index(value) if value in order_list else empty_pos)
1534 else:
1535 if value.isnumeric():
1536 return float(value)
1537 else:
1538 self.settings[field]['convert'] = 'string'
1539 return value
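# Worked example of the 'order' conversion above (current 11-entry vcodec
# list): matches are scored by their distance from the end of the list, so
# 'av01' -> 11, 'vp9' -> 9, 'none' -> 1, and any unrecognized codec string
# falls back to the score of the '' placeholder (here 3). Higher is better.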
1540
1541 def evaluate_params(self, params, sort_extractor):
1542 self._use_free_order = params.get('prefer_free_formats', False)
1543 self._sort_user = params.get('format_sort', [])
1544 self._sort_extractor = sort_extractor
1545
1546 def add_item(field, reverse, closest, limit_text):
1547 field = field.lower()
1548 if field in self._order:
1549 return
1550 self._order.append(field)
1551 limit = self._resolve_field_value(field, limit_text)
1552 data = {
1553 'reverse': reverse,
1554 'closest': False if limit is None else closest,
1555 'limit_text': limit_text,
1556 'limit': limit}
1557 if field in self.settings:
1558 self.settings[field].update(data)
1559 else:
1560 self.settings[field] = data
1561
1562 sort_list = (
1563 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1564 + (tuple() if params.get('format_sort_force', False)
1565 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1566 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1567
1568 for item in sort_list:
1569 match = re.match(self.regex, item)
1570 if match is None:
1571 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1572 field = match.group('field')
1573 if field is None:
1574 continue
1575 if self._get_field_setting(field, 'type') == 'alias':
1576 field = self._get_field_setting(field, 'field')
1577 reverse = match.group('reverse') is not None
1578 closest = match.group('separator') == '~'
1579 limit_text = match.group('limit')
1580
1581 has_limit = limit_text is not None
1582 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1583 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1584
1585 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1586 limits = limit_text.split(":") if has_multiple_limits else (limit_text,) if has_limit else tuple()
1587 limit_count = len(limits)
1588 for (i, f) in enumerate(fields):
1589 add_item(f, reverse, closest,
1590 limits[i] if i < limit_count
1591 else limits[0] if has_limit and not has_multiple_limits
1592 else None)
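# Illustration of how tokens expand here (hypothetical user input):
# '+br:1000' names a 'combined' field, so add_item is called for tbr, vbr
# and abr, each in ascending order (prefer the smallest value not below the
# limit) and, because of same_limit, each with limit 1000; 'res~720' stays
# a single field with closest=True, preferring the resolution nearest 720.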
1593
1594 def print_verbose_info(self, to_screen):
1595 if self._sort_user:
1596 to_screen('[debug] Sort order given by user: %s' % ','.join(self._sort_user))
1597 if self._sort_extractor:
1598 to_screen('[debug] Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1599 to_screen('[debug] Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1600 '+' if self._get_field_setting(field, 'reverse') else '', field,
1601 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1602 self._get_field_setting(field, 'limit_text'),
1603 self._get_field_setting(field, 'limit'))
1604 if self._get_field_setting(field, 'limit_text') is not None else '')
1605 for field in self._order if self._get_field_setting(field, 'visible')]))
1606
1607 def _calculate_field_preference_from_value(self, format, field, type, value):
1608 reverse = self._get_field_setting(field, 'reverse')
1609 closest = self._get_field_setting(field, 'closest')
1610 limit = self._get_field_setting(field, 'limit')
1611
1612 if type == 'extractor':
1613 maximum = self._get_field_setting(field, 'max')
1614 if value is None or (maximum is not None and value >= maximum):
1615 value = -1
1616 elif type == 'boolean':
1617 in_list = self._get_field_setting(field, 'in_list')
1618 not_in_list = self._get_field_setting(field, 'not_in_list')
1619 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1620 elif type == 'ordered':
1621 value = self._resolve_field_value(field, value, True)
1622
1623 # try to convert to number
1624 val_num = float_or_none(value)
1625 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1626 if is_num:
1627 value = val_num
1628
1629 return ((-10, 0) if value is None
1630 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
1631 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1632 else (0, value, 0) if not reverse and (limit is None or value <= limit)
1633 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1634 else (-1, value, 0))
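# Worked example (hypothetical heights, limit=720, closest=True,
# reverse=False): 480 -> (0, -240, 240), 720 -> (0, 0, 0),
# 1080 -> (0, -360, -360). Since formats sort ascending (worst first),
# the exact match ranks best, and for equal distances the third element
# makes the value below the limit win (480 beats 960).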
1635
1636 def _calculate_field_preference(self, format, field):
1637 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
1638 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1639 if type == 'multiple':
1640 type = 'field' # Only 'field' is allowed in multiple for now
1641 actual_fields = self._get_field_setting(field, 'field')
1642
1643 def wrapped_function(values):
1644 values = tuple(filter(lambda x: x is not None, values))
1645 return (self._get_field_setting(field, 'function')(*values) if len(values) > 1
1646 else values[0] if values
1647 else None)
1648
1649 value = wrapped_function((get_value(f) for f in actual_fields))
1650 else:
1651 value = get_value(field)
1652 return self._calculate_field_preference_from_value(format, field, type, value)
1653
1654 def calculate_preference(self, format):
1655 # Determine missing protocol
1656 if not format.get('protocol'):
1657 format['protocol'] = determine_protocol(format)
1658
1659 # Determine missing ext
1660 if not format.get('ext') and 'url' in format:
1661 format['ext'] = determine_ext(format['url'])
1662 if format.get('vcodec') == 'none':
1663 format['audio_ext'] = format['ext']
1664 format['video_ext'] = 'none'
1665 else:
1666 format['video_ext'] = format['ext']
1667 format['audio_ext'] = 'none'
1668 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
1669 # format['preference'] = -1000
1670
1671 # Determine missing bitrates
1672 if format.get('tbr') is None:
1673 if format.get('vbr') is not None and format.get('abr') is not None:
1674 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1675 else:
1676 if format.get('vcodec') != "none" and format.get('vbr') is None:
1677 format['vbr'] = format.get('tbr') - format.get('abr', 0)
1678 if format.get('acodec') != "none" and format.get('abr') is None:
1679 format['abr'] = format.get('tbr') - format.get('vbr', 0)
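# Bitrate illustration (values in kbps, hypothetical): a format with
# vbr=2500 and abr=128 but no tbr is given tbr=2628 above; conversely, a
# video format with tbr=3000 and abr=128 gets vbr=2872.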
1680
1681 return tuple(self._calculate_field_preference(format, field) for field in self._order)
1682
1683 def _sort_formats(self, formats, field_preference=[]):
1684 if not formats:
1685 if self._downloader.params.get('ignore_no_formats_error'):
1686 return
1687 raise ExtractorError('No video formats found')
1688 format_sort = self.FormatSort() # params and to_screen are taken from the downloader
1689 format_sort.evaluate_params(self._downloader.params, field_preference)
1690 if self._downloader.params.get('verbose', False):
1691 format_sort.print_verbose_info(self._downloader.to_screen)
1692 formats.sort(key=lambda f: format_sort.calculate_preference(f))
1693
1694 def _check_formats(self, formats, video_id):
1695 if formats:
1696 formats[:] = filter(
1697 lambda f: self._is_valid_url(
1698 f['url'], video_id,
1699 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1700 formats)
1701
1702 @staticmethod
1703 def _remove_duplicate_formats(formats):
1704 format_urls = set()
1705 unique_formats = []
1706 for f in formats:
1707 if f['url'] not in format_urls:
1708 format_urls.add(f['url'])
1709 unique_formats.append(f)
1710 formats[:] = unique_formats
1711
1712 def _is_valid_url(self, url, video_id, item='video', headers={}):
1713 url = self._proto_relative_url(url, scheme='http:')
1714 # For now assume non HTTP(S) URLs always valid
1715 if not (url.startswith('http://') or url.startswith('https://')):
1716 return True
1717 try:
1718 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1719 return True
1720 except ExtractorError as e:
1721 self.to_screen(
1722 '%s: %s URL is invalid, skipping: %s'
1723 % (video_id, item, error_to_compat_str(e.cause)))
1724 return False
1725
1726 def http_scheme(self):
1727 """ Either "http:" or "https:", depending on the user's preferences """
1728 return (
1729 'http:'
1730 if self._downloader.params.get('prefer_insecure', False)
1731 else 'https:')
1732
1733 def _proto_relative_url(self, url, scheme=None):
1734 if url is None:
1735 return url
1736 if url.startswith('//'):
1737 if scheme is None:
1738 scheme = self.http_scheme()
1739 return scheme + url
1740 else:
1741 return url
1742
1743 def _sleep(self, timeout, video_id, msg_template=None):
1744 if msg_template is None:
1745 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1746 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1747 self.to_screen(msg)
1748 time.sleep(timeout)
1749
1750 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1751 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1752 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1753 manifest = self._download_xml(
1754 manifest_url, video_id, 'Downloading f4m manifest',
1755 'Unable to download f4m manifest',
1756 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1757 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1758 transform_source=transform_source,
1759 fatal=fatal, data=data, headers=headers, query=query)
1760
1761 if manifest is False:
1762 return []
1763
1764 return self._parse_f4m_formats(
1765 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1766 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1767
1768 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1769 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1770 fatal=True, m3u8_id=None):
1771 if not isinstance(manifest, compat_etree_Element) and not fatal:
1772 return []
1773
1774 # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1775 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1776 if akamai_pv is not None and ';' in akamai_pv.text:
1777 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1778 if playerVerificationChallenge.strip() != '':
1779 return []
1780
1781 formats = []
1782 manifest_version = '1.0'
1783 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1784 if not media_nodes:
1785 manifest_version = '2.0'
1786 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1787 # Remove unsupported DRM protected media from final formats
1788 # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1789 media_nodes = remove_encrypted_media(media_nodes)
1790 if not media_nodes:
1791 return formats
1792
1793 manifest_base_url = get_base_url(manifest)
1794
1795 bootstrap_info = xpath_element(
1796 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1797 'bootstrap info', default=None)
1798
1799 vcodec = None
1800 mime_type = xpath_text(
1801 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1802 'base URL', default=None)
1803 if mime_type and mime_type.startswith('audio/'):
1804 vcodec = 'none'
1805
1806 for i, media_el in enumerate(media_nodes):
1807 tbr = int_or_none(media_el.attrib.get('bitrate'))
1808 width = int_or_none(media_el.attrib.get('width'))
1809 height = int_or_none(media_el.attrib.get('height'))
1810 format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1811 # If <bootstrapInfo> is present, the specified f4m is a
1812 # stream-level manifest, and only set-level manifests may refer to
1813 # external resources. See section 11.4 and section 4 of F4M spec
1814 if bootstrap_info is None:
1815 media_url = None
1816 # @href is introduced in 2.0, see section 11.6 of F4M spec
1817 if manifest_version == '2.0':
1818 media_url = media_el.attrib.get('href')
1819 if media_url is None:
1820 media_url = media_el.attrib.get('url')
1821 if not media_url:
1822 continue
1823 manifest_url = (
1824 media_url if media_url.startswith('http://') or media_url.startswith('https://')
1825 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1826 # If media_url is itself an f4m manifest, extract it recursively, since
1827 # the bitrates in the parent manifest (this one) and in the media_url
1828 # manifest may differ, making it impossible for the f4m downloader to
1829 # resolve the format by the requested bitrate.
1830 ext = determine_ext(manifest_url)
1831 if ext == 'f4m':
1832 f4m_formats = self._extract_f4m_formats(
1833 manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1834 transform_source=transform_source, fatal=fatal)
1835 # Sometimes a stream-level manifest contains a single media entry that
1836 # lacks any quality metadata (e.g. http://matchtv.ru/#live-player),
1837 # while the parent's media entry in the set-level manifest may
1838 # contain it. We copy it from the parent in such cases.
1839 if len(f4m_formats) == 1:
1840 f = f4m_formats[0]
1841 f.update({
1842 'tbr': f.get('tbr') or tbr,
1843 'width': f.get('width') or width,
1844 'height': f.get('height') or height,
1845 'format_id': f.get('format_id') if not tbr else format_id,
1846 'vcodec': vcodec,
1847 })
1848 formats.extend(f4m_formats)
1849 continue
1850 elif ext == 'm3u8':
1851 formats.extend(self._extract_m3u8_formats(
1852 manifest_url, video_id, 'mp4', preference=preference,
1853 quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1854 continue
1855 formats.append({
1856 'format_id': format_id,
1857 'url': manifest_url,
1858 'manifest_url': manifest_url,
1859 'ext': 'flv' if bootstrap_info is not None else None,
1860 'protocol': 'f4m',
1861 'tbr': tbr,
1862 'width': width,
1863 'height': height,
1864 'vcodec': vcodec,
1865 'preference': preference,
1866 'quality': quality,
1867 })
1868 return formats
1869
1870 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1871 return {
1872 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1873 'url': m3u8_url,
1874 'ext': ext,
1875 'protocol': 'm3u8',
1876 'preference': preference - 100 if preference else -100,
1877 'quality': quality,
1878 'resolution': 'multiple',
1879 'format_note': 'Quality selection URL',
1880 }
1881
1882 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1883 entry_protocol='m3u8', preference=None, quality=None,
1884 m3u8_id=None, note=None, errnote=None,
1885 fatal=True, live=False, data=None, headers={},
1886 query={}):
1887 res = self._download_webpage_handle(
1888 m3u8_url, video_id,
1889 note=note or 'Downloading m3u8 information',
1890 errnote=errnote or 'Failed to download m3u8 information',
1891 fatal=fatal, data=data, headers=headers, query=query)
1892
1893 if res is False:
1894 return []
1895
1896 m3u8_doc, urlh = res
1897 m3u8_url = urlh.geturl()
1898
1899 return self._parse_m3u8_formats(
1900 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1901 preference=preference, quality=quality, m3u8_id=m3u8_id,
1902 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
1903 headers=headers, query=query, video_id=video_id)
1904
1905 def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
1906 entry_protocol='m3u8', preference=None, quality=None,
1907 m3u8_id=None, live=False, note=None, errnote=None,
1908 fatal=True, data=None, headers={}, query={}, video_id=None):
1909 if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
1910 return []
1911
1912 if (not self._downloader.params.get('allow_unplayable_formats')
1913 and re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc)): # Apple FairPlay
1914 return []
1915
1916 formats = []
1917
1918 format_url = lambda u: (
1919 u
1920 if re.match(r'^https?://', u)
1921 else compat_urlparse.urljoin(m3u8_url, u))
1922
1923 split_discontinuity = self._downloader.params.get('hls_split_discontinuity', False)
1924
1925 # References:
1926 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
1927 # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
1928 # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
1929
1930 # We should try extracting formats only from master playlists [1, 4.3.4],
1931 # i.e. playlists that describe the available qualities. On the other hand,
1932 # media playlists [1, 4.3.3] should be returned as is, since they contain
1933 # just the media, without quality renditions.
1934 # Fortunately, a master playlist can easily be distinguished from a media
1935 # playlist by the availability of particular tags. As per [1, 4.3.3, 4.3.4],
1936 # master playlist tags MUST NOT appear in a media playlist and vice versa.
1937 # As per [1, 4.3.3.1], the #EXT-X-TARGETDURATION tag is REQUIRED for every
1938 # media playlist and MUST NOT appear in a master playlist, so we can
1939 # reliably detect a media playlist with this criterion.
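# A minimal sketch of the two playlist kinds told apart here
# (hypothetical contents). Master playlist:
#   #EXTM3U
#   #EXT-X-STREAM-INF:BANDWIDTH=1280000,RESOLUTION=1280x720
#   hi/index.m3u8
# Media playlist (note the required EXT-X-TARGETDURATION):
#   #EXTM3U
#   #EXT-X-TARGETDURATION:10
#   #EXTINF:9.009,
#   segment0.ts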
1940
1941 def _extract_m3u8_playlist_formats(format_url=None, m3u8_doc=None, video_id=None,
1942 fatal=True, data=None, headers={}):
1943 if not m3u8_doc:
1944 if not format_url:
1945 return []
1946 res = self._download_webpage_handle(
1947 format_url, video_id,
1948 note=False,
1949 errnote='Failed to download m3u8 playlist information',
1950 fatal=fatal, data=data, headers=headers)
1951
1952 if res is False:
1953 return []
1954
1955 m3u8_doc, urlh = res
1956 format_url = urlh.geturl()
1957
1958 playlist_formats = []
1959 i = (
1960 0
1961 if split_discontinuity
1962 else None)
1963 format_info = {
1964 'index': i,
1965 'key_data': None,
1966 'files': [],
1967 }
1968 for line in m3u8_doc.splitlines():
1969 if not line.startswith('#'):
1970 format_info['files'].append(line)
1971 elif split_discontinuity and line.startswith('#EXT-X-DISCONTINUITY'):
1972 i += 1
1973 playlist_formats.append(format_info)
1974 format_info = {
1975 'index': i,
1976 'url': format_url,
1977 'files': [],
1978 }
1979 playlist_formats.append(format_info)
1980 return playlist_formats
1981
1982 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
1983
1984 playlist_formats = _extract_m3u8_playlist_formats(m3u8_doc=m3u8_doc)
1985
1986 for format in playlist_formats:
1987 format_id = []
1988 if m3u8_id:
1989 format_id.append(m3u8_id)
1990 format_index = format.get('index')
1991 if format_index:
1992 format_id.append(str(format_index))
1993 f = {
1994 'format_id': '-'.join(format_id),
1995 'format_index': format_index,
1996 'url': m3u8_url,
1997 'ext': ext,
1998 'protocol': entry_protocol,
1999 'preference': preference,
2000 'quality': quality,
2001 }
2002 formats.append(f)
2003
2004 return formats
2005
2006 groups = {}
2007 last_stream_inf = {}
2008
2009 def extract_media(x_media_line):
2010 media = parse_m3u8_attributes(x_media_line)
2011 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
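# A typical line looks like this (values hypothetical):
#   #EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="aud",NAME="English",LANGUAGE="en",URI="audio/en/index.m3u8"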
2012 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2013 if not (media_type and group_id and name):
2014 return
2015 groups.setdefault(group_id, []).append(media)
2016 if media_type not in ('VIDEO', 'AUDIO'):
2017 return
2018 media_url = media.get('URI')
2019 if media_url:
2020 manifest_url = format_url(media_url)
2021 format_id = []
2022 playlist_formats = _extract_m3u8_playlist_formats(manifest_url, video_id=video_id,
2023 fatal=fatal, data=data, headers=headers)
2024
2025 for format in playlist_formats:
2026 format_index = format.get('index')
2027 for v in (m3u8_id, group_id, name):
2028 if v:
2029 format_id.append(v)
2030 if format_index:
2031 format_id.append(str(format_index))
2032 f = {
2033 'format_id': '-'.join(format_id),
2034 'format_index': format_index,
2035 'url': manifest_url,
2036 'manifest_url': m3u8_url,
2037 'language': media.get('LANGUAGE'),
2038 'ext': ext,
2039 'protocol': entry_protocol,
2040 'preference': preference,
2041 'quality': quality,
2042 }
2043 if media_type == 'AUDIO':
2044 f['vcodec'] = 'none'
2045 formats.append(f)
2046
2047 def build_stream_name():
2048 # Despite specification does not mention NAME attribute for
2049 # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2050 # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2051 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2052 stream_name = last_stream_inf.get('NAME')
2053 if stream_name:
2054 return stream_name
2055 # If there is no NAME in EXT-X-STREAM-INF, it will be obtained
2056 # from the corresponding rendition group
2057 stream_group_id = last_stream_inf.get('VIDEO')
2058 if not stream_group_id:
2059 return
2060 stream_group = groups.get(stream_group_id)
2061 if not stream_group:
2062 return stream_group_id
2063 rendition = stream_group[0]
2064 return rendition.get('NAME') or stream_group_id
2065
2066 # Parse EXT-X-MEDIA tags before EXT-X-STREAM-INF so that video-only
2067 # formats can be detected even when EXT-X-STREAM-INF tags precede
2068 # EXT-X-MEDIA tags in an HLS manifest, as in [3].
2069 for line in m3u8_doc.splitlines():
2070 if line.startswith('#EXT-X-MEDIA:'):
2071 extract_media(line)
2072
2073 for line in m3u8_doc.splitlines():
2074 if line.startswith('#EXT-X-STREAM-INF:'):
2075 last_stream_inf = parse_m3u8_attributes(line)
2076 elif line.startswith('#') or not line.strip():
2077 continue
2078 else:
2079 tbr = float_or_none(
2080 last_stream_inf.get('AVERAGE-BANDWIDTH')
2081 or last_stream_inf.get('BANDWIDTH'), scale=1000)
2082 manifest_url = format_url(line.strip())
2083
2084 playlist_formats = _extract_m3u8_playlist_formats(manifest_url, video_id=video_id,
2085 fatal=fatal, data=data, headers=headers)
2086
2087 for frmt in playlist_formats:
2088 format_id = []
2089 if m3u8_id:
2090 format_id.append(m3u8_id)
2091 format_index = frmt.get('index')
2092 stream_name = build_stream_name()
2093 # The bandwidth of live streams may vary over time, making
2094 # format_id unpredictable, so it is better to keep the provided
2095 # format_id intact.
2096 if not live:
2097 format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
2098 if format_index:
2099 format_id.append(str(format_index))
2100 f = {
2101 'format_id': '-'.join(format_id),
2102 'format_index': format_index,
2103 'url': manifest_url,
2104 'manifest_url': m3u8_url,
2105 'tbr': tbr,
2106 'ext': ext,
2107 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2108 'protocol': entry_protocol,
2109 'preference': preference,
2110 'quality': quality,
2111 }
2112 resolution = last_stream_inf.get('RESOLUTION')
2113 if resolution:
2114 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2115 if mobj:
2116 f['width'] = int(mobj.group('width'))
2117 f['height'] = int(mobj.group('height'))
2118 # Unified Streaming Platform
2119 mobj = re.search(
2120 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2121 if mobj:
2122 abr, vbr = mobj.groups()
2123 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2124 f.update({
2125 'vbr': vbr,
2126 'abr': abr,
2127 })
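# Illustration (hypothetical USP URL): for a URL ending in
# '...ism/index.m3u8?audio=128000-video=2400000' the regex above captures
# bits/s, and the update yields abr=128.0 and vbr=2400.0 (kbps).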
2128 codecs = parse_codecs(last_stream_inf.get('CODECS'))
2129 f.update(codecs)
2130 audio_group_id = last_stream_inf.get('AUDIO')
2131 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2132 # references a rendition group MUST have a CODECS attribute.
2133 # However, this is not always respected: for example, [2]
2134 # contains an EXT-X-STREAM-INF tag which references an AUDIO
2135 # rendition group but has no CODECS attribute and, despite
2136 # referencing an audio group, represents a complete
2137 # (audio and video) format. For such cases we ignore
2138 # references to rendition groups and treat them
2139 # as complete formats.
2140 if audio_group_id and codecs and f.get('vcodec') != 'none':
2141 audio_group = groups.get(audio_group_id)
2142 if audio_group and audio_group[0].get('URI'):
2143 # TODO: update acodec for audio only formats with
2144 # the same GROUP-ID
2145 f['acodec'] = 'none'
2146 if not f.get('ext'):
2147 f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2148 formats.append(f)
2149
2150 # for DailyMotion
2151 progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2152 if progressive_uri:
2153 http_f = f.copy()
2154 del http_f['manifest_url']
2155 http_f.update({
2156 'format_id': f['format_id'].replace('hls-', 'http-'),
2157 'protocol': 'http',
2158 'url': progressive_uri,
2159 })
2160 formats.append(http_f)
2161
2162 last_stream_inf = {}
2163 return formats
2164
2165 @staticmethod
2166 def _xpath_ns(path, namespace=None):
2167 if not namespace:
2168 return path
2169 out = []
2170 for c in path.split('/'):
2171 if not c or c == '.':
2172 out.append(c)
2173 else:
2174 out.append('{%s}%s' % (namespace, c))
2175 return '/'.join(out)
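# For illustration: _xpath_ns('./head/meta', 'http://www.w3.org/ns/SMIL')
# returns './{http://www.w3.org/ns/SMIL}head/{http://www.w3.org/ns/SMIL}meta'
# ('.' components are kept as-is).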
2176
2177 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2178 smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2179
2180 if smil is False:
2181 assert not fatal
2182 return []
2183
2184 namespace = self._parse_smil_namespace(smil)
2185
2186 return self._parse_smil_formats(
2187 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2188
2189 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2190 smil = self._download_smil(smil_url, video_id, fatal=fatal)
2191 if smil is False:
2192 return {}
2193 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2194
2195 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2196 return self._download_xml(
2197 smil_url, video_id, 'Downloading SMIL file',
2198 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2199
2200 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2201 namespace = self._parse_smil_namespace(smil)
2202
2203 formats = self._parse_smil_formats(
2204 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2205 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2206
2207 video_id = os.path.splitext(url_basename(smil_url))[0]
2208 title = None
2209 description = None
2210 upload_date = None
2211 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2212 name = meta.attrib.get('name')
2213 content = meta.attrib.get('content')
2214 if not name or not content:
2215 continue
2216 if not title and name == 'title':
2217 title = content
2218 elif not description and name in ('description', 'abstract'):
2219 description = content
2220 elif not upload_date and name == 'date':
2221 upload_date = unified_strdate(content)
2222
2223 thumbnails = [{
2224 'id': image.get('type'),
2225 'url': image.get('src'),
2226 'width': int_or_none(image.get('width')),
2227 'height': int_or_none(image.get('height')),
2228 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2229
2230 return {
2231 'id': video_id,
2232 'title': title or video_id,
2233 'description': description,
2234 'upload_date': upload_date,
2235 'thumbnails': thumbnails,
2236 'formats': formats,
2237 'subtitles': subtitles,
2238 }
2239
2240 def _parse_smil_namespace(self, smil):
2241 return self._search_regex(
2242 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2243
2244 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2245 base = smil_url
2246 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2247 b = meta.get('base') or meta.get('httpBase')
2248 if b:
2249 base = b
2250 break
2251
2252 formats = []
2253 rtmp_count = 0
2254 http_count = 0
2255 m3u8_count = 0
2256
2257 srcs = []
2258 media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2259 for medium in media:
2260 src = medium.get('src')
2261 if not src or src in srcs:
2262 continue
2263 srcs.append(src)
2264
2265 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2266 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2267 width = int_or_none(medium.get('width'))
2268 height = int_or_none(medium.get('height'))
2269 proto = medium.get('proto')
2270 ext = medium.get('ext')
2271 src_ext = determine_ext(src)
2272 streamer = medium.get('streamer') or base
2273
2274 if proto == 'rtmp' or streamer.startswith('rtmp'):
2275 rtmp_count += 1
2276 formats.append({
2277 'url': streamer,
2278 'play_path': src,
2279 'ext': 'flv',
2280 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2281 'tbr': bitrate,
2282 'filesize': filesize,
2283 'width': width,
2284 'height': height,
2285 })
2286 if transform_rtmp_url:
2287 streamer, src = transform_rtmp_url(streamer, src)
2288 formats[-1].update({
2289 'url': streamer,
2290 'play_path': src,
2291 })
2292 continue
2293
2294 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2295 src_url = src_url.strip()
2296
2297 if proto == 'm3u8' or src_ext == 'm3u8':
2298 m3u8_formats = self._extract_m3u8_formats(
2299 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2300 if len(m3u8_formats) == 1:
2301 m3u8_count += 1
2302 m3u8_formats[0].update({
2303 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2304 'tbr': bitrate,
2305 'width': width,
2306 'height': height,
2307 })
2308 formats.extend(m3u8_formats)
2309 elif src_ext == 'f4m':
2310 f4m_url = src_url
2311 if not f4m_params:
2312 f4m_params = {
2313 'hdcore': '3.2.0',
2314 'plugin': 'flowplayer-3.2.0.1',
2315 }
2316 f4m_url += '&' if '?' in f4m_url else '?'
2317 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2318 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2319 elif src_ext == 'mpd':
2320 formats.extend(self._extract_mpd_formats(
2321 src_url, video_id, mpd_id='dash', fatal=False))
2322 elif re.search(r'\.ism/[Mm]anifest', src_url):
2323 formats.extend(self._extract_ism_formats(
2324 src_url, video_id, ism_id='mss', fatal=False))
2325 elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2326 http_count += 1
2327 formats.append({
2328 'url': src_url,
2329 'ext': ext or src_ext or 'flv',
2330 'format_id': 'http-%d' % (bitrate or http_count),
2331 'tbr': bitrate,
2332 'filesize': filesize,
2333 'width': width,
2334 'height': height,
2335 })
2336
2337 return formats
2338
2339 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2340 urls = []
2341 subtitles = {}
2342 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2343 src = textstream.get('src')
2344 if not src or src in urls:
2345 continue
2346 urls.append(src)
2347 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2348 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2349 subtitles.setdefault(lang, []).append({
2350 'url': src,
2351 'ext': ext,
2352 })
2353 return subtitles
2354
2355 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2356 xspf = self._download_xml(
2357 xspf_url, playlist_id, 'Downloading xspf playlist',
2358 'Unable to download xspf playlist', fatal=fatal)
2359 if xspf is False:
2360 return []
2361 return self._parse_xspf(
2362 xspf, playlist_id, xspf_url=xspf_url,
2363 xspf_base_url=base_url(xspf_url))
2364
2365 def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2366 NS_MAP = {
2367 'xspf': 'http://xspf.org/ns/0/',
2368 's1': 'http://static.streamone.nl/player/ns/0',
2369 }
2370
2371 entries = []
2372 for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2373 title = xpath_text(
2374 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2375 description = xpath_text(
2376 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2377 thumbnail = xpath_text(
2378 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2379 duration = float_or_none(
2380 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2381
2382 formats = []
2383 for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2384 format_url = urljoin(xspf_base_url, location.text)
2385 if not format_url:
2386 continue
2387 formats.append({
2388 'url': format_url,
2389 'manifest_url': xspf_url,
2390 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2391 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2392 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2393 })
2394 self._sort_formats(formats)
2395
2396 entries.append({
2397 'id': playlist_id,
2398 'title': title,
2399 'description': description,
2400 'thumbnail': thumbnail,
2401 'duration': duration,
2402 'formats': formats,
2403 })
2404 return entries
2405
2406 def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2407 res = self._download_xml_handle(
2408 mpd_url, video_id,
2409 note=note or 'Downloading MPD manifest',
2410 errnote=errnote or 'Failed to download MPD manifest',
2411 fatal=fatal, data=data, headers=headers, query=query)
2412 if res is False:
2413 return []
2414 mpd_doc, urlh = res
2415 if mpd_doc is None:
2416 return []
2417 mpd_base_url = base_url(urlh.geturl())
2418
2419 return self._parse_mpd_formats(
2420 mpd_doc, mpd_id, mpd_base_url, mpd_url)
2421
2422 def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2423 """
2424 Parse formats from MPD manifest.
2425 References:
2426 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2427 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2428 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2429 """
2430 if not self._downloader.params.get('dynamic_mpd', True):
2431 if mpd_doc.get('type') == 'dynamic':
2432 return []
2433
2434 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2435
2436 def _add_ns(path):
2437 return self._xpath_ns(path, namespace)
2438
2439 def is_drm_protected(element):
2440 return element.find(_add_ns('ContentProtection')) is not None
2441
2442 def extract_multisegment_info(element, ms_parent_info):
2443 ms_info = ms_parent_info.copy()
2444
2445 # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2446 # common attributes and elements; we only extract the ones relevant
2447 # to us.
2448 def extract_common(source):
2449 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2450 if segment_timeline is not None:
2451 s_e = segment_timeline.findall(_add_ns('S'))
2452 if s_e:
2453 ms_info['total_number'] = 0
2454 ms_info['s'] = []
2455 for s in s_e:
2456 r = int(s.get('r', 0))
2457 ms_info['total_number'] += 1 + r
2458 ms_info['s'].append({
2459 't': int(s.get('t', 0)),
2460 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2461 'd': int(s.attrib['d']),
2462 'r': r,
2463 })
2464 start_number = source.get('startNumber')
2465 if start_number:
2466 ms_info['start_number'] = int(start_number)
2467 timescale = source.get('timescale')
2468 if timescale:
2469 ms_info['timescale'] = int(timescale)
2470 segment_duration = source.get('duration')
2471 if segment_duration:
2472 ms_info['segment_duration'] = float(segment_duration)
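# Worked example (hypothetical manifest): a SegmentTimeline entry
#   <S t="0" d="180000" r="2"/>
# with timescale="90000" contributes 1 + r = 3 segments of 2 seconds each
# and stores {'t': 0, 'd': 180000, 'r': 2} in ms_info['s'] for the
# fragment expansion further below.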
2473
2474 def extract_Initialization(source):
2475 initialization = source.find(_add_ns('Initialization'))
2476 if initialization is not None:
2477 ms_info['initialization_url'] = initialization.attrib['sourceURL']
2478
2479 segment_list = element.find(_add_ns('SegmentList'))
2480 if segment_list is not None:
2481 extract_common(segment_list)
2482 extract_Initialization(segment_list)
2483 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2484 if segment_urls_e:
2485 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2486 else:
2487 segment_template = element.find(_add_ns('SegmentTemplate'))
2488 if segment_template is not None:
2489 extract_common(segment_template)
2490 media = segment_template.get('media')
2491 if media:
2492 ms_info['media'] = media
2493 initialization = segment_template.get('initialization')
2494 if initialization:
2495 ms_info['initialization'] = initialization
2496 else:
2497 extract_Initialization(segment_template)
2498 return ms_info
2499
2500 skip_unplayable = not self._downloader.params.get('allow_unplayable_formats')
2501
2502 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2503 formats = []
2504 for period in mpd_doc.findall(_add_ns('Period')):
2505 period_duration = parse_duration(period.get('duration')) or mpd_duration
2506 period_ms_info = extract_multisegment_info(period, {
2507 'start_number': 1,
2508 'timescale': 1,
2509 })
2510 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2511 if skip_unplayable and is_drm_protected(adaptation_set):
2512 continue
2513 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2514 for representation in adaptation_set.findall(_add_ns('Representation')):
2515 if skip_unplayable and is_drm_protected(representation):
2516 continue
2517 representation_attrib = adaptation_set.attrib.copy()
2518 representation_attrib.update(representation.attrib)
2519 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2520 mime_type = representation_attrib['mimeType']
2521 content_type = mime_type.split('/')[0]
2522 if content_type == 'text':
2523 # TODO implement WebVTT downloading
2524 pass
2525 elif content_type in ('video', 'audio'):
2526 base_url = ''
2527 for element in (representation, adaptation_set, period, mpd_doc):
2528 base_url_e = element.find(_add_ns('BaseURL'))
2529 if base_url_e is not None:
2530 base_url = base_url_e.text + base_url
2531 if re.match(r'^https?://', base_url):
2532 break
2533 if mpd_base_url and not re.match(r'^https?://', base_url):
2534 if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
2535 mpd_base_url += '/'
2536 base_url = mpd_base_url + base_url
2537 representation_id = representation_attrib.get('id')
2538 lang = representation_attrib.get('lang')
2539 url_el = representation.find(_add_ns('BaseURL'))
2540 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2541 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2542 f = {
2543 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
2544 'manifest_url': mpd_url,
2545 'ext': mimetype2ext(mime_type),
2546 'width': int_or_none(representation_attrib.get('width')),
2547 'height': int_or_none(representation_attrib.get('height')),
2548 'tbr': float_or_none(bandwidth, 1000),
2549 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2550 'fps': int_or_none(representation_attrib.get('frameRate')),
2551 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2552 'format_note': 'DASH %s' % content_type,
2553 'filesize': filesize,
2554 'container': mimetype2ext(mime_type) + '_dash',
2555 }
2556 f.update(parse_codecs(representation_attrib.get('codecs')))
2557 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2558
2559 def prepare_template(template_name, identifiers):
2560 tmpl = representation_ms_info[template_name]
2561 # First of all, % characters outside $...$ templates
2562 # must be escaped by doubling for proper processing
2563 # by % operator string formatting used further (see
2564 # https://github.com/ytdl-org/youtube-dl/issues/16867).
2565 t = ''
2566 in_template = False
2567 for c in tmpl:
2568 t += c
2569 if c == '$':
2570 in_template = not in_template
2571 elif c == '%' and not in_template:
2572 t += c
2573 # Next, $...$ templates are translated to their
2574 # %(...) counterparts to be used with % operator
2575 t = t.replace('$RepresentationID$', representation_id)
2576 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2577 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2578 t = t.replace('$$', '$')  # str.replace returns a new string; without the reassignment '$$' was never unescaped
2579 return t
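# Worked example (hypothetical template): with representation_id 'v1',
# '$RepresentationID$/seg-$Number%05d$.m4s' becomes
# 'v1/seg-%(Number)05d.m4s', so rendering it with {'Number': 3} gives
# 'v1/seg-00003.m4s'.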
2580
2581 # @initialization is a regular template just like the @media one,
2582 # so it should be handled the same way (see
2583 # https://github.com/ytdl-org/youtube-dl/issues/11605)
2584 if 'initialization' in representation_ms_info:
2585 initialization_template = prepare_template(
2586 'initialization',
2587 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2588 # $Time$ shall not be included for @initialization thus
2589 # only $Bandwidth$ remains
2590 ('Bandwidth', ))
2591 representation_ms_info['initialization_url'] = initialization_template % {
2592 'Bandwidth': bandwidth,
2593 }
2594
2595 def location_key(location):
2596 return 'url' if re.match(r'^https?://', location) else 'path'
2597
2598 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2599
2600 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2601 media_location_key = location_key(media_template)
2602
2603 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2604 # can't be used at the same time
2605 if '%(Number' in media_template and 's' not in representation_ms_info:
2606 segment_duration = None
2607 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2608 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2609 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2610 representation_ms_info['fragments'] = [{
2611 media_location_key: media_template % {
2612 'Number': segment_number,
2613 'Bandwidth': bandwidth,
2614 },
2615 'duration': segment_duration,
2616 } for segment_number in range(
2617 representation_ms_info['start_number'],
2618 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2619 else:
2620 # $Number*$ or $Time$ in media template with S list available
2621 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2622 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2623 representation_ms_info['fragments'] = []
2624 segment_time = 0
2625 segment_d = None
2626 segment_number = representation_ms_info['start_number']
2627
2628 def add_segment_url():
2629 segment_url = media_template % {
2630 'Time': segment_time,
2631 'Bandwidth': bandwidth,
2632 'Number': segment_number,
2633 }
2634 representation_ms_info['fragments'].append({
2635 media_location_key: segment_url,
2636 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2637 })
2638
2639 for num, s in enumerate(representation_ms_info['s']):
2640 segment_time = s.get('t') or segment_time
2641 segment_d = s['d']
2642 add_segment_url()
2643 segment_number += 1
2644 for r in range(s.get('r', 0)):
2645 segment_time += segment_d
2646 add_segment_url()
2647 segment_number += 1
2648 segment_time += segment_d
2649 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2650 # No media template
2651 # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2652 # or any YouTube dashsegments video
2653 fragments = []
2654 segment_index = 0
2655 timescale = representation_ms_info['timescale']
2656 for s in representation_ms_info['s']:
2657 duration = float_or_none(s['d'], timescale)
2658 for r in range(s.get('r', 0) + 1):
2659 segment_uri = representation_ms_info['segment_urls'][segment_index]
2660 fragments.append({
2661 location_key(segment_uri): segment_uri,
2662 'duration': duration,
2663 })
2664 segment_index += 1
2665 representation_ms_info['fragments'] = fragments
2666 elif 'segment_urls' in representation_ms_info:
2667 # Segment URLs with no SegmentTimeline
2668 # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2669 # https://github.com/ytdl-org/youtube-dl/pull/14844
2670 fragments = []
2671 segment_duration = float_or_none(
2672 representation_ms_info['segment_duration'],
2673 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2674 for segment_url in representation_ms_info['segment_urls']:
2675 fragment = {
2676 location_key(segment_url): segment_url,
2677 }
2678 if segment_duration:
2679 fragment['duration'] = segment_duration
2680 fragments.append(fragment)
2681 representation_ms_info['fragments'] = fragments
2682 # If a fragments key is available, then we correctly recognized fragmented media.
2683 # Otherwise we assume unfragmented media with direct access. Technically, that
2684 # assumption is not necessarily correct, since we may simply not yet support
2685 # some forms of fragmented media renditions, but for now we use this fallback.
2686 if 'fragments' in representation_ms_info:
2687 f.update({
2688 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2689 'url': mpd_url or base_url,
2690 'fragment_base_url': base_url,
2691 'fragments': [],
2692 'protocol': 'http_dash_segments',
2693 })
2694 if 'initialization_url' in representation_ms_info:
2695 initialization_url = representation_ms_info['initialization_url']
2696 if not f.get('url'):
2697 f['url'] = initialization_url
2698 f['fragments'].append({location_key(initialization_url): initialization_url})
2699 f['fragments'].extend(representation_ms_info['fragments'])
2700 else:
2701 # Assuming direct URL to unfragmented media.
2702 f['url'] = base_url
2703 formats.append(f)
2704 else:
2705 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2706 return formats
2707
2708 def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2709 res = self._download_xml_handle(
2710 ism_url, video_id,
2711 note=note or 'Downloading ISM manifest',
2712 errnote=errnote or 'Failed to download ISM manifest',
2713 fatal=fatal, data=data, headers=headers, query=query)
2714 if res is False:
2715 return []
2716 ism_doc, urlh = res
2717 if ism_doc is None:
2718 return []
2719
2720 return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
2721
2722 def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
2723 """
2724 Parse formats from ISM manifest.
2725 References:
2726 1. [MS-SSTR]: Smooth Streaming Protocol,
2727 https://msdn.microsoft.com/en-us/library/ff469518.aspx
2728 """
2729 if ism_doc.get('IsLive') == 'TRUE':
2730 return []
2731 if (not self._downloader.params.get('allow_unplayable_formats')
2732 and ism_doc.find('Protection') is not None):
2733 return []
2734
2735 duration = int(ism_doc.attrib['Duration'])
2736 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2737
2738 formats = []
2739 for stream in ism_doc.findall('StreamIndex'):
2740 stream_type = stream.get('Type')
2741 if stream_type not in ('video', 'audio'):
2742 continue
2743 url_pattern = stream.attrib['Url']
2744 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2745 stream_name = stream.get('Name')
2746 for track in stream.findall('QualityLevel'):
2747 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
2748 # TODO: add support for WVC1 and WMAP
2749 if fourcc not in ('H264', 'AVC1', 'AACL'):
2750 self.report_warning('%s is not a supported codec' % fourcc)
2751 continue
2752 tbr = int(track.attrib['Bitrate']) // 1000
2753 # [1] does not mention Width and Height attributes. However,
2754 # they're often present while MaxWidth and MaxHeight are
2755 # missing, so they should be used as fallbacks
2756 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2757 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2758 sampling_rate = int_or_none(track.get('SamplingRate'))
2759
2760 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2761 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2762
2763 fragments = []
2764 fragment_ctx = {
2765 'time': 0,
2766 }
2767 stream_fragments = stream.findall('c')
2768 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2769 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2770 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2771 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2772 if not fragment_ctx['duration']:
2773 try:
2774 next_fragment_time = int(stream_fragments[stream_fragment_index + 1].attrib['t'])  # index the fragment list, not the current element
2775 except IndexError:
2776 next_fragment_time = duration
2777 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2778 for _ in range(fragment_repeat):
2779 fragments.append({
2780 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2781 'duration': fragment_ctx['duration'] / stream_timescale,
2782 })
2783 fragment_ctx['time'] += fragment_ctx['duration']
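# Worked example (hypothetical manifest): <c t="0" d="20000000" r="2"/>
# with the default timescale of 10000000 appends two fragments of 2.0
# seconds each (this code treats r as the total fragment count).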
2784
2785 format_id = []
2786 if ism_id:
2787 format_id.append(ism_id)
2788 if stream_name:
2789 format_id.append(stream_name)
2790 format_id.append(compat_str(tbr))
2791
2792 formats.append({
2793 'format_id': '-'.join(format_id),
2794 'url': ism_url,
2795 'manifest_url': ism_url,
2796 'ext': 'ismv' if stream_type == 'video' else 'isma',
2797 'width': width,
2798 'height': height,
2799 'tbr': tbr,
2800 'asr': sampling_rate,
2801 'vcodec': 'none' if stream_type == 'audio' else fourcc,
2802 'acodec': 'none' if stream_type == 'video' else fourcc,
2803 'protocol': 'ism',
2804 'fragments': fragments,
2805 '_download_params': {
2806 'duration': duration,
2807 'timescale': stream_timescale,
2808 'width': width or 0,
2809 'height': height or 0,
2810 'fourcc': fourcc,
2811 'codec_private_data': track.get('CodecPrivateData'),
2812 'sampling_rate': sampling_rate,
2813 'channels': int_or_none(track.get('Channels', 2)),
2814 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2815 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2816 },
2817 })
2818 return formats
2819
2820 def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
2821 def absolute_url(item_url):
2822 return urljoin(base_url, item_url)
2823
2824 def parse_content_type(content_type):
2825 if not content_type:
2826 return {}
2827 ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2828 if ctr:
2829 mimetype, codecs = ctr.groups()
2830 f = parse_codecs(codecs)
2831 f['ext'] = mimetype2ext(mimetype)
2832 return f
2833 return {}
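# For illustration: parse_content_type('video/mp4; codecs="avc1.4d401e, mp4a.40.2"')
# returns {'vcodec': 'avc1.4d401e', 'acodec': 'mp4a.40.2', 'ext': 'mp4'},
# assuming parse_codecs classifies these common codec ids as usual.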
2834
2835 def _media_formats(src, cur_media_type, type_info={}):
2836 full_url = absolute_url(src)
2837 ext = type_info.get('ext') or determine_ext(full_url)
2838 if ext == 'm3u8':
2839 is_plain_url = False
2840 formats = self._extract_m3u8_formats(
2841 full_url, video_id, ext='mp4',
2842 entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2843 preference=preference, quality=quality, fatal=False)
2844 elif ext == 'mpd':
2845 is_plain_url = False
2846 formats = self._extract_mpd_formats(
2847 full_url, video_id, mpd_id=mpd_id, fatal=False)
2848 else:
2849 is_plain_url = True
2850 formats = [{
2851 'url': full_url,
2852 'vcodec': 'none' if cur_media_type == 'audio' else None,
2853 }]
2854 return is_plain_url, formats
2855
2856 entries = []
2857 # amp-video and amp-audio are very similar to their HTML5 counterparts
2858 # so we will include them right here (see
2859 # https://www.ampproject.org/docs/reference/components/amp-video)
2860 # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
2861 _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
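# For illustration, this pattern matches plain <video>/<audio> as well as
# <amp-video>, <amp-audio>, <dl8-video> and <dl8-live-video> tags.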
2862 media_tags = [(media_tag, media_tag_name, media_type, '')
2863 for media_tag, media_tag_name, media_type
2864 in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
2865 media_tags.extend(re.findall(
2866 # We only allow video|audio followed by whitespace or '>'.
2867 # Allowing more characters may result in a significant slowdown (see
2868 # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
2869 # http://www.porntrex.com/maps/videositemap.xml).
2870 r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
2871 for media_tag, _, media_type, media_content in media_tags:
2872 media_info = {
2873 'formats': [],
2874 'subtitles': {},
2875 }
2876 media_attributes = extract_attributes(media_tag)
2877 src = strip_or_none(media_attributes.get('src'))
2878 if src:
2879 _, formats = _media_formats(src, media_type)
2880 media_info['formats'].extend(formats)
2881 media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
2882 if media_content:
2883 for source_tag in re.findall(r'<source[^>]+>', media_content):
2884 s_attr = extract_attributes(source_tag)
2885 # data-video-src and data-src are non-standard but seen
2886 # several times in the wild
2887 src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
2888 if not src:
2889 continue
2890 f = parse_content_type(s_attr.get('type'))
2891 is_plain_url, formats = _media_formats(src, media_type, f)
2892 if is_plain_url:
2893 # width, height, res, label and title attributes are
2894 # all non-standard but seen several times in the wild
2895 labels = [
2896 s_attr.get(lbl)
2897 for lbl in ('label', 'title')
2898 if str_or_none(s_attr.get(lbl))
2899 ]
2900 width = int_or_none(s_attr.get('width'))
2901 height = (int_or_none(s_attr.get('height'))
2902 or int_or_none(s_attr.get('res')))
2903 if not width or not height:
2904 for lbl in labels:
2905 resolution = parse_resolution(lbl)
2906 if not resolution:
2907 continue
2908 width = width or resolution.get('width')
2909 height = height or resolution.get('height')
2910 for lbl in labels:
2911 tbr = parse_bitrate(lbl)
2912 if tbr:
2913 break
2914 else:
2915 tbr = None
2916 f.update({
2917 'width': width,
2918 'height': height,
2919 'tbr': tbr,
2920 'format_id': s_attr.get('label') or s_attr.get('title'),
2921 })
2922 f.update(formats[0])
2923 media_info['formats'].append(f)
2924 else:
2925 media_info['formats'].extend(formats)
2926 for track_tag in re.findall(r'<track[^>]+>', media_content):
2927 track_attributes = extract_attributes(track_tag)
2928 kind = track_attributes.get('kind')
2929 if not kind or kind in ('subtitles', 'captions'):
2930 src = strip_or_none(track_attributes.get('src'))
2931 if not src:
2932 continue
2933 lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
2934 media_info['subtitles'].setdefault(lang, []).append({
2935 'url': absolute_url(src),
2936 })
2937 for f in media_info['formats']:
2938 f.setdefault('http_headers', {})['Referer'] = base_url
2939 if media_info['formats'] or media_info['subtitles']:
2940 entries.append(media_info)
2941 return entries
2942
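# Usage sketch (hypothetical page contents): for a page containing
#
#   <video poster="/img/poster.jpg">
#     <source src="/media/clip-720.mp4" type="video/mp4" label="720p">
#     <track kind="subtitles" srclang="en" src="/media/clip.en.vtt">
#   </video>
#
# a call such as
#
#   entries = self._parse_html5_media_entries(url, webpage, video_id)
#
# would yield one entry with a single progressive MP4 format (height 720
# recovered from the label), the absolutized poster as thumbnail and an
# 'en' subtitle track.
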
2943 def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2944 signed = 'hdnea=' in manifest_url
2945 if not signed:
2946 # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
2947 manifest_url = re.sub(
2948 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
2949 '', manifest_url).strip('?')
2950
2951 formats = []
2952
2953 hdcore_sign = 'hdcore=3.7.0'
2954 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2955 hds_host = hosts.get('hds')
2956 if hds_host:
2957 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2958 if 'hdcore=' not in f4m_url:
2959 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2960 f4m_formats = self._extract_f4m_formats(
2961 f4m_url, video_id, f4m_id='hds', fatal=False)
2962 for entry in f4m_formats:
2963 entry.update({'extra_param_to_segment_url': hdcore_sign})
2964 formats.extend(f4m_formats)
2965
2966 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2967 hls_host = hosts.get('hls')
2968 if hls_host:
2969 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2970 m3u8_formats = self._extract_m3u8_formats(
2971 m3u8_url, video_id, 'mp4', 'm3u8_native',
2972 m3u8_id='hls', fatal=False)
2973 formats.extend(m3u8_formats)
2974
2975 http_host = hosts.get('http')
2976 if http_host and m3u8_formats and not signed:
2977 REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
2978 qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
2979 qualities_length = len(qualities)
2980 if len(m3u8_formats) in (qualities_length, qualities_length + 1):
2981 i = 0
2982 for f in m3u8_formats:
2983 if f['vcodec'] != 'none':
2984 for protocol in ('http', 'https'):
2985 http_f = f.copy()
2986 del http_f['manifest_url']
2987 http_url = re.sub(
2988 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
2989 http_f.update({
2990 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
2991 'url': http_url,
2992 'protocol': protocol,
2993 })
2994 formats.append(http_f)
2995 i += 1
2996
2997 return formats
2998
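# Usage sketch (hypothetical hosts/URL): for an unsigned HLS manifest like
#
#   https://example-vh.akamaihd.net/i/video/clip_,360,720,.mp4.csmil/master.m3u8
#
# the HDS variant is derived from the same path (/i/ -> /z/, master.m3u8 ->
# manifest.f4m), and with hosts={'http': 'example-a.akamaihd.net'} the
# per-quality progressive URLs
#
#   https://example-a.akamaihd.net/video/clip_360.mp4
#   https://example-a.akamaihd.net/video/clip_720.mp4
#
# are synthesized from the corresponding HLS formats.
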
2999 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3000 query = compat_urlparse.urlparse(url).query
3001 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3002 mobj = re.search(
3003 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3004 url_base = mobj.group('url')
3005 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3006 formats = []
3007
3008 def manifest_url(manifest):
3009 m_url = '%s/%s' % (http_base_url, manifest)
3010 if query:
3011 m_url += '?%s' % query
3012 return m_url
3013
3014 if 'm3u8' not in skip_protocols:
3015 formats.extend(self._extract_m3u8_formats(
3016 manifest_url('playlist.m3u8'), video_id, 'mp4',
3017 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3018 if 'f4m' not in skip_protocols:
3019 formats.extend(self._extract_f4m_formats(
3020 manifest_url('manifest.f4m'),
3021 video_id, f4m_id='hds', fatal=False))
3022 if 'dash' not in skip_protocols:
3023 formats.extend(self._extract_mpd_formats(
3024 manifest_url('manifest.mpd'),
3025 video_id, mpd_id='dash', fatal=False))
3026 if re.search(r'(?:/smil:|\.smil)', url_base):
3027 if 'smil' not in skip_protocols:
3028 rtmp_formats = self._extract_smil_formats(
3029 manifest_url('jwplayer.smil'),
3030 video_id, fatal=False)
3031 for rtmp_format in rtmp_formats:
3032 rtsp_format = rtmp_format.copy()
3033 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3034 del rtsp_format['play_path']
3035 del rtsp_format['ext']
3036 rtsp_format.update({
3037 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3038 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3039 'protocol': 'rtsp',
3040 })
3041 formats.extend([rtmp_format, rtsp_format])
3042 else:
3043 for protocol in ('rtmp', 'rtsp'):
3044 if protocol not in skip_protocols:
3045 formats.append({
3046 'url': '%s:%s' % (protocol, url_base),
3047 'format_id': protocol,
3048 'protocol': protocol,
3049 })
3050 return formats
3051
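# Usage sketch (hypothetical Wowza endpoint): the manifest/playlist suffix
# is stripped and each protocol-specific manifest is probed off the same
# base, so
#
#   formats = self._extract_wowza_formats(
#       'https://wowza.example.com/vod/mp4:clip.mp4/playlist.m3u8',
#       video_id, skip_protocols=['dash'])
#
# would try playlist.m3u8 (HLS) and manifest.f4m (HDS) and, since neither
# '/smil:' nor '.smil' appears in the URL, fall back to plain rtmp:// and
# rtsp:// formats.
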
3052 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3053 mobj = re.search(
3054 r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3055 webpage)
3056 if mobj:
3057 try:
3058 jwplayer_data = self._parse_json(mobj.group('options'),
3059 video_id=video_id,
3060 transform_source=transform_source)
3061 except ExtractorError:
3062 pass
3063 else:
3064 if isinstance(jwplayer_data, dict):
3065 return jwplayer_data
3066
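# Illustration (hypothetical embed): the regex above targets the classic
# inline setup call, e.g.
#
#   jwplayer("myplayer").setup({"playlist": [{"sources": [...]}]});
#
# The captured options are run through _parse_json with js_to_json, so
# relaxed JavaScript object literals also work; a result that is not a
# dict is discarded.
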
3067 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3068 jwplayer_data = self._find_jwplayer_data(
3069 webpage, video_id, transform_source=js_to_json)
3070 return self._parse_jwplayer_data(
3071 jwplayer_data, video_id, *args, **kwargs)
3072
3073 def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3074 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3075 # JWPlayer backward compatibility: flattened playlists
3076 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3077 if 'playlist' not in jwplayer_data:
3078 jwplayer_data = {'playlist': [jwplayer_data]}
3079
3080 entries = []
3081
3082 # JWPlayer backward compatibility: single playlist item
3083 # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3084 if not isinstance(jwplayer_data['playlist'], list):
3085 jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3086
3087 for video_data in jwplayer_data['playlist']:
3088 # JWPlayer backward compatibility: flattened sources
3089 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3090 if 'sources' not in video_data:
3091 video_data['sources'] = [video_data]
3092
3093 this_video_id = video_id or video_data['mediaid']
3094
3095 formats = self._parse_jwplayer_formats(
3096 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3097 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3098
3099 subtitles = {}
3100 tracks = video_data.get('tracks')
3101 if tracks and isinstance(tracks, list):
3102 for track in tracks:
3103 if not isinstance(track, dict):
3104 continue
3105 track_kind = track.get('kind')
3106 if not track_kind or not isinstance(track_kind, compat_str):
3107 continue
3108 if track_kind.lower() not in ('captions', 'subtitles'):
3109 continue
3110 track_url = urljoin(base_url, track.get('file'))
3111 if not track_url:
3112 continue
3113 subtitles.setdefault(track.get('label') or 'en', []).append({
3114 'url': self._proto_relative_url(track_url)
3115 })
3116
3117 entry = {
3118 'id': this_video_id,
3119 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3120 'description': clean_html(video_data.get('description')),
3121 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3122 'timestamp': int_or_none(video_data.get('pubdate')),
3123 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3124 'subtitles': subtitles,
3125 }
3126 # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3127 if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3128 entry.update({
3129 '_type': 'url_transparent',
3130 'url': formats[0]['url'],
3131 })
3132 else:
3133 self._sort_formats(formats)
3134 entry['formats'] = formats
3135 entries.append(entry)
3136 if len(entries) == 1:
3137 return entries[0]
3138 else:
3139 return self.playlist_result(entries)
3140
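# Usage sketch (hypothetical configs): legacy layouts are normalized to
# {'playlist': [...]} first, so both of these are accepted:
#
#   self._parse_jwplayer_data(
#       {'mediaid': 'abc123', 'title': 'Clip',
#        'file': 'https://cdn.example.com/clip.mp4'},
#       video_id)
#   self._parse_jwplayer_data(
#       {'playlist': [{'title': 'Clip',
#                      'sources': [{'file': '/videos/clip.m3u8'}]}]},
#       video_id, m3u8_id='hls', base_url=url)
#
# A single playlist item is returned as a plain info dict; several items
# become a playlist result.
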
3141 def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3142 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3143 urls = []
3144 formats = []
3145 for source in jwplayer_sources_data:
3146 if not isinstance(source, dict):
3147 continue
3148 source_url = urljoin(
3149 base_url, self._proto_relative_url(source.get('file')))
3150 if not source_url or source_url in urls:
3151 continue
3152 urls.append(source_url)
3153 source_type = source.get('type') or ''
3154 ext = mimetype2ext(source_type) or determine_ext(source_url)
3155 if source_type == 'hls' or ext == 'm3u8':
3156 formats.extend(self._extract_m3u8_formats(
3157 source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3158 m3u8_id=m3u8_id, fatal=False))
3159 elif source_type == 'dash' or ext == 'mpd':
3160 formats.extend(self._extract_mpd_formats(
3161 source_url, video_id, mpd_id=mpd_id, fatal=False))
3162 elif ext == 'smil':
3163 formats.extend(self._extract_smil_formats(
3164 source_url, video_id, fatal=False))
3165 # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3166 elif source_type.startswith('audio') or ext in (
3167 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3168 formats.append({
3169 'url': source_url,
3170 'vcodec': 'none',
3171 'ext': ext,
3172 })
3173 else:
3174 height = int_or_none(source.get('height'))
3175 if height is None:
3176 # Often no height is provided but there is a label in
3177 # a format like "1080p", "720p SD", or 1080.
3178 height = int_or_none(self._search_regex(
3179 r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3180 'height', default=None))
3181 a_format = {
3182 'url': source_url,
3183 'width': int_or_none(source.get('width')),
3184 'height': height,
3185 'tbr': int_or_none(source.get('bitrate')),
3186 'ext': ext,
3187 }
3188 if source_url.startswith('rtmp'):
3189 a_format['ext'] = 'flv'
3190 # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3191 # of jwplayer.flash.swf
3192 rtmp_url_parts = re.split(
3193 r'((?:mp4|mp3|flv):)', source_url, 1)
3194 if len(rtmp_url_parts) == 3:
3195 rtmp_url, prefix, play_path = rtmp_url_parts
3196 a_format.update({
3197 'url': rtmp_url,
3198 'play_path': prefix + play_path,
3199 })
3200 if rtmp_params:
3201 a_format.update(rtmp_params)
3202 formats.append(a_format)
3203 return formats
3204
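# Note on the RTMP split above (hypothetical URL): a source like
#
#   rtmp://media.example.com/vod/mp4:videos/clip.mp4
#
# is split at the first 'mp4:'/'mp3:'/'flv:' marker into
# url='rtmp://media.example.com/vod/' plus
# play_path='mp4:videos/clip.mp4', mirroring jwplayer's Flash RTMP
# provider.
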
3205 def _live_title(self, name):
3206 """ Generate the title for a live video """
3207 now = datetime.datetime.now()
3208 now_str = now.strftime('%Y-%m-%d %H:%M')
3209 return name + ' ' + now_str
3210
3211 def _int(self, v, name, fatal=False, **kwargs):
3212 res = int_or_none(v, **kwargs)
3215 if res is None:
3216 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3217 if fatal:
3218 raise ExtractorError(msg)
3219 else:
3220 self.report_warning(msg)
3221 return res
3222
3223 def _float(self, v, name, fatal=False, **kwargs):
3224 res = float_or_none(v, **kwargs)
3225 if res is None:
3226 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3227 if fatal:
3228 raise ExtractorError(msg)
3229 else:
3230 self.report_warning(msg)
3231 return res
3232
3233 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3234 path='/', secure=False, discard=False, rest={}, **kwargs):
3235 cookie = compat_cookiejar_Cookie(
3236 0, name, value, port, port is not None, domain, True,
3237 domain.startswith('.'), path, True, secure, expire_time,
3238 discard, None, None, rest)
3239 self._downloader.cookiejar.set_cookie(cookie)
3240
3241 def _get_cookies(self, url):
3242 """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3243 req = sanitized_Request(url)
3244 self._downloader.cookiejar.add_cookie_header(req)
3245 return compat_cookies_SimpleCookie(req.get_header('Cookie'))
3246
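# Usage sketch (hypothetical cookie): extractors typically pair these as
#
#   self._set_cookie('.example.com', 'CONSENT', 'YES+')
#   consent = self._get_cookies('https://www.example.com/').get('CONSENT')
#
# where the result is a Morsel (or None). _get_cookies round-trips through
# a fake request so the cookiejar's own domain/path matching decides which
# cookies apply to the URL.
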
3247 def _apply_first_set_cookie_header(self, url_handle, cookie):
3248 """
3249 Apply first Set-Cookie header instead of the last. Experimental.
3250
3251 Some sites (e.g. [1-3]) may serve two cookies under the same name
3252 in the Set-Cookie header and expect the first (old) one to be set
3253 rather than the second (new) one. However, per RFC 6265 it is the
3254 newer cookie that should be set into the cookie store, and that is
3255 what actually happens. We work around this issue by manually
3256 resetting the cookie to the first one.
3257 1. https://new.vk.com/
3258 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3259 3. https://learning.oreilly.com/
3260 """
3261 for header, cookies in url_handle.headers.items():
3262 if header.lower() != 'set-cookie':
3263 continue
3264 if sys.version_info[0] >= 3:
3265 cookies = cookies.encode('iso-8859-1')
3266 cookies = cookies.decode('utf-8')
3267 cookie_value = re.search(
3268 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3269 if cookie_value:
3270 value, domain = cookie_value.groups()
3271 self._set_cookie(domain, cookie, value)
3272 break
3273
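# Usage sketch (hypothetical cookie name): after a request that answered
# with two Set-Cookie headers for 'remixlang', an extractor could call
#
#   urlh = self._request_webpage(url, video_id)
#   self._apply_first_set_cookie_header(urlh, 'remixlang')
#
# to overwrite the stored (second) value with the first one.
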
3274 def get_testcases(self, include_onlymatching=False):
3275 t = getattr(self, '_TEST', None)
3276 if t:
3277 assert not hasattr(self, '_TESTS'), \
3278 '%s has _TEST and _TESTS' % type(self).__name__
3279 tests = [t]
3280 else:
3281 tests = getattr(self, '_TESTS', [])
3282 for t in tests:
3283 if not include_onlymatching and t.get('only_matching', False):
3284 continue
3285 t['name'] = type(self).__name__[:-len('IE')]
3286 yield t
3287
3288 def is_suitable(self, age_limit):
3289 """ Test whether the extractor is generally suitable for the given
3290 age limit (i.e. pornographic sites are not, all others usually are) """
3291
3292 any_restricted = False
3293 for tc in self.get_testcases(include_onlymatching=False):
3294 if tc.get('playlist', []):
3295 tc = tc['playlist'][0]
3296 is_restricted = age_restricted(
3297 tc.get('info_dict', {}).get('age_limit'), age_limit)
3298 if not is_restricted:
3299 return True
3300 any_restricted = any_restricted or is_restricted
3301 return not any_restricted
3302
3303 def extract_subtitles(self, *args, **kwargs):
3304 if (self._downloader.params.get('writesubtitles', False)
3305 or self._downloader.params.get('listsubtitles')):
3306 return self._get_subtitles(*args, **kwargs)
3307 return {}
3308
3309 def _get_subtitles(self, *args, **kwargs):
3310 raise NotImplementedError('This method must be implemented by subclasses')
3311
3312 @staticmethod
3313 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3314 """ Merge subtitle items for one language. Items with duplicated URLs
3315 will be dropped. """
3316 list1_urls = {item['url'] for item in subtitle_list1}
3317 ret = list(subtitle_list1)
3318 ret.extend(item for item in subtitle_list2 if item['url'] not in list1_urls)
3319 return ret
3320
3321 @classmethod
3322 def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
3323 """ Merge two subtitle dictionaries, language by language. """
3324 ret = dict(subtitle_dict1)
3325 for lang in subtitle_dict2:
3326 ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
3327 return ret
3328
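# Worked example (hypothetical data): merging
#
#   {'en': [{'url': 'https://example.com/en.vtt'}]}
#
# with
#
#   {'en': [{'url': 'https://example.com/en.vtt'}],
#    'de': [{'url': 'https://example.com/de.vtt'}]}
#
# keeps a single 'en' item (the duplicate URL is dropped) and adds 'de'.
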
3329 def extract_automatic_captions(self, *args, **kwargs):
3330 if (self._downloader.params.get('writeautomaticsub', False)
3331 or self._downloader.params.get('listsubtitles')):
3332 return self._get_automatic_captions(*args, **kwargs)
3333 return {}
3334
3335 def _get_automatic_captions(self, *args, **kwargs):
3336 raise NotImplementedError('This method must be implemented by subclasses')
3337
3338 def mark_watched(self, *args, **kwargs):
3339 if (self._downloader.params.get('mark_watched', False)
3340 and (self._get_login_info()[0] is not None
3341 or self._downloader.params.get('cookiefile') is not None)):
3342 self._mark_watched(*args, **kwargs)
3343
3344 def _mark_watched(self, *args, **kwargs):
3345 raise NotImplementedError('This method must be implemented by subclasses')
3346
3347 def geo_verification_headers(self):
3348 headers = {}
3349 geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
3350 if geo_verification_proxy:
3351 headers['Ytdl-request-proxy'] = geo_verification_proxy
3352 return headers
3353
3354 def _generic_id(self, url):
3355 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3356
3357 def _generic_title(self, url):
3358 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3359
3360 @staticmethod
3361 def _availability(is_private, needs_premium, needs_subscription, needs_auth, is_unlisted):
3362 all_known = all(map(
3363 lambda x: x is not None,
3364 (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3365 return (
3366 'private' if is_private
3367 else 'premium_only' if needs_premium
3368 else 'subscriber_only' if needs_subscription
3369 else 'needs_auth' if needs_auth
3370 else 'unlisted' if is_unlisted
3371 else 'public' if all_known
3372 else None)
3373
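# Worked examples: the first truthy flag wins in the order listed above,
# and 'public' is only reported when every flag is known to be False:
#
#   self._availability(False, False, False, False, False)  # -> 'public'
#   self._availability(False, True, None, None, None)      # -> 'premium_only'
#   self._availability(False, False, False, False, None)   # -> None (unknown)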
3374
3375 class SearchInfoExtractor(InfoExtractor):
3376 """
3377 Base class for paged search query extractors.
3378 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
3379 Instances should define _SEARCH_KEY and _MAX_RESULTS.
3380 """
3381
3382 @classmethod
3383 def _make_valid_url(cls):
3384 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3385
3386 @classmethod
3387 def suitable(cls, url):
3388 return re.match(cls._make_valid_url(), url) is not None
3389
3390 def _real_extract(self, query):
3391 mobj = re.match(self._make_valid_url(), query)
3392 if mobj is None:
3393 raise ExtractorError('Invalid search query "%s"' % query)
3394
3395 prefix = mobj.group('prefix')
3396 query = mobj.group('query')
3397 if prefix == '':
3398 return self._get_n_results(query, 1)
3399 elif prefix == 'all':
3400 return self._get_n_results(query, self._MAX_RESULTS)
3401 else:
3402 n = int(prefix)
3403 if n <= 0:
3404 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
3405 elif n > self._MAX_RESULTS:
3406 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3407 n = self._MAX_RESULTS
3408 return self._get_n_results(query, n)
3409
3410 def _get_n_results(self, query, n):
3411 """Get a specified number of results for a query"""
3412 raise NotImplementedError('This method must be implemented by subclasses')
3413
3414 @property
3415 def SEARCH_KEY(self):
3416 return self._SEARCH_KEY
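
# A minimal concrete subclass might look like this (hypothetical site;
# besides the two class attributes only _get_n_results is mandatory):
#
#   class ExampleSearchIE(SearchInfoExtractor):
#       _SEARCH_KEY = 'examplesearch'
#       _MAX_RESULTS = 50
#
#       def _get_n_results(self, query, n):
#           entries = self._fetch_search_results(query, n)  # hypothetical helper
#           return self.playlist_result(entries, query)
#
# With this in place, 'examplesearch5:kittens' fetches the first 5 results
# and 'examplesearchall:kittens' up to _MAX_RESULTS.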