yt_dlp/extractor/common.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import base64
   5 import datetime
   6 import hashlib
   7 import json
   8 import netrc
   9 import os
  10 import random
  11 import re
  12 import sys
  13 import time
  14 import math
  15
  16 from ..compat import (
  17     compat_cookiejar_Cookie,
  18     compat_cookies_SimpleCookie,
  19     compat_etree_Element,
  20     compat_etree_fromstring,
  21     compat_getpass,
  22     compat_integer_types,
  23     compat_http_client,
  24     compat_os_name,
  25     compat_str,
  26     compat_urllib_error,
  27     compat_urllib_parse_unquote,
  28     compat_urllib_parse_urlencode,
  29     compat_urllib_request,
  30     compat_urlparse,
  31     compat_xml_parse_error,
  32 )
  33 from ..downloader import FileDownloader
  34 from ..downloader.f4m import (
  35     get_base_url,
  36     remove_encrypted_media,
  37 )
  38 from ..utils import (
  39     NO_DEFAULT,
  40     age_restricted,
  41     base_url,
  42     bug_reports_message,
  43     clean_html,
  44     compiled_regex_type,
  45     determine_ext,
  46     determine_protocol,
  47     dict_get,
  48     error_to_compat_str,
  49     ExtractorError,
  50     extract_attributes,
  51     fix_xml_ampersands,
  52     float_or_none,
  53     GeoRestrictedError,
  54     GeoUtils,
  55     int_or_none,
  56     js_to_json,
  57     JSON_LD_RE,
  58     mimetype2ext,
  59     network_exceptions,
  60     orderedSet,
  61     parse_bitrate,
  62     parse_codecs,
  63     parse_duration,
  64     parse_iso8601,
  65     parse_m3u8_attributes,
  66     parse_resolution,
  67     RegexNotFoundError,
  68     sanitized_Request,
  69     sanitize_filename,
  70     str_or_none,
  71     str_to_int,
  72     strip_or_none,
  73     unescapeHTML,
  74     unified_strdate,
  75     unified_timestamp,
  76     update_Request,
  77     update_url_query,
  78     urljoin,
  79     url_basename,
  80     url_or_none,
  81     xpath_element,
  82     xpath_text,
  83     xpath_with_ns,
  84 )
  85
  86
  87 class InfoExtractor(object):
  88     """Information Extractor class.
  89
  90     Information extractors are the classes that, given a URL, extract
  91     information about the video (or videos) the URL refers to. This
  92     information includes the real video URL, the video title, author and
  93     others. The information is stored in a dictionary which is then
  94     passed to the YoutubeDL. The YoutubeDL processes this
  95     information possibly downloading the video to the file system, among
  96     other possible outcomes.
  97
  98     The type field determines the type of the result.
  99     By far the most common value (and the default if _type is missing) is
 100     "video", which indicates a single video.
 101
 102     For a video, the dictionaries must include the following fields:
 103
 104     id:             Video identifier.
 105     title:          Video title, unescaped.
 106
 107     Additionally, it must contain either a formats entry or a url one:
 108
 109     formats:        A list of dictionaries for each format available, ordered
 110                     from worst to best quality.
 111
 112                     Potential fields:
 113                     * url        The mandatory URL representing the media:
 114                                    for plain file media - HTTP URL of this file,
 115                                    for RTMP - RTMP URL,
 116                                    for HLS - URL of the M3U8 media playlist,
 117                                    for HDS - URL of the F4M manifest,
 118                                    for DASH
 119                                      - HTTP URL to plain file media (in case of
 120                                        unfragmented media)
 121                                      - URL of the MPD manifest or base URL
 122                                        representing the media if MPD manifest
 123                                        is parsed from a string (in case of
 124                                        fragmented media)
 125                                    for MSS - URL of the ISM manifest.
 126                     * manifest_url
 127                                  The URL of the manifest file in case of
 128                                  fragmented media:
 129                                    for HLS - URL of the M3U8 master playlist,
 130                                    for HDS - URL of the F4M manifest,
 131                                    for DASH - URL of the MPD manifest,
 132                                    for MSS - URL of the ISM manifest.
 133                     * ext        Will be calculated from URL if missing
 134                     * format     A human-readable description of the format
 135                                  ("mp4 container with h264/opus").
  136                                  Calculated from the format_id, width, height,
 137                                  and format_note fields if missing.
 138                     * format_id  A short description of the format
 139                                  ("mp4_h264_opus" or "19").
 140                                 Technically optional, but strongly recommended.
 141                     * format_note Additional info about the format
 142                                  ("3D" or "DASH video")
 143                     * width      Width of the video, if known
 144                     * height     Height of the video, if known
 145                     * resolution Textual description of width and height
 146                     * tbr        Average bitrate of audio and video in KBit/s
 147                     * abr        Average audio bitrate in KBit/s
 148                     * acodec     Name of the audio codec in use
 149                     * asr        Audio sampling rate in Hertz
 150                     * vbr        Average video bitrate in KBit/s
 151                     * fps        Frame rate
 152                     * vcodec     Name of the video codec in use
 153                     * container  Name of the container format
 154                     * filesize   The number of bytes, if known in advance
 155                     * filesize_approx  An estimate for the number of bytes
 156                     * player_url SWF Player URL (used for rtmpdump).
 157                     * protocol   The protocol that will be used for the actual
 158                                  download, lower-case.
 159                                  "http", "https", "rtsp", "rtmp", "rtmp_ffmpeg", "rtmpe",
 160                                  "m3u8", "m3u8_native" or "http_dash_segments".
 161                     * fragment_base_url
 162                                  Base URL for fragments. Each fragment's path
 163                                  value (if present) will be relative to
 164                                  this URL.
 165                     * fragments  A list of fragments of a fragmented media.
 166                                  Each fragment entry must contain either an url
 167                                  or a path. If an url is present it should be
 168                                  considered by a client. Otherwise both path and
 169                                  fragment_base_url must be present. Here is
 170                                  the list of all potential fields:
 171                                  * "url" - fragment's URL
 172                                  * "path" - fragment's path relative to
 173                                             fragment_base_url
 174                                  * "duration" (optional, int or float)
 175                                  * "filesize" (optional, int)
 176                     * preference Order number of this format. If this field is
 177                                  present and not None, the formats get sorted
 178                                  by this field, regardless of all other values.
 179                                  -1 for default (order by other properties),
 180                                  -2 or smaller for less than default.
 181                                  < -1000 to hide the format (if there is
 182                                     another one which is strictly better)
 183                     * language   Language code, e.g. "de" or "en-US".
 184                     * language_preference  Is this in the language mentioned in
 185                                  the URL?
 186                                  10 if it's what the URL is about,
 187                                  -1 for default (don't know),
 188                                  -10 otherwise, other values reserved for now.
 189                     * quality    Order number of the video quality of this
 190                                  format, irrespective of the file format.
 191                                  -1 for default (order by other properties),
 192                                  -2 or smaller for less than default.
 193                     * source_preference  Order number for this video source
 194                                   (quality takes higher priority)
 195                                  -1 for default (order by other properties),
 196                                  -2 or smaller for less than default.
 197                     * http_headers  A dictionary of additional HTTP headers
 198                                  to add to the request.
 199                     * stretched_ratio  If given and not 1, indicates that the
 200                                  video's pixels are not square.
 201                                  width : height ratio as float.
 202                     * no_resume  The server does not support resuming the
 203                                  (HTTP or RTMP) download. Boolean.
 204                     * downloader_options  A dictionary of downloader options as
 205                                  described in FileDownloader
 206
 207     url:            Final video URL.
 208     ext:            Video filename extension.
 209     format:         The video format, defaults to ext (used for --get-format)
 210     player_url:     SWF Player URL (used for rtmpdump).
 211
 212     The following fields are optional:
 213
 214     alt_title:      A secondary title of the video.
 215     display_id      An alternative identifier for the video, not necessarily
 216                     unique, but available before title. Typically, id is
 217                     something like "4234987", title "Dancing naked mole rats",
 218                     and display_id "dancing-naked-mole-rats"
 219     thumbnails:     A list of dictionaries, with the following entries:
 220                         * "id" (optional, string) - Thumbnail format ID
 221                         * "url"
 222                         * "preference" (optional, int) - quality of the image
 223                         * "width" (optional, int)
 224                         * "height" (optional, int)
 225                         * "resolution" (optional, string "{width}x{height}",
 226                                         deprecated)
 227                         * "filesize" (optional, int)
 228     thumbnail:      Full URL to a video thumbnail image.
 229     description:    Full video description.
 230     uploader:       Full name of the video uploader.
 231     license:        License name the video is licensed under.
 232     creator:        The creator of the video.
 233     release_timestamp: UNIX timestamp of the moment the video was released.
 234     release_date:   The date (YYYYMMDD) when the video was released.
 235     timestamp:      UNIX timestamp of the moment the video was uploaded
 236     upload_date:    Video upload date (YYYYMMDD).
 237                     If not explicitly set, calculated from timestamp.
 238     uploader_id:    Nickname or id of the video uploader.
 239     uploader_url:   Full URL to a personal webpage of the video uploader.
 240     channel:        Full name of the channel the video is uploaded on.
 241                     Note that channel fields may or may not repeat uploader
 242                     fields. This depends on a particular extractor.
 243     channel_id:     Id of the channel.
 244     channel_url:    Full URL to a channel webpage.
 245     location:       Physical location where the video was filmed.
 246     subtitles:      The available subtitles as a dictionary in the format
 247                     {tag: subformats}. "tag" is usually a language code, and
 248                     "subformats" is a list sorted from lower to higher
 249                     preference, each element is a dictionary with the "ext"
 250                     entry and one of:
 251                         * "data": The subtitles file contents
 252                         * "url": A URL pointing to the subtitles file
 253                     "ext" will be calculated from URL if missing
 254     automatic_captions: Like 'subtitles'; contains automatically generated
 255                     captions instead of normal subtitles
 256     duration:       Length of the video in seconds, as an integer or float.
 257     view_count:     How many users have watched the video on the platform.
 258     like_count:     Number of positive ratings of the video
 259     dislike_count:  Number of negative ratings of the video
 260     repost_count:   Number of reposts of the video
  261     average_rating: Average rating given by users, the scale used depends on the webpage
 262     comment_count:  Number of comments on the video
 263     comments:       A list of comments, each with one or more of the following
 264                     properties (all but one of text or html optional):
 265                         * "author" - human-readable name of the comment author
 266                         * "author_id" - user ID of the comment author
 267                         * "author_thumbnail" - The thumbnail of the comment author
 268                         * "id" - Comment ID
 269                         * "html" - Comment as HTML
 270                         * "text" - Plain text of the comment
 271                         * "timestamp" - UNIX timestamp of comment
 272                         * "parent" - ID of the comment this one is replying to.
 273                                      Set to "root" to indicate that this is a
 274                                      comment to the original video.
 275                         * "like_count" - Number of positive ratings of the comment
 276                         * "dislike_count" - Number of negative ratings of the comment
 277                         * "is_favorited" - Whether the comment is marked as
 278                                            favorite by the video uploader
 279                         * "author_is_uploader" - Whether the comment is made by
 280                                                  the video uploader
 281     age_limit:      Age restriction for the video, as an integer (years)
 282     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 283                     should allow to get the same result again. (It will be set
 284                     by YoutubeDL if it's missing)
 285     categories:     A list of categories that the video falls in, for example
 286                     ["Sports", "Berlin"]
 287     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 288     is_live:        True, False, or None (=unknown). Whether this video is a
 289                     live stream that goes on instead of a fixed-length video.
 290     was_live:       True, False, or None (=unknown). Whether this video was
 291                     originally a live stream.
 292     start_time:     Time in seconds where the reproduction should start, as
 293                     specified in the URL.
 294     end_time:       Time in seconds where the reproduction should end, as
 295                     specified in the URL.
 296     chapters:       A list of dictionaries, with the following entries:
 297                         * "start_time" - The start time of the chapter in seconds
 298                         * "end_time" - The end time of the chapter in seconds
 299                         * "title" (optional, string)
 300     playable_in_embed: Whether this video is allowed to play in embedded
 301                     players on other sites. Can be True (=always allowed),
 302                     False (=never allowed), None (=unknown), or a string
 303                     specifying the criteria for embedability (Eg: 'whitelist')
 304     availability:   Under what condition the video is available. One of
 305                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 306                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 307                     to set it
 308     __post_extractor: A function to be called just before the metadata is
 309                     written to either disk, logger or console. The function
 310                     must return a dict which will be added to the info_dict.
  311                     This is useful for additional information that is
 312                     time-consuming to extract. Note that the fields thus
 313                     extracted will not be available to output template and
 314                     match_filter. So, only "comments" and "comment_count" are
 315                     currently allowed to be extracted via this method.
 316
 317     The following fields should only be used when the video belongs to some logical
 318     chapter or section:
 319
 320     chapter:        Name or title of the chapter the video belongs to.
 321     chapter_number: Number of the chapter the video belongs to, as an integer.
 322     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 323
 324     The following fields should only be used when the video is an episode of some
 325     series, programme or podcast:
 326
 327     series:         Title of the series or programme the video episode belongs to.
 328     season:         Title of the season the video episode belongs to.
 329     season_number:  Number of the season the video episode belongs to, as an integer.
 330     season_id:      Id of the season the video episode belongs to, as a unicode string.
 331     episode:        Title of the video episode. Unlike mandatory video title field,
 332                     this field should denote the exact title of the video episode
 333                     without any kind of decoration.
 334     episode_number: Number of the video episode within a season, as an integer.
 335     episode_id:     Id of the video episode, as a unicode string.
 336
 337     The following fields should only be used when the media is a track or a part of
 338     a music album:
 339
 340     track:          Title of the track.
 341     track_number:   Number of the track within an album or a disc, as an integer.
 342     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 343                     as a unicode string.
 344     artist:         Artist(s) of the track.
 345     genre:          Genre(s) of the track.
 346     album:          Title of the album the track belongs to.
 347     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 348     album_artist:   List of all artists appeared on the album (e.g.
 349                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 350                     and compilations).
 351     disc_number:    Number of the disc or other physical medium the track belongs to,
 352                     as an integer.
 353     release_year:   Year (YYYY) when the album was released.
 354
 355     Unless mentioned otherwise, the fields should be Unicode strings.
 356
 357     Unless mentioned otherwise, None is equivalent to absence of information.
 358
 359
 360     _type "playlist" indicates multiple videos.
 361     There must be a key "entries", which is a list, an iterable, or a PagedList
 362     object, each element of which is a valid dictionary by this specification.
 363
  364     Additionally, playlists can have "id", "title", and any other relevant
 365     attributes with the same semantics as videos (see above).
 366
 367
 368     _type "multi_video" indicates that there are multiple videos that
  369     form a single show, for example multiple acts of an opera or TV episode.
 370     It must have an entries key like a playlist and contain all the keys
 371     required for a video at the same time.
 372
 373
 374     _type "url" indicates that the video must be extracted from another
 375     location, possibly by a different extractor. Its only required key is:
 376     "url" - the next URL to extract.
 377     The key "ie_key" can be set to the class name (minus the trailing "IE",
 378     e.g. "Youtube") if the extractor class is known in advance.
 379     Additionally, the dictionary may have any properties of the resolved entity
 380     known in advance, for example "title" if the title of the referred video is
 381     known ahead of time.
 382
 383
 384     _type "url_transparent" entities have the same specification as "url", but
 385     indicate that the given additional information is more precise than the one
 386     associated with the resolved URL.
 387     This is useful when a site employs a video service that hosts the video and
 388     its technical metadata, but that video service does not embed a useful
 389     title, description etc.
 390
 391
 392     Subclasses of this one should re-define the _real_initialize() and
 393     _real_extract() methods and define a _VALID_URL regexp.
 394     Probably, they should also be added to the list of extractors.
 395
 396     _GEO_BYPASS attribute may be set to False in order to disable
 397     geo restriction bypass mechanisms for a particular extractor.
 398     Though it won't disable explicit geo restriction bypass based on
 399     country code provided with geo_bypass_country.
 400
 401     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 402     countries for this extractor. One of these countries will be used by
 403     geo restriction bypass mechanism right away in order to bypass
 404     geo restriction, of course, if the mechanism is not disabled.
 405
 406     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 407     IP blocks in CIDR notation for this extractor. One of these IP blocks
 408     will be used by geo restriction bypass mechanism similarly
 409     to _GEO_COUNTRIES.
 410
 411     Finally, the _WORKING attribute should be set to False for broken IEs
 412     in order to warn the users and skip the tests.
 413     """
 414
 415     _ready = False
 416     _downloader = None
 417     _x_forwarded_for_ip = None
 418     _GEO_BYPASS = True
 419     _GEO_COUNTRIES = None
 420     _GEO_IP_BLOCKS = None
 421     _WORKING = True
 422
 423     def __init__(self, downloader=None):
 424         """Constructor. Receives an optional downloader."""
 425         self._ready = False
 426         self._x_forwarded_for_ip = None
 427         self.set_downloader(downloader)
 428
 429     @classmethod
 430     def suitable(cls, url):
 431         """Receives a URL and returns True if suitable for this IE."""
 432
 433         # This does not use has/getattr intentionally - we want to know whether
 434         # we have cached the regexp for *this* class, whereas getattr would also
 435         # match the superclass
 436         if '_VALID_URL_RE' not in cls.__dict__:
 437             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 438         return cls._VALID_URL_RE.match(url) is not None
 439
 440     @classmethod
 441     def _match_id(cls, url):
 442         if '_VALID_URL_RE' not in cls.__dict__:
 443             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 444         m = cls._VALID_URL_RE.match(url)
 445         assert m
 446         return compat_str(m.group('id'))
 447
 448     @classmethod
 449     def working(cls):
 450         """Getter method for _WORKING."""
 451         return cls._WORKING
 452
 453     def initialize(self):
 454         """Initializes an instance (authentication, etc)."""
 455         self._initialize_geo_bypass({
 456             'countries': self._GEO_COUNTRIES,
 457             'ip_blocks': self._GEO_IP_BLOCKS,
 458         })
 459         if not self._ready:
 460             self._real_initialize()
 461             self._ready = True
 462
 463     def _initialize_geo_bypass(self, geo_bypass_context):
 464         """
 465         Initialize geo restriction bypass mechanism.
 466
 467         This method is used to initialize geo bypass mechanism based on faking
 468         X-Forwarded-For HTTP header. A random country from provided country list
 469         is selected and a random IP belonging to this country is generated. This
 470         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 471         HTTP requests.
 472
 473         This method will be used for initial geo bypass mechanism initialization
 474         during the instance initialization with _GEO_COUNTRIES and
 475         _GEO_IP_BLOCKS.
 476
 477         You may also manually call it from extractor's code if geo bypass
 478         information is not available beforehand (e.g. obtained during
 479         extraction) or due to some other reason. In this case you should pass
 480         this information in geo bypass context passed as first argument. It may
 481         contain following fields:
 482
 483         countries:  List of geo unrestricted countries (similar
 484                     to _GEO_COUNTRIES)
 485         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 486                     (similar to _GEO_IP_BLOCKS)
 487
 488         """
 489         if not self._x_forwarded_for_ip:
 490
 491             # Geo bypass mechanism is explicitly disabled by user
 492             if not self._downloader.params.get('geo_bypass', True):
 493                 return
 494
 495             if not geo_bypass_context:
 496                 geo_bypass_context = {}
 497
 498             # Backward compatibility: previously _initialize_geo_bypass
 499             # expected a list of countries, some 3rd party code may still use
 500             # it this way
 501             if isinstance(geo_bypass_context, (list, tuple)):
 502                 geo_bypass_context = {
 503                     'countries': geo_bypass_context,
 504                 }
 505
 506             # The whole point of geo bypass mechanism is to fake IP
 507             # as X-Forwarded-For HTTP header based on some IP block or
 508             # country code.
 509
 510             # Path 1: bypassing based on IP block in CIDR notation
 511
 512             # Explicit IP block specified by user, use it right away
 513             # regardless of whether extractor is geo bypassable or not
 514             ip_block = self._downloader.params.get('geo_bypass_ip_block', None)
 515
 516             # Otherwise use random IP block from geo bypass context but only
 517             # if extractor is known as geo bypassable
 518             if not ip_block:
 519                 ip_blocks = geo_bypass_context.get('ip_blocks')
 520                 if self._GEO_BYPASS and ip_blocks:
 521                     ip_block = random.choice(ip_blocks)
 522
 523             if ip_block:
 524                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 525                 self._downloader.write_debug(
 526                     '[debug] Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
 527                 return
 528
 529             # Path 2: bypassing based on country code
 530
 531             # Explicit country code specified by user, use it right away
 532             # regardless of whether extractor is geo bypassable or not
 533             country = self._downloader.params.get('geo_bypass_country', None)
 534
 535             # Otherwise use random country code from geo bypass context but
 536             # only if extractor is known as geo bypassable
 537             if not country:
 538                 countries = geo_bypass_context.get('countries')
 539                 if self._GEO_BYPASS and countries:
 540                     country = random.choice(countries)
 541
 542             if country:
 543                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 544                 self._downloader.write_debug(
 545                     'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))
 546
 547     def extract(self, url):
 548         """Extracts URL information and returns it in list of dicts."""
 549         try:
 550             for _ in range(2):
 551                 try:
 552                     self.initialize()
 553                     ie_result = self._real_extract(url)
 554                     if self._x_forwarded_for_ip:
 555                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 556                     subtitles = ie_result.get('subtitles')
 557                     if (subtitles and 'live_chat' in subtitles
 558                             and 'no-live-chat' in self._downloader.params.get('compat_opts', [])):
 559                         del subtitles['live_chat']
 560                     return ie_result
 561                 except GeoRestrictedError as e:
 562                     if self.__maybe_fake_ip_and_retry(e.countries):
 563                         continue
 564                     raise
 565         except ExtractorError:
 566             raise
 567         except compat_http_client.IncompleteRead as e:
 568             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 569         except (KeyError, StopIteration) as e:
 570             raise ExtractorError('An extractor error has occurred.', cause=e)
 571
 572     def __maybe_fake_ip_and_retry(self, countries):
 573         if (not self._downloader.params.get('geo_bypass_country', None)
 574                 and self._GEO_BYPASS
 575                 and self._downloader.params.get('geo_bypass', True)
 576                 and not self._x_forwarded_for_ip
 577                 and countries):
 578             country_code = random.choice(countries)
 579             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 580             if self._x_forwarded_for_ip:
 581                 self.report_warning(
 582                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 583                     % (self._x_forwarded_for_ip, country_code.upper()))
 584                 return True
 585         return False
 586
 587     def set_downloader(self, downloader):
 588         """Sets the downloader for this IE."""
 589         self._downloader = downloader
 590
 591     def _real_initialize(self):
 592         """Real initialization process. Redefine in subclasses."""
 593         pass
 594
 595     def _real_extract(self, url):
 596         """Real extraction process. Redefine in subclasses."""
 597         pass
 598
 599     @classmethod
 600     def ie_key(cls):
 601         """A string for getting the InfoExtractor with get_info_extractor"""
 602         return compat_str(cls.__name__[:-2])
 603
 604     @property
 605     def IE_NAME(self):
 606         return compat_str(type(self).__name__[:-2])
 607
 608     @staticmethod
 609     def __can_accept_status_code(err, expected_status):
 610         assert isinstance(err, compat_urllib_error.HTTPError)
 611         if expected_status is None:
 612             return False
 613         if isinstance(expected_status, compat_integer_types):
 614             return err.code == expected_status
 615         elif isinstance(expected_status, (list, tuple)):
 616             return err.code in expected_status
 617         elif callable(expected_status):
 618             return expected_status(err.code) is True
 619         else:
 620             assert False
 621
 622     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
 623         """
 624         Return the response handle.
 625
 626         See _download_webpage docstring for arguments specification.
 627         """
 628         if not self._downloader._first_webpage_request:
 629             sleep_interval = float_or_none(self._downloader.params.get('sleep_interval_requests')) or 0
 630             if sleep_interval > 0:
 631                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 632                 time.sleep(sleep_interval)
 633         else:
 634             self._downloader._first_webpage_request = False
 635
 636         if note is None:
 637             self.report_download_webpage(video_id)
 638         elif note is not False:
 639             if video_id is None:
 640                 self.to_screen('%s' % (note,))
 641             else:
 642                 self.to_screen('%s: %s' % (video_id, note))
 643
 644         # Some sites check X-Forwarded-For HTTP header in order to figure out
 645         # the origin of the client behind proxy. This allows bypassing geo
 646         # restriction by faking this header's value to IP that belongs to some
 647         # geo unrestricted country. We will do so once we encounter any
 648         # geo restriction error.
 649         if self._x_forwarded_for_ip:
 650             if 'X-Forwarded-For' not in headers:
 651                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 652
 653         if isinstance(url_or_request, compat_urllib_request.Request):
 654             url_or_request = update_Request(
 655                 url_or_request, data=data, headers=headers, query=query)
 656         else:
 657             if query:
 658                 url_or_request = update_url_query(url_or_request, query)
 659             if data is not None or headers:
 660                 url_or_request = sanitized_Request(url_or_request, data, headers)
 661         try:
 662             return self._downloader.urlopen(url_or_request)
 663         except network_exceptions as err:
 664             if isinstance(err, compat_urllib_error.HTTPError):
 665                 if self.__can_accept_status_code(err, expected_status):
 666                     # Retain reference to error to prevent file object from
 667                     # being closed before it can be read. Works around the
 668                     # effects of <https://bugs.python.org/issue15002>
 669                     # introduced in Python 3.4.1.
 670                     err.fp._error = err
 671                     return err.fp
 672
 673             if errnote is False:
 674                 return False
 675             if errnote is None:
 676                 errnote = 'Unable to download webpage'
 677
 678             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 679             if fatal:
 680                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 681             else:
 682                 self.report_warning(errmsg)
 683                 return False
 684
 685     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
 686         """
 687         Return a tuple (page content as string, URL handle).
 688
 689         See _download_webpage docstring for arguments specification.
 690         """
 691         # Strip hashes from the URL (#1038)
 692         if isinstance(url_or_request, (compat_str, str)):
 693             url_or_request = url_or_request.partition('#')[0]
 694
 695         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 696         if urlh is False:
 697             assert not fatal
 698             return False
 699         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 700         return (content, urlh)
 701
 702     @staticmethod
 703     def _guess_encoding_from_content(content_type, webpage_bytes):
 704         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 705         if m:
 706             encoding = m.group(1)
 707         else:
 708             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 709                           webpage_bytes[:1024])
 710             if m:
 711                 encoding = m.group(1).decode('ascii')
 712             elif webpage_bytes.startswith(b'\xff\xfe'):
 713                 encoding = 'utf-16'
 714             else:
 715                 encoding = 'utf-8'
 716
 717         return encoding
 718
 719     def __check_blocked(self, content):
 720         first_block = content[:512]
 721         if ('<title>Access to this site is blocked</title>' in content
 722                 and 'Websense' in first_block):
 723             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 724             blocked_iframe = self._html_search_regex(
 725                 r'<iframe src="([^"]+)"', content,
 726                 'Websense information URL', default=None)
 727             if blocked_iframe:
 728                 msg += ' Visit %s for more details' % blocked_iframe
 729             raise ExtractorError(msg, expected=True)
 730         if '<title>The URL you requested has been blocked</title>' in first_block:
 731             msg = (
 732                 'Access to this webpage has been blocked by Indian censorship. '
 733                 'Use a VPN or proxy server (with --proxy) to route around it.')
 734             block_msg = self._html_search_regex(
 735                 r'</h1><p>(.*?)</p>',
 736                 content, 'block message', default=None)
 737             if block_msg:
 738                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 739             raise ExtractorError(msg, expected=True)
 740         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 741                 and 'blocklist.rkn.gov.ru' in content):
 742             raise ExtractorError(
 743                 'Access to this webpage has been blocked by decision of the Russian government. '
 744                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 745                 expected=True)
 746
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        """
        Read the body of the response handle urlh and return it decoded.

        prefix -- bytes prepended to the data read from urlh, if not None
        encoding -- encoding used for decoding; guessed from the Content-Type
            header and the body when not given

        Depending on the 'dump_intermediate_pages' and 'write_pages' params
        the raw bytes are additionally printed base64-encoded and/or saved to
        a '.dump' file. The decoded content is checked for known block pages
        before it is returned. The url_or_request, note, errnote and fatal
        arguments are accepted but not used by this method's body.
        """
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            # base64 keeps the dump printable whatever the raw bytes are
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            basen = '%s_%s' % (video_id, urlh.geturl())
            # Cap the base name at 240 characters; the cut-off tail is
            # replaced by '___' plus the MD5 hex digest of the full name
            if len(basen) > 240:
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        # An encoding name unknown to Python raises LookupError; fall back to
        # UTF-8 in that case. Undecodable bytes are replaced, not fatal.
        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            content = webpage_bytes.decode('utf-8', 'replace')

        # Raises ExtractorError if the page is a known block page
        self.__check_blocked(content)

        return content
 783
 784     def _download_webpage(
 785             self, url_or_request, video_id, note=None, errnote=None,
 786             fatal=True, tries=1, timeout=5, encoding=None, data=None,
 787             headers={}, query={}, expected_status=None):
 788         """
 789         Return the data of the page as a string.
 790
 791         Arguments:
 792         url_or_request -- plain text URL as a string or
 793             a compat_urllib_request.Requestobject
 794         video_id -- Video/playlist/item identifier (string)
 795
 796         Keyword arguments:
 797         note -- note printed before downloading (string)
 798         errnote -- note printed in case of an error (string)
 799         fatal -- flag denoting whether error should be considered fatal,
 800             i.e. whether it should cause ExtractionError to be raised,
 801             otherwise a warning will be reported and extraction continued
 802         tries -- number of tries
 803         timeout -- sleep interval between tries
 804         encoding -- encoding for a page content decoding, guessed automatically
 805             when not explicitly specified
 806         data -- POST data (bytes)
 807         headers -- HTTP headers (dict)
 808         query -- URL query (dict)
 809         expected_status -- allows to accept failed HTTP requests (non 2xx
 810             status code) by explicitly specifying a set of accepted status
 811             codes. Can be any of the following entities:
 812                 - an integer type specifying an exact failed status code to
 813                   accept
 814                 - a list or a tuple of integer types specifying a list of
 815                   failed status codes to accept
 816                 - a callable accepting an actual failed status code and
 817                   returning True if it should be accepted
 818             Note that this argument does not affect success status codes (2xx)
 819             which are always accepted.
 820         """
 821
 822         success = False
 823         try_count = 0
 824         while success is False:
 825             try:
 826                 res = self._download_webpage_handle(
 827                     url_or_request, video_id, note, errnote, fatal,
 828                     encoding=encoding, data=data, headers=headers, query=query,
 829                     expected_status=expected_status)
 830                 success = True
 831             except compat_http_client.IncompleteRead as e:
 832                 try_count += 1
 833                 if try_count >= tries:
 834                     raise e
 835                 self._sleep(timeout, video_id)
 836         if res is False:
 837             return res
 838         else:
 839             content, _ = res
 840             return content
 841
 842     def _download_xml_handle(
 843             self, url_or_request, video_id, note='Downloading XML',
 844             errnote='Unable to download XML', transform_source=None,
 845             fatal=True, encoding=None, data=None, headers={}, query={},
 846             expected_status=None):
 847         """
 848         Return a tuple (xml as an compat_etree_Element, URL handle).
 849
 850         See _download_webpage docstring for arguments specification.
 851         """
 852         res = self._download_webpage_handle(
 853             url_or_request, video_id, note, errnote, fatal=fatal,
 854             encoding=encoding, data=data, headers=headers, query=query,
 855             expected_status=expected_status)
 856         if res is False:
 857             return res
 858         xml_string, urlh = res
 859         return self._parse_xml(
 860             xml_string, video_id, transform_source=transform_source,
 861             fatal=fatal), urlh
 862
 863     def _download_xml(
 864             self, url_or_request, video_id,
 865             note='Downloading XML', errnote='Unable to download XML',
 866             transform_source=None, fatal=True, encoding=None,
 867             data=None, headers={}, query={}, expected_status=None):
 868         """
 869         Return the xml as an compat_etree_Element.
 870
 871         See _download_webpage docstring for arguments specification.
 872         """
 873         res = self._download_xml_handle(
 874             url_or_request, video_id, note=note, errnote=errnote,
 875             transform_source=transform_source, fatal=fatal, encoding=encoding,
 876             data=data, headers=headers, query=query,
 877             expected_status=expected_status)
 878         return res if res is False else res[0]
 879
 880     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 881         if transform_source:
 882             xml_string = transform_source(xml_string)
 883         try:
 884             return compat_etree_fromstring(xml_string.encode('utf-8'))
 885         except compat_xml_parse_error as ve:
 886             errmsg = '%s: Failed to parse XML ' % video_id
 887             if fatal:
 888                 raise ExtractorError(errmsg, cause=ve)
 889             else:
 890                 self.report_warning(errmsg + str(ve))
 891
 892     def _download_json_handle(
 893             self, url_or_request, video_id, note='Downloading JSON metadata',
 894             errnote='Unable to download JSON metadata', transform_source=None,
 895             fatal=True, encoding=None, data=None, headers={}, query={},
 896             expected_status=None):
 897         """
 898         Return a tuple (JSON object, URL handle).
 899
 900         See _download_webpage docstring for arguments specification.
 901         """
 902         res = self._download_webpage_handle(
 903             url_or_request, video_id, note, errnote, fatal=fatal,
 904             encoding=encoding, data=data, headers=headers, query=query,
 905             expected_status=expected_status)
 906         if res is False:
 907             return res
 908         json_string, urlh = res
 909         return self._parse_json(
 910             json_string, video_id, transform_source=transform_source,
 911             fatal=fatal), urlh
 912
 913     def _download_json(
 914             self, url_or_request, video_id, note='Downloading JSON metadata',
 915             errnote='Unable to download JSON metadata', transform_source=None,
 916             fatal=True, encoding=None, data=None, headers={}, query={},
 917             expected_status=None):
 918         """
 919         Return the JSON object as a dict.
 920
 921         See _download_webpage docstring for arguments specification.
 922         """
 923         res = self._download_json_handle(
 924             url_or_request, video_id, note=note, errnote=errnote,
 925             transform_source=transform_source, fatal=fatal, encoding=encoding,
 926             data=data, headers=headers, query=query,
 927             expected_status=expected_status)
 928         return res if res is False else res[0]
 929
 930     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 931         if transform_source:
 932             json_string = transform_source(json_string)
 933         try:
 934             return json.loads(json_string)
 935         except ValueError as ve:
 936             errmsg = '%s: Failed to parse JSON ' % video_id
 937             if fatal:
 938                 raise ExtractorError(errmsg, cause=ve)
 939             else:
 940                 self.report_warning(errmsg + str(ve))
 941
 942     def report_warning(self, msg, video_id=None):
 943         idstr = '' if video_id is None else '%s: ' % video_id
 944         self._downloader.report_warning(
 945             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 946
 947     def to_screen(self, msg):
 948         """Print msg to screen, prefixing it with '[ie_name]'"""
 949         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 950
 951     def report_extraction(self, id_or_name):
 952         """Report information extraction."""
 953         self.to_screen('%s: Extracting information' % id_or_name)
 954
 955     def report_download_webpage(self, video_id):
 956         """Report webpage download."""
 957         self.to_screen('%s: Downloading webpage' % video_id)
 958
 959     def report_age_confirmation(self):
 960         """Report attempt to confirm age."""
 961         self.to_screen('Confirming age')
 962
 963     def report_login(self):
 964         """Report attempt to log in."""
 965         self.to_screen('Logging in')
 966
 967     def raise_login_required(
 968             self, msg='This video is only available for registered users', metadata_available=False):
 969         if metadata_available and self._downloader.params.get('ignore_no_formats_error'):
 970             self.report_warning(msg)
 971         raise ExtractorError(
 972             '%s. Use --cookies, --username and --password or --netrc to provide account credentials' % msg,
 973             expected=True)
 974
 975     def raise_geo_restricted(
 976             self, msg='This video is not available from your location due to geo restriction',
 977             countries=None, metadata_available=False):
 978         if metadata_available and self._downloader.params.get('ignore_no_formats_error'):
 979             self.report_warning(msg)
 980         else:
 981             raise GeoRestrictedError(msg, countries=countries)
 982
 983     def raise_no_formats(self, msg, expected=False, video_id=None):
 984         if expected and self._downloader.params.get('ignore_no_formats_error'):
 985             self.report_warning(msg, video_id)
 986         else:
 987             raise ExtractorError(msg, expected=expected, video_id=video_id)
 988
 989     # Methods for following #608
 990     @staticmethod
 991     def url_result(url, ie=None, video_id=None, video_title=None):
 992         """Returns a URL that points to a page that should be processed"""
 993         # TODO: ie should be the class used for getting the info
 994         video_info = {'_type': 'url',
 995                       'url': url,
 996                       'ie_key': ie}
 997         if video_id is not None:
 998             video_info['id'] = video_id
 999         if video_title is not None:
1000             video_info['title'] = video_title
1001         return video_info
1002
1003     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
1004         urls = orderedSet(
1005             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
1006             for m in matches)
1007         return self.playlist_result(
1008             urls, playlist_id=playlist_id, playlist_title=playlist_title)
1009
1010     @staticmethod
1011     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
1012         """Returns a playlist"""
1013         video_info = {'_type': 'playlist',
1014                       'entries': entries}
1015         video_info.update(kwargs)
1016         if playlist_id:
1017             video_info['id'] = playlist_id
1018         if playlist_title:
1019             video_info['title'] = playlist_title
1020         if playlist_description is not None:
1021             video_info['description'] = playlist_description
1022         return video_info
1023
1024     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1025         """
1026         Perform a regex search on the given string, using a single or a list of
1027         patterns returning the first matching group.
1028         In case of failure return a default value or raise a WARNING or a
1029         RegexNotFoundError, depending on fatal, specifying the field name.
1030         """
1031         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
1032             mobj = re.search(pattern, string, flags)
1033         else:
1034             for p in pattern:
1035                 mobj = re.search(p, string, flags)
1036                 if mobj:
1037                     break
1038
1039         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
1040             _name = '\033[0;34m%s\033[0m' % name
1041         else:
1042             _name = name
1043
1044         if mobj:
1045             if group is None:
1046                 # return the first matching group
1047                 return next(g for g in mobj.groups() if g is not None)
1048             else:
1049                 return mobj.group(group)
1050         elif default is not NO_DEFAULT:
1051             return default
1052         elif fatal:
1053             raise RegexNotFoundError('Unable to extract %s' % _name)
1054         else:
1055             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1056             return None
1057
1058     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1059         """
1060         Like _search_regex, but strips HTML tags and unescapes entities.
1061         """
1062         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1063         if res:
1064             return clean_html(res).strip()
1065         else:
1066             return res
1067
1068     def _get_netrc_login_info(self, netrc_machine=None):
1069         username = None
1070         password = None
1071         netrc_machine = netrc_machine or self._NETRC_MACHINE
1072
1073         if self._downloader.params.get('usenetrc', False):
1074             try:
1075                 info = netrc.netrc().authenticators(netrc_machine)
1076                 if info is not None:
1077                     username = info[0]
1078                     password = info[2]
1079                 else:
1080                     raise netrc.NetrcParseError(
1081                         'No authenticators for %s' % netrc_machine)
1082             except (IOError, netrc.NetrcParseError) as err:
1083                 self.report_warning(
1084                     'parsing .netrc: %s' % error_to_compat_str(err))
1085
1086         return username, password
1087
1088     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1089         """
1090         Get the login info as (username, password)
1091         First look for the manually specified credentials using username_option
1092         and password_option as keys in params dictionary. If no such credentials
1093         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1094         value.
1095         If there's no info available, return (None, None)
1096         """
1097         if self._downloader is None:
1098             return (None, None)
1099
1100         downloader_params = self._downloader.params
1101
1102         # Attempt to use provided username and password or .netrc data
1103         if downloader_params.get(username_option) is not None:
1104             username = downloader_params[username_option]
1105             password = downloader_params[password_option]
1106         else:
1107             username, password = self._get_netrc_login_info(netrc_machine)
1108
1109         return username, password
1110
1111     def _get_tfa_info(self, note='two-factor verification code'):
1112         """
1113         Get the two-factor authentication info
1114         TODO - asking the user will be required for sms/phone verify
1115         currently just uses the command line option
1116         If there's no info available, return None
1117         """
1118         if self._downloader is None:
1119             return None
1120         downloader_params = self._downloader.params
1121
1122         if downloader_params.get('twofactor') is not None:
1123             return downloader_params['twofactor']
1124
1125         return compat_getpass('Type %s and press [Return]: ' % note)
1126
1127     # Helper functions for extracting OpenGraph info
1128     @staticmethod
1129     def _og_regexes(prop):
1130         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1131         property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
1132                        % {'prop': re.escape(prop)})
1133         template = r'<meta[^>]+?%s[^>]+?%s'
1134         return [
1135             template % (property_re, content_re),
1136             template % (content_re, property_re),
1137         ]
1138
1139     @staticmethod
1140     def _meta_regex(prop):
1141         return r'''(?isx)<meta
1142                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1143                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1144
1145     def _og_search_property(self, prop, html, name=None, **kargs):
1146         if not isinstance(prop, (list, tuple)):
1147             prop = [prop]
1148         if name is None:
1149             name = 'OpenGraph %s' % prop[0]
1150         og_regexes = []
1151         for p in prop:
1152             og_regexes.extend(self._og_regexes(p))
1153         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1154         if escaped is None:
1155             return None
1156         return unescapeHTML(escaped)
1157
1158     def _og_search_thumbnail(self, html, **kargs):
1159         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1160
1161     def _og_search_description(self, html, **kargs):
1162         return self._og_search_property('description', html, fatal=False, **kargs)
1163
1164     def _og_search_title(self, html, **kargs):
1165         return self._og_search_property('title', html, **kargs)
1166
1167     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1168         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1169         if secure:
1170             regexes = self._og_regexes('video:secure_url') + regexes
1171         return self._html_search_regex(regexes, html, name, **kargs)
1172
1173     def _og_search_url(self, html, **kargs):
1174         return self._og_search_property('url', html, **kargs)
1175
1176     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1177         if not isinstance(name, (list, tuple)):
1178             name = [name]
1179         if display_name is None:
1180             display_name = name[0]
1181         return self._html_search_regex(
1182             [self._meta_regex(n) for n in name],
1183             html, display_name, fatal=fatal, group='content', **kwargs)
1184
1185     def _dc_search_uploader(self, html):
1186         return self._html_search_meta('dc.creator', html, 'uploader')
1187
1188     def _rta_search(self, html):
1189         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1190         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1191                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1192                      html):
1193             return 18
1194         return 0
1195
1196     def _media_rating_search(self, html):
1197         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1198         rating = self._html_search_meta('rating', html)
1199
1200         if not rating:
1201             return None
1202
1203         RATING_TABLE = {
1204             'safe for kids': 0,
1205             'general': 8,
1206             '14 years': 14,
1207             'mature': 17,
1208             'restricted': 19,
1209         }
1210         return RATING_TABLE.get(rating.lower())
1211
1212     def _family_friendly_search(self, html):
1213         # See http://schema.org/VideoObject
1214         family_friendly = self._html_search_meta(
1215             'isFamilyFriendly', html, default=None)
1216
1217         if not family_friendly:
1218             return None
1219
1220         RATING_TABLE = {
1221             '1': 0,
1222             'true': 0,
1223             '0': 18,
1224             'false': 18,
1225         }
1226         return RATING_TABLE.get(family_friendly.lower())
1227
1228     def _twitter_search_player(self, html):
1229         return self._html_search_meta('twitter:player', html,
1230                                       'twitter card player')
1231
1232     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1233         json_ld_list = list(re.finditer(JSON_LD_RE, html))
1234         default = kwargs.get('default', NO_DEFAULT)
1235         # JSON-LD may be malformed and thus `fatal` should be respected.
1236         # At the same time `default` may be passed that assumes `fatal=False`
1237         # for _search_regex. Let's simulate the same behavior here as well.
1238         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
1239         json_ld = []
1240         for mobj in json_ld_list:
1241             json_ld_item = self._parse_json(
1242                 mobj.group('json_ld'), video_id, fatal=fatal)
1243             if not json_ld_item:
1244                 continue
1245             if isinstance(json_ld_item, dict):
1246                 json_ld.append(json_ld_item)
1247             elif isinstance(json_ld_item, (list, tuple)):
1248                 json_ld.extend(json_ld_item)
1249         if json_ld:
1250             json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1251         if json_ld:
1252             return json_ld
1253         if default is not NO_DEFAULT:
1254             return default
1255         elif fatal:
1256             raise RegexNotFoundError('Unable to extract JSON-LD')
1257         else:
1258             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1259             return {}
1260
1261     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1262         if isinstance(json_ld, compat_str):
1263             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1264         if not json_ld:
1265             return {}
1266         info = {}
1267         if not isinstance(json_ld, (list, tuple, dict)):
1268             return info
1269         if isinstance(json_ld, dict):
1270             json_ld = [json_ld]
1271
1272         INTERACTION_TYPE_MAP = {
1273             'CommentAction': 'comment',
1274             'AgreeAction': 'like',
1275             'DisagreeAction': 'dislike',
1276             'LikeAction': 'like',
1277             'DislikeAction': 'dislike',
1278             'ListenAction': 'view',
1279             'WatchAction': 'view',
1280             'ViewAction': 'view',
1281         }
1282
1283         def extract_interaction_type(e):
1284             interaction_type = e.get('interactionType')
1285             if isinstance(interaction_type, dict):
1286                 interaction_type = interaction_type.get('@type')
1287             return str_or_none(interaction_type)
1288
1289         def extract_interaction_statistic(e):
1290             interaction_statistic = e.get('interactionStatistic')
1291             if isinstance(interaction_statistic, dict):
1292                 interaction_statistic = [interaction_statistic]
1293             if not isinstance(interaction_statistic, list):
1294                 return
1295             for is_e in interaction_statistic:
1296                 if not isinstance(is_e, dict):
1297                     continue
1298                 if is_e.get('@type') != 'InteractionCounter':
1299                     continue
1300                 interaction_type = extract_interaction_type(is_e)
1301                 if not interaction_type:
1302                     continue
1303                 # For interaction count some sites provide string instead of
1304                 # an integer (as per spec) with non digit characters (e.g. ",")
1305                 # so extracting count with more relaxed str_to_int
1306                 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1307                 if interaction_count is None:
1308                     continue
1309                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1310                 if not count_kind:
1311                     continue
1312                 count_key = '%s_count' % count_kind
1313                 if info.get(count_key) is not None:
1314                     continue
1315                 info[count_key] = interaction_count
1316
1317         def extract_video_object(e):
1318             assert e['@type'] == 'VideoObject'
1319             author = e.get('author')
1320             info.update({
1321                 'url': url_or_none(e.get('contentUrl')),
1322                 'title': unescapeHTML(e.get('name')),
1323                 'description': unescapeHTML(e.get('description')),
1324                 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
1325                 'duration': parse_duration(e.get('duration')),
1326                 'timestamp': unified_timestamp(e.get('uploadDate')),
1327                 # author can be an instance of 'Organization' or 'Person' types.
1328                 # both types can have 'name' property(inherited from 'Thing' type). [1]
1329                 # however some websites are using 'Text' type instead.
1330                 # 1. https://schema.org/VideoObject
1331                 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
1332                 'filesize': float_or_none(e.get('contentSize')),
1333                 'tbr': int_or_none(e.get('bitrate')),
1334                 'width': int_or_none(e.get('width')),
1335                 'height': int_or_none(e.get('height')),
1336                 'view_count': int_or_none(e.get('interactionCount')),
1337             })
1338             extract_interaction_statistic(e)
1339
1340         for e in json_ld:
1341             if '@context' in e:
1342                 item_type = e.get('@type')
1343                 if expected_type is not None and expected_type != item_type:
1344                     continue
1345                 if item_type in ('TVEpisode', 'Episode'):
1346                     episode_name = unescapeHTML(e.get('name'))
1347                     info.update({
1348                         'episode': episode_name,
1349                         'episode_number': int_or_none(e.get('episodeNumber')),
1350                         'description': unescapeHTML(e.get('description')),
1351                     })
1352                     if not info.get('title') and episode_name:
1353                         info['title'] = episode_name
1354                     part_of_season = e.get('partOfSeason')
1355                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1356                         info.update({
1357                             'season': unescapeHTML(part_of_season.get('name')),
1358                             'season_number': int_or_none(part_of_season.get('seasonNumber')),
1359                         })
1360                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1361                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1362                         info['series'] = unescapeHTML(part_of_series.get('name'))
1363                 elif item_type == 'Movie':
1364                     info.update({
1365                         'title': unescapeHTML(e.get('name')),
1366                         'description': unescapeHTML(e.get('description')),
1367                         'duration': parse_duration(e.get('duration')),
1368                         'timestamp': unified_timestamp(e.get('dateCreated')),
1369                     })
1370                 elif item_type in ('Article', 'NewsArticle'):
1371                     info.update({
1372                         'timestamp': parse_iso8601(e.get('datePublished')),
1373                         'title': unescapeHTML(e.get('headline')),
1374                         'description': unescapeHTML(e.get('articleBody')),
1375                     })
1376                 elif item_type == 'VideoObject':
1377                     extract_video_object(e)
1378                     if expected_type is None:
1379                         continue
1380                     else:
1381                         break
1382                 video = e.get('video')
1383                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1384                     extract_video_object(video)
1385                 if expected_type is None:
1386                     continue
1387                 else:
1388                     break
1389         return dict((k, v) for k, v in info.items() if v is not None)
1390
1391     @staticmethod
1392     def _hidden_inputs(html):
1393         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1394         hidden_inputs = {}
1395         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1396             attrs = extract_attributes(input)
1397             if not input:
1398                 continue
1399             if attrs.get('type') not in ('hidden', 'submit'):
1400                 continue
1401             name = attrs.get('name') or attrs.get('id')
1402             value = attrs.get('value')
1403             if name and value is not None:
1404                 hidden_inputs[name] = value
1405         return hidden_inputs
1406
1407     def _form_hidden_inputs(self, form_id, html):
1408         form = self._search_regex(
1409             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1410             html, '%s form' % form_id, group='form')
1411         return self._hidden_inputs(form)
1412
1413     class FormatSort:
1414         regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
1415
1416         default = ('hidden', 'hasvid', 'ie_pref', 'lang', 'quality',
1417                    'res', 'fps', 'codec:vp9.2', 'size', 'br', 'asr',
1418                    'proto', 'ext', 'hasaud', 'source', 'format_id')  # These must not be aliases
1419         ytdl_default = ('hasaud', 'quality', 'tbr', 'filesize', 'vbr',
1420                         'height', 'width', 'proto', 'vext', 'abr', 'aext',
1421                         'fps', 'fs_approx', 'source', 'format_id')
1422
1423         settings = {
1424             'vcodec': {'type': 'ordered', 'regex': True,
1425                        'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1426             'acodec': {'type': 'ordered', 'regex': True,
1427                        'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']},
1428             'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1429                       'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', 'm3u8', '.*dash', '', 'mms|rtsp', 'none', 'f4']},
1430             'vext': {'type': 'ordered', 'field': 'video_ext',
1431                      'order': ('mp4', 'webm', 'flv', '', 'none'),
1432                      'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1433             'aext': {'type': 'ordered', 'field': 'audio_ext',
1434                      'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1435                      'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1436             'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1437             'ie_pref': {'priority': True, 'type': 'extractor'},
1438             'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1439             'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1440             'lang': {'priority': True, 'convert': 'ignore', 'field': 'language_preference'},
1441             'quality': {'convert': 'float_none', 'default': -1},
1442             'filesize': {'convert': 'bytes'},
1443             'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1444             'id': {'convert': 'string', 'field': 'format_id'},
1445             'height': {'convert': 'float_none'},
1446             'width': {'convert': 'float_none'},
1447             'fps': {'convert': 'float_none'},
1448             'tbr': {'convert': 'float_none'},
1449             'vbr': {'convert': 'float_none'},
1450             'abr': {'convert': 'float_none'},
1451             'asr': {'convert': 'float_none'},
1452             'source': {'convert': 'ignore', 'field': 'source_preference'},
1453
1454             'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1455             'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1456             'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1457             'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1458             'res': {'type': 'multiple', 'field': ('height', 'width'), 'function': min},
1459
1460             # Most of these exist only for compatibility reasons
1461             'dimension': {'type': 'alias', 'field': 'res'},
1462             'resolution': {'type': 'alias', 'field': 'res'},
1463             'extension': {'type': 'alias', 'field': 'ext'},
1464             'bitrate': {'type': 'alias', 'field': 'br'},
1465             'total_bitrate': {'type': 'alias', 'field': 'tbr'},
1466             'video_bitrate': {'type': 'alias', 'field': 'vbr'},
1467             'audio_bitrate': {'type': 'alias', 'field': 'abr'},
1468             'framerate': {'type': 'alias', 'field': 'fps'},
1469             'language_preference': {'type': 'alias', 'field': 'lang'},  # not named as 'language' because such a field exists
1470             'protocol': {'type': 'alias', 'field': 'proto'},
1471             'source_preference': {'type': 'alias', 'field': 'source'},
1472             'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1473             'filesize_estimate': {'type': 'alias', 'field': 'size'},
1474             'samplerate': {'type': 'alias', 'field': 'asr'},
1475             'video_ext': {'type': 'alias', 'field': 'vext'},
1476             'audio_ext': {'type': 'alias', 'field': 'aext'},
1477             'video_codec': {'type': 'alias', 'field': 'vcodec'},
1478             'audio_codec': {'type': 'alias', 'field': 'acodec'},
1479             'video': {'type': 'alias', 'field': 'hasvid'},
1480             'has_video': {'type': 'alias', 'field': 'hasvid'},
1481             'audio': {'type': 'alias', 'field': 'hasaud'},
1482             'has_audio': {'type': 'alias', 'field': 'hasaud'},
1483             'extractor': {'type': 'alias', 'field': 'ie_pref'},
1484             'preference': {'type': 'alias', 'field': 'ie_pref'},
1485             'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
1486             'format_id': {'type': 'alias', 'field': 'id'},
1487         }
1488
1489         _order = []
1490
1491         def _get_field_setting(self, field, key):
1492             if field not in self.settings:
1493                 self.settings[field] = {}
1494             propObj = self.settings[field]
1495             if key not in propObj:
1496                 type = propObj.get('type')
1497                 if key == 'field':
1498                     default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1499                 elif key == 'convert':
1500                     default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1501                 else:
1502                     default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,), 'function': max}.get(key, None)
1503                 propObj[key] = default
1504             return propObj[key]
1505
1506         def _resolve_field_value(self, field, value, convertNone=False):
1507             if value is None:
1508                 if not convertNone:
1509                     return None
1510             else:
1511                 value = value.lower()
1512             conversion = self._get_field_setting(field, 'convert')
1513             if conversion == 'ignore':
1514                 return None
1515             if conversion == 'string':
1516                 return value
1517             elif conversion == 'float_none':
1518                 return float_or_none(value)
1519             elif conversion == 'bytes':
1520                 return FileDownloader.parse_bytes(value)
1521             elif conversion == 'order':
1522                 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1523                 use_regex = self._get_field_setting(field, 'regex')
1524                 list_length = len(order_list)
1525                 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1526                 if use_regex and value is not None:
1527                     for i, regex in enumerate(order_list):
1528                         if regex and re.match(regex, value):
1529                             return list_length - i
1530                     return list_length - empty_pos  # not in list
1531                 else:  # not regex or  value = None
1532                     return list_length - (order_list.index(value) if value in order_list else empty_pos)
1533             else:
1534                 if value.isnumeric():
1535                     return float(value)
1536                 else:
1537                     self.settings[field]['convert'] = 'string'
1538                     return value
1539
1540         def evaluate_params(self, params, sort_extractor):
1541             self._use_free_order = params.get('prefer_free_formats', False)
1542             self._sort_user = params.get('format_sort', [])
1543             self._sort_extractor = sort_extractor
1544
1545             def add_item(field, reverse, closest, limit_text):
1546                 field = field.lower()
1547                 if field in self._order:
1548                     return
1549                 self._order.append(field)
1550                 limit = self._resolve_field_value(field, limit_text)
1551                 data = {
1552                     'reverse': reverse,
1553                     'closest': False if limit is None else closest,
1554                     'limit_text': limit_text,
1555                     'limit': limit}
1556                 if field in self.settings:
1557                     self.settings[field].update(data)
1558                 else:
1559                     self.settings[field] = data
1560
1561             sort_list = (
1562                 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1563                 + (tuple() if params.get('format_sort_force', False)
1564                    else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1565                 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1566
1567             for item in sort_list:
1568                 match = re.match(self.regex, item)
1569                 if match is None:
1570                     raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1571                 field = match.group('field')
1572                 if field is None:
1573                     continue
1574                 if self._get_field_setting(field, 'type') == 'alias':
1575                     field = self._get_field_setting(field, 'field')
1576                 reverse = match.group('reverse') is not None
1577                 closest = match.group('separator') == '~'
1578                 limit_text = match.group('limit')
1579
1580                 has_limit = limit_text is not None
1581                 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1582                 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1583
1584                 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1585                 limits = limit_text.split(":") if has_multiple_limits else (limit_text,) if has_limit else tuple()
1586                 limit_count = len(limits)
1587                 for (i, f) in enumerate(fields):
1588                     add_item(f, reverse, closest,
1589                              limits[i] if i < limit_count
1590                              else limits[0] if has_limit and not has_multiple_limits
1591                              else None)
1592
1593         def print_verbose_info(self, write_debug):
1594             if self._sort_user:
1595                 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1596             if self._sort_extractor:
1597                 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1598             write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1599                 '+' if self._get_field_setting(field, 'reverse') else '', field,
1600                 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1601                               self._get_field_setting(field, 'limit_text'),
1602                               self._get_field_setting(field, 'limit'))
1603                 if self._get_field_setting(field, 'limit_text') is not None else '')
1604                 for field in self._order if self._get_field_setting(field, 'visible')]))
1605
1606         def _calculate_field_preference_from_value(self, format, field, type, value):
1607             reverse = self._get_field_setting(field, 'reverse')
1608             closest = self._get_field_setting(field, 'closest')
1609             limit = self._get_field_setting(field, 'limit')
1610
1611             if type == 'extractor':
1612                 maximum = self._get_field_setting(field, 'max')
1613                 if value is None or (maximum is not None and value >= maximum):
1614                     value = -1
1615             elif type == 'boolean':
1616                 in_list = self._get_field_setting(field, 'in_list')
1617                 not_in_list = self._get_field_setting(field, 'not_in_list')
1618                 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1619             elif type == 'ordered':
1620                 value = self._resolve_field_value(field, value, True)
1621
1622             # try to convert to number
1623             val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1624             is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1625             if is_num:
1626                 value = val_num
1627
1628             return ((-10, 0) if value is None
1629                     else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
1630                     else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1631                     else (0, value, 0) if not reverse and (limit is None or value <= limit)
1632                     else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1633                     else (-1, value, 0))
1634
1635         def _calculate_field_preference(self, format, field):
1636             type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
1637             get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1638             if type == 'multiple':
1639                 type = 'field'  # Only 'field' is allowed in multiple for now
1640                 actual_fields = self._get_field_setting(field, 'field')
1641
1642                 def wrapped_function(values):
1643                     values = tuple(filter(lambda x: x is not None, values))
1644                     return (self._get_field_setting(field, 'function')(*values) if len(values) > 1
1645                             else values[0] if values
1646                             else None)
1647
1648                 value = wrapped_function((get_value(f) for f in actual_fields))
1649             else:
1650                 value = get_value(field)
1651             return self._calculate_field_preference_from_value(format, field, type, value)
1652
1653         def calculate_preference(self, format):
1654             # Determine missing protocol
1655             if not format.get('protocol'):
1656                 format['protocol'] = determine_protocol(format)
1657
1658             # Determine missing ext
1659             if not format.get('ext') and 'url' in format:
1660                 format['ext'] = determine_ext(format['url'])
1661             if format.get('vcodec') == 'none':
1662                 format['audio_ext'] = format['ext']
1663                 format['video_ext'] = 'none'
1664             else:
1665                 format['video_ext'] = format['ext']
1666                 format['audio_ext'] = 'none'
1667             # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
1668             #    format['preference'] = -1000
1669
1670             # Determine missing bitrates
1671             if format.get('tbr') is None:
1672                 if format.get('vbr') is not None and format.get('abr') is not None:
1673                     format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1674             else:
1675                 if format.get('vcodec') != "none" and format.get('vbr') is None:
1676                     format['vbr'] = format.get('tbr') - format.get('abr', 0)
1677                 if format.get('acodec') != "none" and format.get('abr') is None:
1678                     format['abr'] = format.get('tbr') - format.get('vbr', 0)
1679
1680             return tuple(self._calculate_field_preference(format, field) for field in self._order)
1681
1682     def _sort_formats(self, formats, field_preference=[]):
1683         if not formats:
1684             if self._downloader.params.get('ignore_no_formats_error'):
1685                 return
1686             raise ExtractorError('No video formats found')
1687         format_sort = self.FormatSort()  # params and to_screen are taken from the downloader
1688         format_sort.evaluate_params(self._downloader.params, field_preference)
1689         if self._downloader.params.get('verbose', False):
1690             format_sort.print_verbose_info(self._downloader.write_debug)
1691         formats.sort(key=lambda f: format_sort.calculate_preference(f))
1692
1693     def _check_formats(self, formats, video_id):
1694         if formats:
1695             formats[:] = filter(
1696                 lambda f: self._is_valid_url(
1697                     f['url'], video_id,
1698                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1699                 formats)
1700
1701     @staticmethod
1702     def _remove_duplicate_formats(formats):
1703         format_urls = set()
1704         unique_formats = []
1705         for f in formats:
1706             if f['url'] not in format_urls:
1707                 format_urls.add(f['url'])
1708                 unique_formats.append(f)
1709         formats[:] = unique_formats
1710
1711     def _is_valid_url(self, url, video_id, item='video', headers={}):
1712         url = self._proto_relative_url(url, scheme='http:')
1713         # For now assume non HTTP(S) URLs always valid
1714         if not (url.startswith('http://') or url.startswith('https://')):
1715             return True
1716         try:
1717             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1718             return True
1719         except ExtractorError as e:
1720             self.to_screen(
1721                 '%s: %s URL is invalid, skipping: %s'
1722                 % (video_id, item, error_to_compat_str(e.cause)))
1723             return False
1724
1725     def http_scheme(self):
1726         """ Either "http:" or "https:", depending on the user's preferences """
1727         return (
1728             'http:'
1729             if self._downloader.params.get('prefer_insecure', False)
1730             else 'https:')
1731
1732     def _proto_relative_url(self, url, scheme=None):
1733         if url is None:
1734             return url
1735         if url.startswith('//'):
1736             if scheme is None:
1737                 scheme = self.http_scheme()
1738             return scheme + url
1739         else:
1740             return url
1741
1742     def _sleep(self, timeout, video_id, msg_template=None):
1743         if msg_template is None:
1744             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1745         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1746         self.to_screen(msg)
1747         time.sleep(timeout)
1748
1749     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1750                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1751                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1752         manifest = self._download_xml(
1753             manifest_url, video_id, 'Downloading f4m manifest',
1754             'Unable to download f4m manifest',
1755             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1756             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1757             transform_source=transform_source,
1758             fatal=fatal, data=data, headers=headers, query=query)
1759
1760         if manifest is False:
1761             return []
1762
1763         return self._parse_f4m_formats(
1764             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1765             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1766
1767     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1768                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1769                            fatal=True, m3u8_id=None):
1770         if not isinstance(manifest, compat_etree_Element) and not fatal:
1771             return []
1772
1773         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1774         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1775         if akamai_pv is not None and ';' in akamai_pv.text:
1776             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1777             if playerVerificationChallenge.strip() != '':
1778                 return []
1779
1780         formats = []
1781         manifest_version = '1.0'
1782         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1783         if not media_nodes:
1784             manifest_version = '2.0'
1785             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1786         # Remove unsupported DRM protected media from final formats
1787         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1788         media_nodes = remove_encrypted_media(media_nodes)
1789         if not media_nodes:
1790             return formats
1791
1792         manifest_base_url = get_base_url(manifest)
1793
1794         bootstrap_info = xpath_element(
1795             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1796             'bootstrap info', default=None)
1797
1798         vcodec = None
1799         mime_type = xpath_text(
1800             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1801             'base URL', default=None)
1802         if mime_type and mime_type.startswith('audio/'):
1803             vcodec = 'none'
1804
1805         for i, media_el in enumerate(media_nodes):
1806             tbr = int_or_none(media_el.attrib.get('bitrate'))
1807             width = int_or_none(media_el.attrib.get('width'))
1808             height = int_or_none(media_el.attrib.get('height'))
1809             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1810             # If <bootstrapInfo> is present, the specified f4m is a
1811             # stream-level manifest, and only set-level manifests may refer to
1812             # external resources.  See section 11.4 and section 4 of F4M spec
1813             if bootstrap_info is None:
1814                 media_url = None
1815                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1816                 if manifest_version == '2.0':
1817                     media_url = media_el.attrib.get('href')
1818                 if media_url is None:
1819                     media_url = media_el.attrib.get('url')
1820                 if not media_url:
1821                     continue
1822                 manifest_url = (
1823                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1824                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1825                 # If media_url is itself a f4m manifest do the recursive extraction
1826                 # since bitrates in parent manifest (this one) and media_url manifest
1827                 # may differ leading to inability to resolve the format by requested
1828                 # bitrate in f4m downloader
1829                 ext = determine_ext(manifest_url)
1830                 if ext == 'f4m':
1831                     f4m_formats = self._extract_f4m_formats(
1832                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1833                         transform_source=transform_source, fatal=fatal)
1834                     # Sometimes stream-level manifest contains single media entry that
1835                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1836                     # At the same time parent's media entry in set-level manifest may
1837                     # contain it. We will copy it from parent in such cases.
1838                     if len(f4m_formats) == 1:
1839                         f = f4m_formats[0]
1840                         f.update({
1841                             'tbr': f.get('tbr') or tbr,
1842                             'width': f.get('width') or width,
1843                             'height': f.get('height') or height,
1844                             'format_id': f.get('format_id') if not tbr else format_id,
1845                             'vcodec': vcodec,
1846                         })
1847                     formats.extend(f4m_formats)
1848                     continue
1849                 elif ext == 'm3u8':
1850                     formats.extend(self._extract_m3u8_formats(
1851                         manifest_url, video_id, 'mp4', preference=preference,
1852                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1853                     continue
1854             formats.append({
1855                 'format_id': format_id,
1856                 'url': manifest_url,
1857                 'manifest_url': manifest_url,
1858                 'ext': 'flv' if bootstrap_info is not None else None,
1859                 'protocol': 'f4m',
1860                 'tbr': tbr,
1861                 'width': width,
1862                 'height': height,
1863                 'vcodec': vcodec,
1864                 'preference': preference,
1865                 'quality': quality,
1866             })
1867         return formats
1868
1869     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1870         return {
1871             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1872             'url': m3u8_url,
1873             'ext': ext,
1874             'protocol': 'm3u8',
1875             'preference': preference - 100 if preference else -100,
1876             'quality': quality,
1877             'resolution': 'multiple',
1878             'format_note': 'Quality selection URL',
1879         }
1880
1881     def _extract_m3u8_formats(self, *args, **kwargs):
1882         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
1883         if subs:
1884             self.report_warning(bug_reports_message(
1885                 "Ignoring subtitle tracks found in the HLS manifest; "
1886                 "if any subtitle tracks are missing,"
1887             ))
1888         return fmts
1889
1890     def _extract_m3u8_formats_and_subtitles(
1891             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8',
1892             preference=None, quality=None, m3u8_id=None, note=None,
1893             errnote=None, fatal=True, live=False, data=None, headers={},
1894             query={}):
1895
1896         res = self._download_webpage_handle(
1897             m3u8_url, video_id,
1898             note=note or 'Downloading m3u8 information',
1899             errnote=errnote or 'Failed to download m3u8 information',
1900             fatal=fatal, data=data, headers=headers, query=query)
1901
1902         if res is False:
1903             return [], {}
1904
1905         m3u8_doc, urlh = res
1906         m3u8_url = urlh.geturl()
1907
1908         return self._parse_m3u8_formats_and_subtitles(
1909             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1910             preference=preference, quality=quality, m3u8_id=m3u8_id,
1911             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
1912             headers=headers, query=query, video_id=video_id)
1913
1914     def _parse_m3u8_formats_and_subtitles(
1915             self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8',
1916             preference=None, quality=None, m3u8_id=None, live=False, note=None,
1917             errnote=None, fatal=True, data=None, headers={}, query={},
1918             video_id=None):
1919
1920         if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
1921             return [], {}
1922
1923         if (not self._downloader.params.get('allow_unplayable_formats')
1924                 and re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc)):  # Apple FairPlay
1925             return [], {}
1926
1927         formats = []
1928
1929         subtitles = {}
1930
1931         format_url = lambda u: (
1932             u
1933             if re.match(r'^https?://', u)
1934             else compat_urlparse.urljoin(m3u8_url, u))
1935
1936         split_discontinuity = self._downloader.params.get('hls_split_discontinuity', False)
1937
1938         # References:
1939         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
1940         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
1941         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
1942
1943         # We should try extracting formats only from master playlists [1, 4.3.4],
1944         # i.e. playlists that describe available qualities. On the other hand
1945         # media playlists [1, 4.3.3] should be returned as is since they contain
1946         # just the media without qualities renditions.
1947         # Fortunately, master playlist can be easily distinguished from media
1948         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
1949         # master playlist tags MUST NOT appear in a media playlist and vice versa.
1950         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
1951         # media playlist and MUST NOT appear in master playlist thus we can
1952         # clearly detect media playlist with this criterion.
1953
1954         def _extract_m3u8_playlist_formats(format_url=None, m3u8_doc=None, video_id=None,
1955                                            fatal=True, data=None, headers={}):
1956             if not m3u8_doc:
1957                 if not format_url:
1958                     return []
1959                 res = self._download_webpage_handle(
1960                     format_url, video_id,
1961                     note=False,
1962                     errnote='Failed to download m3u8 playlist information',
1963                     fatal=fatal, data=data, headers=headers)
1964
1965                 if res is False:
1966                     return []
1967
1968                 m3u8_doc, urlh = res
1969                 format_url = urlh.geturl()
1970
1971             playlist_formats = []
1972             i = (
1973                 0
1974                 if split_discontinuity
1975                 else None)
1976             format_info = {
1977                 'index': i,
1978                 'key_data': None,
1979                 'files': [],
1980             }
1981             for line in m3u8_doc.splitlines():
1982                 if not line.startswith('#'):
1983                     format_info['files'].append(line)
1984                 elif split_discontinuity and line.startswith('#EXT-X-DISCONTINUITY'):
1985                     i += 1
1986                     playlist_formats.append(format_info)
1987                     format_info = {
1988                         'index': i,
1989                         'url': format_url,
1990                         'files': [],
1991                     }
1992             playlist_formats.append(format_info)
1993             return playlist_formats
1994
1995         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
1996
1997             playlist_formats = _extract_m3u8_playlist_formats(m3u8_doc=m3u8_doc)
1998
1999             for format in playlist_formats:
2000                 format_id = []
2001                 if m3u8_id:
2002                     format_id.append(m3u8_id)
2003                 format_index = format.get('index')
2004                 if format_index:
2005                     format_id.append(str(format_index))
2006                 f = {
2007                     'format_id': '-'.join(format_id),
2008                     'format_index': format_index,
2009                     'url': m3u8_url,
2010                     'ext': ext,
2011                     'protocol': entry_protocol,
2012                     'preference': preference,
2013                     'quality': quality,
2014                 }
2015                 formats.append(f)
2016
2017             return formats, subtitles
2018
2019         groups = {}
2020         last_stream_inf = {}
2021
2022         def extract_media(x_media_line):
2023             media = parse_m3u8_attributes(x_media_line)
2024             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2025             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2026             if not (media_type and group_id and name):
2027                 return
2028             groups.setdefault(group_id, []).append(media)
2029             # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2030             if media_type == 'SUBTITLES':
2031                 lang = media['LANGUAGE']  # XXX: normalise?
2032                 url = format_url(media['URI'])
2033                 sub_info = {
2034                     'url': url,
2035                     'ext': determine_ext(url),
2036                 }
2037                 if sub_info['ext'] == 'm3u8':
2038                     # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2039                     # files may contain is WebVTT:
2040                     # <https://tools.ietf.org/html/rfc8216#section-3.1>
2041                     sub_info['ext'] = 'vtt'
2042                     sub_info['protocol'] = 'm3u8_native'
2043                 subtitles.setdefault(lang, []).append(sub_info)
2044             if media_type not in ('VIDEO', 'AUDIO'):
2045                 return
2046             media_url = media.get('URI')
2047             if media_url:
2048                 manifest_url = format_url(media_url)
2049                 format_id = []
2050                 playlist_formats = _extract_m3u8_playlist_formats(manifest_url, video_id=video_id,
2051                                                                   fatal=fatal, data=data, headers=headers)
2052
2053                 for format in playlist_formats:
2054                     format_index = format.get('index')
2055                     for v in (m3u8_id, group_id, name):
2056                         if v:
2057                             format_id.append(v)
2058                     if format_index:
2059                         format_id.append(str(format_index))
2060                     f = {
2061                         'format_id': '-'.join(format_id),
2062                         'format_index': format_index,
2063                         'url': manifest_url,
2064                         'manifest_url': m3u8_url,
2065                         'language': media.get('LANGUAGE'),
2066                         'ext': ext,
2067                         'protocol': entry_protocol,
2068                         'preference': preference,
2069                         'quality': quality,
2070                     }
2071                     if media_type == 'AUDIO':
2072                         f['vcodec'] = 'none'
2073                     formats.append(f)
2074
2075         def build_stream_name():
2076             # Despite specification does not mention NAME attribute for
2077             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2078             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2079             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2080             stream_name = last_stream_inf.get('NAME')
2081             if stream_name:
2082                 return stream_name
2083             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2084             # from corresponding rendition group
2085             stream_group_id = last_stream_inf.get('VIDEO')
2086             if not stream_group_id:
2087                 return
2088             stream_group = groups.get(stream_group_id)
2089             if not stream_group:
2090                 return stream_group_id
2091             rendition = stream_group[0]
2092             return rendition.get('NAME') or stream_group_id
2093
2094         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2095         # chance to detect video only formats when EXT-X-STREAM-INF tags
2096         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2097         for line in m3u8_doc.splitlines():
2098             if line.startswith('#EXT-X-MEDIA:'):
2099                 extract_media(line)
2100
2101         for line in m3u8_doc.splitlines():
2102             if line.startswith('#EXT-X-STREAM-INF:'):
2103                 last_stream_inf = parse_m3u8_attributes(line)
2104             elif line.startswith('#') or not line.strip():
2105                 continue
2106             else:
2107                 tbr = float_or_none(
2108                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2109                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2110                 manifest_url = format_url(line.strip())
2111
2112                 playlist_formats = _extract_m3u8_playlist_formats(manifest_url, video_id=video_id,
2113                                                                   fatal=fatal, data=data, headers=headers)
2114
2115                 for frmt in playlist_formats:
2116                     format_id = []
2117                     if m3u8_id:
2118                         format_id.append(m3u8_id)
2119                     format_index = frmt.get('index')
2120                     stream_name = build_stream_name()
2121                     # Bandwidth of live streams may differ over time thus making
2122                     # format_id unpredictable. So it's better to keep provided
2123                     # format_id intact.
2124                     if not live:
2125                         format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
2126                     if format_index:
2127                         format_id.append(str(format_index))
2128                     f = {
2129                         'format_id': '-'.join(format_id),
2130                         'format_index': format_index,
2131                         'url': manifest_url,
2132                         'manifest_url': m3u8_url,
2133                         'tbr': tbr,
2134                         'ext': ext,
2135                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2136                         'protocol': entry_protocol,
2137                         'preference': preference,
2138                         'quality': quality,
2139                     }
2140                     resolution = last_stream_inf.get('RESOLUTION')
2141                     if resolution:
2142                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2143                         if mobj:
2144                             f['width'] = int(mobj.group('width'))
2145                             f['height'] = int(mobj.group('height'))
2146                     # Unified Streaming Platform
2147                     mobj = re.search(
2148                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2149                     if mobj:
2150                         abr, vbr = mobj.groups()
2151                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2152                         f.update({
2153                             'vbr': vbr,
2154                             'abr': abr,
2155                         })
2156                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2157                     f.update(codecs)
2158                     audio_group_id = last_stream_inf.get('AUDIO')
2159                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2160                     # references a rendition group MUST have a CODECS attribute.
2161                     # However, this is not always respected, for example, [2]
2162                     # contains EXT-X-STREAM-INF tag which references AUDIO
2163                     # rendition group but does not have CODECS and despite
2164                     # referencing an audio group it represents a complete
2165                     # (with audio and video) format. So, for such cases we will
2166                     # ignore references to rendition groups and treat them
2167                     # as complete formats.
2168                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2169                         audio_group = groups.get(audio_group_id)
2170                         if audio_group and audio_group[0].get('URI'):
2171                             # TODO: update acodec for audio only formats with
2172                             # the same GROUP-ID
2173                             f['acodec'] = 'none'
2174                     if not f.get('ext'):
2175                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2176                     formats.append(f)
2177
2178                     # for DailyMotion
2179                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2180                     if progressive_uri:
2181                         http_f = f.copy()
2182                         del http_f['manifest_url']
2183                         http_f.update({
2184                             'format_id': f['format_id'].replace('hls-', 'http-'),
2185                             'protocol': 'http',
2186                             'url': progressive_uri,
2187                         })
2188                         formats.append(http_f)
2189
2190                 last_stream_inf = {}
2191         return formats, subtitles
2192
2193     @staticmethod
2194     def _xpath_ns(path, namespace=None):
2195         if not namespace:
2196             return path
2197         out = []
2198         for c in path.split('/'):
2199             if not c or c == '.':
2200                 out.append(c)
2201             else:
2202                 out.append('{%s}%s' % (namespace, c))
2203         return '/'.join(out)
2204
2205     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2206         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2207
2208         if smil is False:
2209             assert not fatal
2210             return []
2211
2212         namespace = self._parse_smil_namespace(smil)
2213
2214         return self._parse_smil_formats(
2215             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2216
2217     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2218         smil = self._download_smil(smil_url, video_id, fatal=fatal)
2219         if smil is False:
2220             return {}
2221         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2222
2223     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2224         return self._download_xml(
2225             smil_url, video_id, 'Downloading SMIL file',
2226             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2227
2228     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2229         namespace = self._parse_smil_namespace(smil)
2230
2231         formats = self._parse_smil_formats(
2232             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2233         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2234
2235         video_id = os.path.splitext(url_basename(smil_url))[0]
2236         title = None
2237         description = None
2238         upload_date = None
2239         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2240             name = meta.attrib.get('name')
2241             content = meta.attrib.get('content')
2242             if not name or not content:
2243                 continue
2244             if not title and name == 'title':
2245                 title = content
2246             elif not description and name in ('description', 'abstract'):
2247                 description = content
2248             elif not upload_date and name == 'date':
2249                 upload_date = unified_strdate(content)
2250
2251         thumbnails = [{
2252             'id': image.get('type'),
2253             'url': image.get('src'),
2254             'width': int_or_none(image.get('width')),
2255             'height': int_or_none(image.get('height')),
2256         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2257
2258         return {
2259             'id': video_id,
2260             'title': title or video_id,
2261             'description': description,
2262             'upload_date': upload_date,
2263             'thumbnails': thumbnails,
2264             'formats': formats,
2265             'subtitles': subtitles,
2266         }
2267
2268     def _parse_smil_namespace(self, smil):
2269         return self._search_regex(
2270             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2271
2272     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2273         base = smil_url
2274         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2275             b = meta.get('base') or meta.get('httpBase')
2276             if b:
2277                 base = b
2278                 break
2279
2280         formats = []
2281         rtmp_count = 0
2282         http_count = 0
2283         m3u8_count = 0
2284
2285         srcs = []
2286         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2287         for medium in media:
2288             src = medium.get('src')
2289             if not src or src in srcs:
2290                 continue
2291             srcs.append(src)
2292
2293             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2294             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2295             width = int_or_none(medium.get('width'))
2296             height = int_or_none(medium.get('height'))
2297             proto = medium.get('proto')
2298             ext = medium.get('ext')
2299             src_ext = determine_ext(src)
2300             streamer = medium.get('streamer') or base
2301
2302             if proto == 'rtmp' or streamer.startswith('rtmp'):
2303                 rtmp_count += 1
2304                 formats.append({
2305                     'url': streamer,
2306                     'play_path': src,
2307                     'ext': 'flv',
2308                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2309                     'tbr': bitrate,
2310                     'filesize': filesize,
2311                     'width': width,
2312                     'height': height,
2313                 })
2314                 if transform_rtmp_url:
2315                     streamer, src = transform_rtmp_url(streamer, src)
2316                     formats[-1].update({
2317                         'url': streamer,
2318                         'play_path': src,
2319                     })
2320                 continue
2321
2322             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2323             src_url = src_url.strip()
2324
2325             if proto == 'm3u8' or src_ext == 'm3u8':
2326                 m3u8_formats = self._extract_m3u8_formats(
2327                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2328                 if len(m3u8_formats) == 1:
2329                     m3u8_count += 1
2330                     m3u8_formats[0].update({
2331                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2332                         'tbr': bitrate,
2333                         'width': width,
2334                         'height': height,
2335                     })
2336                 formats.extend(m3u8_formats)
2337             elif src_ext == 'f4m':
2338                 f4m_url = src_url
2339                 if not f4m_params:
2340                     f4m_params = {
2341                         'hdcore': '3.2.0',
2342                         'plugin': 'flowplayer-3.2.0.1',
2343                     }
2344                 f4m_url += '&' if '?' in f4m_url else '?'
2345                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2346                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2347             elif src_ext == 'mpd':
2348                 formats.extend(self._extract_mpd_formats(
2349                     src_url, video_id, mpd_id='dash', fatal=False))
2350             elif re.search(r'\.ism/[Mm]anifest', src_url):
2351                 formats.extend(self._extract_ism_formats(
2352                     src_url, video_id, ism_id='mss', fatal=False))
2353             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2354                 http_count += 1
2355                 formats.append({
2356                     'url': src_url,
2357                     'ext': ext or src_ext or 'flv',
2358                     'format_id': 'http-%d' % (bitrate or http_count),
2359                     'tbr': bitrate,
2360                     'filesize': filesize,
2361                     'width': width,
2362                     'height': height,
2363                 })
2364
2365         return formats
2366
2367     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2368         urls = []
2369         subtitles = {}
2370         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2371             src = textstream.get('src')
2372             if not src or src in urls:
2373                 continue
2374             urls.append(src)
2375             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2376             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2377             subtitles.setdefault(lang, []).append({
2378                 'url': src,
2379                 'ext': ext,
2380             })
2381         return subtitles
2382
2383     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2384         xspf = self._download_xml(
2385             xspf_url, playlist_id, 'Downloading xpsf playlist',
2386             'Unable to download xspf manifest', fatal=fatal)
2387         if xspf is False:
2388             return []
2389         return self._parse_xspf(
2390             xspf, playlist_id, xspf_url=xspf_url,
2391             xspf_base_url=base_url(xspf_url))
2392
2393     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2394         NS_MAP = {
2395             'xspf': 'http://xspf.org/ns/0/',
2396             's1': 'http://static.streamone.nl/player/ns/0',
2397         }
2398
2399         entries = []
2400         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2401             title = xpath_text(
2402                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2403             description = xpath_text(
2404                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2405             thumbnail = xpath_text(
2406                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2407             duration = float_or_none(
2408                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2409
2410             formats = []
2411             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2412                 format_url = urljoin(xspf_base_url, location.text)
2413                 if not format_url:
2414                     continue
2415                 formats.append({
2416                     'url': format_url,
2417                     'manifest_url': xspf_url,
2418                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2419                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2420                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2421                 })
2422             self._sort_formats(formats)
2423
2424             entries.append({
2425                 'id': playlist_id,
2426                 'title': title,
2427                 'description': description,
2428                 'thumbnail': thumbnail,
2429                 'duration': duration,
2430                 'formats': formats,
2431             })
2432         return entries
2433
2434     def _extract_mpd_formats(self, *args, **kwargs):
2435         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2436         if subs:
2437             self.report_warning(bug_reports_message(
2438                 "Ignoring subtitle tracks found in the DASH manifest; "
2439                 "if any subtitle tracks are missing,"
2440             ))
2441         return fmts
2442
2443     def _extract_mpd_formats_and_subtitles(
2444             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2445             fatal=True, data=None, headers={}, query={}):
2446         res = self._download_xml_handle(
2447             mpd_url, video_id,
2448             note=note or 'Downloading MPD manifest',
2449             errnote=errnote or 'Failed to download MPD manifest',
2450             fatal=fatal, data=data, headers=headers, query=query)
2451         if res is False:
2452             return [], {}
2453         mpd_doc, urlh = res
2454         if mpd_doc is None:
2455             return [], {}
2456         mpd_base_url = base_url(urlh.geturl())
2457
2458         return self._parse_mpd_formats_and_subtitles(
2459             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2460
2461     def _parse_mpd_formats(self, *args, **kwargs):
2462         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2463         if subs:
2464             self.report_warning(bug_reports_message(
2465                 "Ignoring subtitle tracks found in the DASH manifest; "
2466                 "if any subtitle tracks are missing,"
2467             ))
2468         return fmts
2469
2470     def _parse_mpd_formats_and_subtitles(
2471             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2472         """
2473         Parse formats from MPD manifest.
2474         References:
2475          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2476             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2477          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2478         """
2479         if not self._downloader.params.get('dynamic_mpd', True):
2480             if mpd_doc.get('type') == 'dynamic':
2481                 return [], {}
2482
2483         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2484
2485         def _add_ns(path):
2486             return self._xpath_ns(path, namespace)
2487
2488         def is_drm_protected(element):
2489             return element.find(_add_ns('ContentProtection')) is not None
2490
2491         def extract_multisegment_info(element, ms_parent_info):
2492             ms_info = ms_parent_info.copy()
2493
2494             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2495             # common attributes and elements.  We will only extract relevant
2496             # for us.
2497             def extract_common(source):
2498                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2499                 if segment_timeline is not None:
2500                     s_e = segment_timeline.findall(_add_ns('S'))
2501                     if s_e:
2502                         ms_info['total_number'] = 0
2503                         ms_info['s'] = []
2504                         for s in s_e:
2505                             r = int(s.get('r', 0))
2506                             ms_info['total_number'] += 1 + r
2507                             ms_info['s'].append({
2508                                 't': int(s.get('t', 0)),
2509                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2510                                 'd': int(s.attrib['d']),
2511                                 'r': r,
2512                             })
2513                 start_number = source.get('startNumber')
2514                 if start_number:
2515                     ms_info['start_number'] = int(start_number)
2516                 timescale = source.get('timescale')
2517                 if timescale:
2518                     ms_info['timescale'] = int(timescale)
2519                 segment_duration = source.get('duration')
2520                 if segment_duration:
2521                     ms_info['segment_duration'] = float(segment_duration)
2522
2523             def extract_Initialization(source):
2524                 initialization = source.find(_add_ns('Initialization'))
2525                 if initialization is not None:
2526                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2527
2528             segment_list = element.find(_add_ns('SegmentList'))
2529             if segment_list is not None:
2530                 extract_common(segment_list)
2531                 extract_Initialization(segment_list)
2532                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2533                 if segment_urls_e:
2534                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2535             else:
2536                 segment_template = element.find(_add_ns('SegmentTemplate'))
2537                 if segment_template is not None:
2538                     extract_common(segment_template)
2539                     media = segment_template.get('media')
2540                     if media:
2541                         ms_info['media'] = media
2542                     initialization = segment_template.get('initialization')
2543                     if initialization:
2544                         ms_info['initialization'] = initialization
2545                     else:
2546                         extract_Initialization(segment_template)
2547             return ms_info
2548
2549         skip_unplayable = not self._downloader.params.get('allow_unplayable_formats')
2550
2551         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2552         formats = []
2553         subtitles = {}
2554         for period in mpd_doc.findall(_add_ns('Period')):
2555             period_duration = parse_duration(period.get('duration')) or mpd_duration
2556             period_ms_info = extract_multisegment_info(period, {
2557                 'start_number': 1,
2558                 'timescale': 1,
2559             })
2560             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2561                 if skip_unplayable and is_drm_protected(adaptation_set):
2562                     continue
2563                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2564                 for representation in adaptation_set.findall(_add_ns('Representation')):
2565                     if skip_unplayable and is_drm_protected(representation):
2566                         continue
2567                     representation_attrib = adaptation_set.attrib.copy()
2568                     representation_attrib.update(representation.attrib)
2569                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2570                     mime_type = representation_attrib['mimeType']
2571                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2572
2573                     if content_type in ('video', 'audio', 'text'):
2574                         base_url = ''
2575                         for element in (representation, adaptation_set, period, mpd_doc):
2576                             base_url_e = element.find(_add_ns('BaseURL'))
2577                             if base_url_e is not None:
2578                                 base_url = base_url_e.text + base_url
2579                                 if re.match(r'^https?://', base_url):
2580                                     break
2581                         if mpd_base_url and not re.match(r'^https?://', base_url):
2582                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
2583                                 mpd_base_url += '/'
2584                             base_url = mpd_base_url + base_url
2585                         representation_id = representation_attrib.get('id')
2586                         lang = representation_attrib.get('lang')
2587                         url_el = representation.find(_add_ns('BaseURL'))
2588                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2589                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2590                         if content_type in ('video', 'audio'):
2591                             f = {
2592                                 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
2593                                 'manifest_url': mpd_url,
2594                                 'ext': mimetype2ext(mime_type),
2595                                 'width': int_or_none(representation_attrib.get('width')),
2596                                 'height': int_or_none(representation_attrib.get('height')),
2597                                 'tbr': float_or_none(bandwidth, 1000),
2598                                 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2599                                 'fps': int_or_none(representation_attrib.get('frameRate')),
2600                                 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2601                                 'format_note': 'DASH %s' % content_type,
2602                                 'filesize': filesize,
2603                                 'container': mimetype2ext(mime_type) + '_dash',
2604                             }
2605                             f.update(parse_codecs(representation_attrib.get('codecs')))
2606                         elif content_type == 'text':
2607                             f = {
2608                                 'ext': mimetype2ext(mime_type),
2609                                 'manifest_url': mpd_url,
2610                                 'filesize': filesize,
2611                             }
2612                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2613
2614                         def prepare_template(template_name, identifiers):
2615                             tmpl = representation_ms_info[template_name]
2616                             # First of, % characters outside $...$ templates
2617                             # must be escaped by doubling for proper processing
2618                             # by % operator string formatting used further (see
2619                             # https://github.com/ytdl-org/youtube-dl/issues/16867).
2620                             t = ''
2621                             in_template = False
2622                             for c in tmpl:
2623                                 t += c
2624                                 if c == '$':
2625                                     in_template = not in_template
2626                                 elif c == '%' and not in_template:
2627                                     t += c
2628                             # Next, $...$ templates are translated to their
2629                             # %(...) counterparts to be used with % operator
2630                             t = t.replace('$RepresentationID$', representation_id)
2631                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2632                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2633                             t.replace('$$', '$')
2634                             return t
2635
2636                         # @initialization is a regular template like @media one
2637                         # so it should be handled just the same way (see
2638                         # https://github.com/ytdl-org/youtube-dl/issues/11605)
2639                         if 'initialization' in representation_ms_info:
2640                             initialization_template = prepare_template(
2641                                 'initialization',
2642                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2643                                 # $Time$ shall not be included for @initialization thus
2644                                 # only $Bandwidth$ remains
2645                                 ('Bandwidth', ))
2646                             representation_ms_info['initialization_url'] = initialization_template % {
2647                                 'Bandwidth': bandwidth,
2648                             }
2649
2650                         def location_key(location):
2651                             return 'url' if re.match(r'^https?://', location) else 'path'
2652
2653                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2654
2655                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2656                             media_location_key = location_key(media_template)
2657
2658                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2659                             # can't be used at the same time
2660                             if '%(Number' in media_template and 's' not in representation_ms_info:
2661                                 segment_duration = None
2662                                 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2663                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2664                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2665                                 representation_ms_info['fragments'] = [{
2666                                     media_location_key: media_template % {
2667                                         'Number': segment_number,
2668                                         'Bandwidth': bandwidth,
2669                                     },
2670                                     'duration': segment_duration,
2671                                 } for segment_number in range(
2672                                     representation_ms_info['start_number'],
2673                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2674                             else:
2675                                 # $Number*$ or $Time$ in media template with S list available
2676                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2677                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2678                                 representation_ms_info['fragments'] = []
2679                                 segment_time = 0
2680                                 segment_d = None
2681                                 segment_number = representation_ms_info['start_number']
2682
2683                                 def add_segment_url():
2684                                     segment_url = media_template % {
2685                                         'Time': segment_time,
2686                                         'Bandwidth': bandwidth,
2687                                         'Number': segment_number,
2688                                     }
2689                                     representation_ms_info['fragments'].append({
2690                                         media_location_key: segment_url,
2691                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2692                                     })
2693
2694                                 for num, s in enumerate(representation_ms_info['s']):
2695                                     segment_time = s.get('t') or segment_time
2696                                     segment_d = s['d']
2697                                     add_segment_url()
2698                                     segment_number += 1
2699                                     for r in range(s.get('r', 0)):
2700                                         segment_time += segment_d
2701                                         add_segment_url()
2702                                         segment_number += 1
2703                                     segment_time += segment_d
2704                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2705                             # No media template
2706                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2707                             # or any YouTube dashsegments video
2708                             fragments = []
2709                             segment_index = 0
2710                             timescale = representation_ms_info['timescale']
2711                             for s in representation_ms_info['s']:
2712                                 duration = float_or_none(s['d'], timescale)
2713                                 for r in range(s.get('r', 0) + 1):
2714                                     segment_uri = representation_ms_info['segment_urls'][segment_index]
2715                                     fragments.append({
2716                                         location_key(segment_uri): segment_uri,
2717                                         'duration': duration,
2718                                     })
2719                                     segment_index += 1
2720                             representation_ms_info['fragments'] = fragments
2721                         elif 'segment_urls' in representation_ms_info:
2722                             # Segment URLs with no SegmentTimeline
2723                             # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2724                             # https://github.com/ytdl-org/youtube-dl/pull/14844
2725                             fragments = []
2726                             segment_duration = float_or_none(
2727                                 representation_ms_info['segment_duration'],
2728                                 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2729                             for segment_url in representation_ms_info['segment_urls']:
2730                                 fragment = {
2731                                     location_key(segment_url): segment_url,
2732                                 }
2733                                 if segment_duration:
2734                                     fragment['duration'] = segment_duration
2735                                 fragments.append(fragment)
2736                             representation_ms_info['fragments'] = fragments
2737                         # If there is a fragments key available then we correctly recognized fragmented media.
2738                         # Otherwise we will assume unfragmented media with direct access. Technically, such
2739                         # assumption is not necessarily correct since we may simply have no support for
2740                         # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2741                         if 'fragments' in representation_ms_info:
2742                             f.update({
2743                                 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2744                                 'url': mpd_url or base_url,
2745                                 'fragment_base_url': base_url,
2746                                 'fragments': [],
2747                                 'protocol': 'http_dash_segments',
2748                             })
2749                             if 'initialization_url' in representation_ms_info:
2750                                 initialization_url = representation_ms_info['initialization_url']
2751                                 if not f.get('url'):
2752                                     f['url'] = initialization_url
2753                                 f['fragments'].append({location_key(initialization_url): initialization_url})
2754                             f['fragments'].extend(representation_ms_info['fragments'])
2755                         else:
2756                             # Assuming direct URL to unfragmented media.
2757                             f['url'] = base_url
2758                         if content_type in ('video', 'audio'):
2759                             formats.append(f)
2760                         elif content_type == 'text':
2761                             subtitles.setdefault(lang or 'und', []).append(f)
2762                     else:
2763                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2764         return formats, subtitles
2765
2766     def _extract_ism_formats(self, *args, **kwargs):
2767         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2768         if subs:
2769             self.report_warning(bug_reports_message(
2770                 "Ignoring subtitle tracks found in the ISM manifest; "
2771                 "if any subtitle tracks are missing,"
2772             ))
2773         return fmts
2774
2775     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2776         res = self._download_xml_handle(
2777             ism_url, video_id,
2778             note=note or 'Downloading ISM manifest',
2779             errnote=errnote or 'Failed to download ISM manifest',
2780             fatal=fatal, data=data, headers=headers, query=query)
2781         if res is False:
2782             return [], {}
2783         ism_doc, urlh = res
2784         if ism_doc is None:
2785             return [], {}
2786
2787         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
2788
2789     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
2790         """
2791         Parse formats from ISM manifest.
2792         References:
2793          1. [MS-SSTR]: Smooth Streaming Protocol,
2794             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2795         """
2796         if ism_doc.get('IsLive') == 'TRUE':
2797             return [], {}
2798         if (not self._downloader.params.get('allow_unplayable_formats')
2799                 and ism_doc.find('Protection') is not None):
2800             return [], {}
2801
2802         duration = int(ism_doc.attrib['Duration'])
2803         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2804
2805         formats = []
2806         subtitles = {}
2807         for stream in ism_doc.findall('StreamIndex'):
2808             stream_type = stream.get('Type')
2809             if stream_type not in ('video', 'audio', 'text'):
2810                 continue
2811             url_pattern = stream.attrib['Url']
2812             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2813             stream_name = stream.get('Name')
2814             stream_language = stream.get('Language', 'und')
2815             for track in stream.findall('QualityLevel'):
2816                 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
2817                 # TODO: add support for WVC1 and WMAP
2818                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
2819                     self.report_warning('%s is not a supported codec' % fourcc)
2820                     continue
2821                 tbr = int(track.attrib['Bitrate']) // 1000
2822                 # [1] does not mention Width and Height attributes. However,
2823                 # they're often present while MaxWidth and MaxHeight are
2824                 # missing, so should be used as fallbacks
2825                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2826                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2827                 sampling_rate = int_or_none(track.get('SamplingRate'))
2828
2829                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2830                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2831
2832                 fragments = []
2833                 fragment_ctx = {
2834                     'time': 0,
2835                 }
2836                 stream_fragments = stream.findall('c')
2837                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2838                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2839                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2840                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2841                     if not fragment_ctx['duration']:
2842                         try:
2843                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2844                         except IndexError:
2845                             next_fragment_time = duration
2846                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2847                     for _ in range(fragment_repeat):
2848                         fragments.append({
2849                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2850                             'duration': fragment_ctx['duration'] / stream_timescale,
2851                         })
2852                         fragment_ctx['time'] += fragment_ctx['duration']
2853
2854                 format_id = []
2855                 if ism_id:
2856                     format_id.append(ism_id)
2857                 if stream_name:
2858                     format_id.append(stream_name)
2859                 format_id.append(compat_str(tbr))
2860
2861                 if stream_type == 'text':
2862                     subtitles.setdefault(stream_language, []).append({
2863                         'ext': 'ismt',
2864                         'protocol': 'ism',
2865                         'url': ism_url,
2866                         'manifest_url': ism_url,
2867                         'fragments': fragments,
2868                         '_download_params': {
2869                             'stream_type': stream_type,
2870                             'duration': duration,
2871                             'timescale': stream_timescale,
2872                             'fourcc': fourcc,
2873                             'language': stream_language,
2874                             'codec_private_data': track.get('CodecPrivateData'),
2875                         }
2876                     })
2877                 elif stream_type in ('video', 'audio'):
2878                     formats.append({
2879                         'format_id': '-'.join(format_id),
2880                         'url': ism_url,
2881                         'manifest_url': ism_url,
2882                         'ext': 'ismv' if stream_type == 'video' else 'isma',
2883                         'width': width,
2884                         'height': height,
2885                         'tbr': tbr,
2886                         'asr': sampling_rate,
2887                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
2888                         'acodec': 'none' if stream_type == 'video' else fourcc,
2889                         'protocol': 'ism',
2890                         'fragments': fragments,
2891                         '_download_params': {
2892                             'stream_type': stream_type,
2893                             'duration': duration,
2894                             'timescale': stream_timescale,
2895                             'width': width or 0,
2896                             'height': height or 0,
2897                             'fourcc': fourcc,
2898                             'language': stream_language,
2899                             'codec_private_data': track.get('CodecPrivateData'),
2900                             'sampling_rate': sampling_rate,
2901                             'channels': int_or_none(track.get('Channels', 2)),
2902                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2903                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2904                         },
2905                     })
2906         return formats, subtitles
2907
2908     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
2909         def absolute_url(item_url):
2910             return urljoin(base_url, item_url)
2911
2912         def parse_content_type(content_type):
2913             if not content_type:
2914                 return {}
2915             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2916             if ctr:
2917                 mimetype, codecs = ctr.groups()
2918                 f = parse_codecs(codecs)
2919                 f['ext'] = mimetype2ext(mimetype)
2920                 return f
2921             return {}
2922
2923         def _media_formats(src, cur_media_type, type_info={}):
2924             full_url = absolute_url(src)
2925             ext = type_info.get('ext') or determine_ext(full_url)
2926             if ext == 'm3u8':
2927                 is_plain_url = False
2928                 formats = self._extract_m3u8_formats(
2929                     full_url, video_id, ext='mp4',
2930                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2931                     preference=preference, quality=quality, fatal=False)
2932             elif ext == 'mpd':
2933                 is_plain_url = False
2934                 formats = self._extract_mpd_formats(
2935                     full_url, video_id, mpd_id=mpd_id, fatal=False)
2936             else:
2937                 is_plain_url = True
2938                 formats = [{
2939                     'url': full_url,
2940                     'vcodec': 'none' if cur_media_type == 'audio' else None,
2941                 }]
2942             return is_plain_url, formats
2943
2944         entries = []
2945         # amp-video and amp-audio are very similar to their HTML5 counterparts
2946         # so we wll include them right here (see
2947         # https://www.ampproject.org/docs/reference/components/amp-video)
2948         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
2949         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
2950         media_tags = [(media_tag, media_tag_name, media_type, '')
2951                       for media_tag, media_tag_name, media_type
2952                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
2953         media_tags.extend(re.findall(
2954             # We only allow video|audio followed by a whitespace or '>'.
2955             # Allowing more characters may end up in significant slow down (see
2956             # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
2957             # http://www.porntrex.com/maps/videositemap.xml).
2958             r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
2959         for media_tag, _, media_type, media_content in media_tags:
2960             media_info = {
2961                 'formats': [],
2962                 'subtitles': {},
2963             }
2964             media_attributes = extract_attributes(media_tag)
2965             src = strip_or_none(media_attributes.get('src'))
2966             if src:
2967                 _, formats = _media_formats(src, media_type)
2968                 media_info['formats'].extend(formats)
2969             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
2970             if media_content:
2971                 for source_tag in re.findall(r'<source[^>]+>', media_content):
2972                     s_attr = extract_attributes(source_tag)
2973                     # data-video-src and data-src are non standard but seen
2974                     # several times in the wild
2975                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
2976                     if not src:
2977                         continue
2978                     f = parse_content_type(s_attr.get('type'))
2979                     is_plain_url, formats = _media_formats(src, media_type, f)
2980                     if is_plain_url:
2981                         # width, height, res, label and title attributes are
2982                         # all not standard but seen several times in the wild
2983                         labels = [
2984                             s_attr.get(lbl)
2985                             for lbl in ('label', 'title')
2986                             if str_or_none(s_attr.get(lbl))
2987                         ]
2988                         width = int_or_none(s_attr.get('width'))
2989                         height = (int_or_none(s_attr.get('height'))
2990                                   or int_or_none(s_attr.get('res')))
2991                         if not width or not height:
2992                             for lbl in labels:
2993                                 resolution = parse_resolution(lbl)
2994                                 if not resolution:
2995                                     continue
2996                                 width = width or resolution.get('width')
2997                                 height = height or resolution.get('height')
2998                         for lbl in labels:
2999                             tbr = parse_bitrate(lbl)
3000                             if tbr:
3001                                 break
3002                         else:
3003                             tbr = None
3004                         f.update({
3005                             'width': width,
3006                             'height': height,
3007                             'tbr': tbr,
3008                             'format_id': s_attr.get('label') or s_attr.get('title'),
3009                         })
3010                         f.update(formats[0])
3011                         media_info['formats'].append(f)
3012                     else:
3013                         media_info['formats'].extend(formats)
3014                 for track_tag in re.findall(r'<track[^>]+>', media_content):
3015                     track_attributes = extract_attributes(track_tag)
3016                     kind = track_attributes.get('kind')
3017                     if not kind or kind in ('subtitles', 'captions'):
3018                         src = strip_or_none(track_attributes.get('src'))
3019                         if not src:
3020                             continue
3021                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3022                         media_info['subtitles'].setdefault(lang, []).append({
3023                             'url': absolute_url(src),
3024                         })
3025             for f in media_info['formats']:
3026                 f.setdefault('http_headers', {})['Referer'] = base_url
3027             if media_info['formats'] or media_info['subtitles']:
3028                 entries.append(media_info)
3029         return entries
3030
3031     def _extract_akamai_formats(self, *args, **kwargs):
3032         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3033         if subs:
3034             self.report_warning(bug_reports_message(
3035                 "Ignoring subtitle tracks found in the manifests; "
3036                 "if any subtitle tracks are missing,"
3037             ))
3038         return fmts
3039
3040     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3041         signed = 'hdnea=' in manifest_url
3042         if not signed:
3043             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3044             manifest_url = re.sub(
3045                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3046                 '', manifest_url).strip('?')
3047
3048         formats = []
3049         subtitles = {}
3050
3051         hdcore_sign = 'hdcore=3.7.0'
3052         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3053         hds_host = hosts.get('hds')
3054         if hds_host:
3055             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3056         if 'hdcore=' not in f4m_url:
3057             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3058         f4m_formats = self._extract_f4m_formats(
3059             f4m_url, video_id, f4m_id='hds', fatal=False)
3060         for entry in f4m_formats:
3061             entry.update({'extra_param_to_segment_url': hdcore_sign})
3062         formats.extend(f4m_formats)
3063
3064         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3065         hls_host = hosts.get('hls')
3066         if hls_host:
3067             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3068         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3069             m3u8_url, video_id, 'mp4', 'm3u8_native',
3070             m3u8_id='hls', fatal=False)
3071         formats.extend(m3u8_formats)
3072         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3073
3074         http_host = hosts.get('http')
3075         if http_host and m3u8_formats and not signed:
3076             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3077             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3078             qualities_length = len(qualities)
3079             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3080                 i = 0
3081                 for f in m3u8_formats:
3082                     if f['vcodec'] != 'none':
3083                         for protocol in ('http', 'https'):
3084                             http_f = f.copy()
3085                             del http_f['manifest_url']
3086                             http_url = re.sub(
3087                                 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
3088                             http_f.update({
3089                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3090                                 'url': http_url,
3091                                 'protocol': protocol,
3092                             })
3093                             formats.append(http_f)
3094                         i += 1
3095
3096         return formats, subtitles
3097
3098     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3099         query = compat_urlparse.urlparse(url).query
3100         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3101         mobj = re.search(
3102             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3103         url_base = mobj.group('url')
3104         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3105         formats = []
3106
3107         def manifest_url(manifest):
3108             m_url = '%s/%s' % (http_base_url, manifest)
3109             if query:
3110                 m_url += '?%s' % query
3111             return m_url
3112
3113         if 'm3u8' not in skip_protocols:
3114             formats.extend(self._extract_m3u8_formats(
3115                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3116                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3117         if 'f4m' not in skip_protocols:
3118             formats.extend(self._extract_f4m_formats(
3119                 manifest_url('manifest.f4m'),
3120                 video_id, f4m_id='hds', fatal=False))
3121         if 'dash' not in skip_protocols:
3122             formats.extend(self._extract_mpd_formats(
3123                 manifest_url('manifest.mpd'),
3124                 video_id, mpd_id='dash', fatal=False))
3125         if re.search(r'(?:/smil:|\.smil)', url_base):
3126             if 'smil' not in skip_protocols:
3127                 rtmp_formats = self._extract_smil_formats(
3128                     manifest_url('jwplayer.smil'),
3129                     video_id, fatal=False)
3130                 for rtmp_format in rtmp_formats:
3131                     rtsp_format = rtmp_format.copy()
3132                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3133                     del rtsp_format['play_path']
3134                     del rtsp_format['ext']
3135                     rtsp_format.update({
3136                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3137                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3138                         'protocol': 'rtsp',
3139                     })
3140                     formats.extend([rtmp_format, rtsp_format])
3141         else:
3142             for protocol in ('rtmp', 'rtsp'):
3143                 if protocol not in skip_protocols:
3144                     formats.append({
3145                         'url': '%s:%s' % (protocol, url_base),
3146                         'format_id': protocol,
3147                         'protocol': protocol,
3148                     })
3149         return formats
3150
3151     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3152         mobj = re.search(
3153             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3154             webpage)
3155         if mobj:
3156             try:
3157                 jwplayer_data = self._parse_json(mobj.group('options'),
3158                                                  video_id=video_id,
3159                                                  transform_source=transform_source)
3160             except ExtractorError:
3161                 pass
3162             else:
3163                 if isinstance(jwplayer_data, dict):
3164                     return jwplayer_data
3165
3166     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3167         jwplayer_data = self._find_jwplayer_data(
3168             webpage, video_id, transform_source=js_to_json)
3169         return self._parse_jwplayer_data(
3170             jwplayer_data, video_id, *args, **kwargs)
3171
3172     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3173                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3174         # JWPlayer backward compatibility: flattened playlists
3175         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3176         if 'playlist' not in jwplayer_data:
3177             jwplayer_data = {'playlist': [jwplayer_data]}
3178
3179         entries = []
3180
3181         # JWPlayer backward compatibility: single playlist item
3182         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3183         if not isinstance(jwplayer_data['playlist'], list):
3184             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3185
3186         for video_data in jwplayer_data['playlist']:
3187             # JWPlayer backward compatibility: flattened sources
3188             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3189             if 'sources' not in video_data:
3190                 video_data['sources'] = [video_data]
3191
3192             this_video_id = video_id or video_data['mediaid']
3193
3194             formats = self._parse_jwplayer_formats(
3195                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3196                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3197
3198             subtitles = {}
3199             tracks = video_data.get('tracks')
3200             if tracks and isinstance(tracks, list):
3201                 for track in tracks:
3202                     if not isinstance(track, dict):
3203                         continue
3204                     track_kind = track.get('kind')
3205                     if not track_kind or not isinstance(track_kind, compat_str):
3206                         continue
3207                     if track_kind.lower() not in ('captions', 'subtitles'):
3208                         continue
3209                     track_url = urljoin(base_url, track.get('file'))
3210                     if not track_url:
3211                         continue
3212                     subtitles.setdefault(track.get('label') or 'en', []).append({
3213                         'url': self._proto_relative_url(track_url)
3214                     })
3215
3216             entry = {
3217                 'id': this_video_id,
3218                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3219                 'description': clean_html(video_data.get('description')),
3220                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3221                 'timestamp': int_or_none(video_data.get('pubdate')),
3222                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3223                 'subtitles': subtitles,
3224             }
3225             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3226             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3227                 entry.update({
3228                     '_type': 'url_transparent',
3229                     'url': formats[0]['url'],
3230                 })
3231             else:
3232                 self._sort_formats(formats)
3233                 entry['formats'] = formats
3234             entries.append(entry)
3235         if len(entries) == 1:
3236             return entries[0]
3237         else:
3238             return self.playlist_result(entries)
3239
3240     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3241                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3242         urls = []
3243         formats = []
3244         for source in jwplayer_sources_data:
3245             if not isinstance(source, dict):
3246                 continue
3247             source_url = urljoin(
3248                 base_url, self._proto_relative_url(source.get('file')))
3249             if not source_url or source_url in urls:
3250                 continue
3251             urls.append(source_url)
3252             source_type = source.get('type') or ''
3253             ext = mimetype2ext(source_type) or determine_ext(source_url)
3254             if source_type == 'hls' or ext == 'm3u8':
3255                 formats.extend(self._extract_m3u8_formats(
3256                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3257                     m3u8_id=m3u8_id, fatal=False))
3258             elif source_type == 'dash' or ext == 'mpd':
3259                 formats.extend(self._extract_mpd_formats(
3260                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3261             elif ext == 'smil':
3262                 formats.extend(self._extract_smil_formats(
3263                     source_url, video_id, fatal=False))
3264             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3265             elif source_type.startswith('audio') or ext in (
3266                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3267                 formats.append({
3268                     'url': source_url,
3269                     'vcodec': 'none',
3270                     'ext': ext,
3271                 })
3272             else:
3273                 height = int_or_none(source.get('height'))
3274                 if height is None:
3275                     # Often no height is provided but there is a label in
3276                     # format like "1080p", "720p SD", or 1080.
3277                     height = int_or_none(self._search_regex(
3278                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3279                         'height', default=None))
3280                 a_format = {
3281                     'url': source_url,
3282                     'width': int_or_none(source.get('width')),
3283                     'height': height,
3284                     'tbr': int_or_none(source.get('bitrate')),
3285                     'ext': ext,
3286                 }
3287                 if source_url.startswith('rtmp'):
3288                     a_format['ext'] = 'flv'
3289                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3290                     # of jwplayer.flash.swf
3291                     rtmp_url_parts = re.split(
3292                         r'((?:mp4|mp3|flv):)', source_url, 1)
3293                     if len(rtmp_url_parts) == 3:
3294                         rtmp_url, prefix, play_path = rtmp_url_parts
3295                         a_format.update({
3296                             'url': rtmp_url,
3297                             'play_path': prefix + play_path,
3298                         })
3299                     if rtmp_params:
3300                         a_format.update(rtmp_params)
3301                 formats.append(a_format)
3302         return formats
3303
3304     def _live_title(self, name):
3305         """ Generate the title for a live video """
3306         now = datetime.datetime.now()
3307         now_str = now.strftime('%Y-%m-%d %H:%M')
3308         return name + ' ' + now_str
3309
3310     def _int(self, v, name, fatal=False, **kwargs):
3311         res = int_or_none(v, **kwargs)
3312         if 'get_attr' in kwargs:
3313             print(getattr(v, kwargs['get_attr']))
3314         if res is None:
3315             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3316             if fatal:
3317                 raise ExtractorError(msg)
3318             else:
3319                 self.report_warning(msg)
3320         return res
3321
3322     def _float(self, v, name, fatal=False, **kwargs):
3323         res = float_or_none(v, **kwargs)
3324         if res is None:
3325             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3326             if fatal:
3327                 raise ExtractorError(msg)
3328             else:
3329                 self.report_warning(msg)
3330         return res
3331
3332     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3333                     path='/', secure=False, discard=False, rest={}, **kwargs):
3334         cookie = compat_cookiejar_Cookie(
3335             0, name, value, port, port is not None, domain, True,
3336             domain.startswith('.'), path, True, secure, expire_time,
3337             discard, None, None, rest)
3338         self._downloader.cookiejar.set_cookie(cookie)
3339
3340     def _get_cookies(self, url):
3341         """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3342         req = sanitized_Request(url)
3343         self._downloader.cookiejar.add_cookie_header(req)
3344         return compat_cookies_SimpleCookie(req.get_header('Cookie'))
3345
3346     def _apply_first_set_cookie_header(self, url_handle, cookie):
3347         """
3348         Apply first Set-Cookie header instead of the last. Experimental.
3349
3350         Some sites (e.g. [1-3]) may serve two cookies under the same name
3351         in Set-Cookie header and expect the first (old) one to be set rather
3352         than second (new). However, as of RFC6265 the newer one cookie
3353         should be set into cookie store what actually happens.
3354         We will workaround this issue by resetting the cookie to
3355         the first one manually.
3356         1. https://new.vk.com/
3357         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3358         3. https://learning.oreilly.com/
3359         """
3360         for header, cookies in url_handle.headers.items():
3361             if header.lower() != 'set-cookie':
3362                 continue
3363             if sys.version_info[0] >= 3:
3364                 cookies = cookies.encode('iso-8859-1')
3365             cookies = cookies.decode('utf-8')
3366             cookie_value = re.search(
3367                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3368             if cookie_value:
3369                 value, domain = cookie_value.groups()
3370                 self._set_cookie(domain, cookie, value)
3371                 break
3372
3373     def get_testcases(self, include_onlymatching=False):
3374         t = getattr(self, '_TEST', None)
3375         if t:
3376             assert not hasattr(self, '_TESTS'), \
3377                 '%s has _TEST and _TESTS' % type(self).__name__
3378             tests = [t]
3379         else:
3380             tests = getattr(self, '_TESTS', [])
3381         for t in tests:
3382             if not include_onlymatching and t.get('only_matching', False):
3383                 continue
3384             t['name'] = type(self).__name__[:-len('IE')]
3385             yield t
3386
3387     def is_suitable(self, age_limit):
3388         """ Test whether the extractor is generally suitable for the given
3389         age limit (i.e. pornographic sites are not, all others usually are) """
3390
3391         any_restricted = False
3392         for tc in self.get_testcases(include_onlymatching=False):
3393             if tc.get('playlist', []):
3394                 tc = tc['playlist'][0]
3395             is_restricted = age_restricted(
3396                 tc.get('info_dict', {}).get('age_limit'), age_limit)
3397             if not is_restricted:
3398                 return True
3399             any_restricted = any_restricted or is_restricted
3400         return not any_restricted
3401
3402     def extract_subtitles(self, *args, **kwargs):
3403         if (self._downloader.params.get('writesubtitles', False)
3404                 or self._downloader.params.get('listsubtitles')):
3405             return self._get_subtitles(*args, **kwargs)
3406         return {}
3407
3408     def _get_subtitles(self, *args, **kwargs):
3409         raise NotImplementedError('This method must be implemented by subclasses')
3410
3411     @staticmethod
3412     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3413         """ Merge subtitle items for one language. Items with duplicated URLs
3414         will be dropped. """
3415         list1_urls = set([item['url'] for item in subtitle_list1])
3416         ret = list(subtitle_list1)
3417         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
3418         return ret
3419
3420     @classmethod
3421     def _merge_subtitles(cls, *dicts, **kwargs):
3422         """ Merge subtitle dictionaries, language by language. """
3423
3424         target = (lambda target=None: target)(**kwargs)
3425         # The above lambda extracts the keyword argument 'target' from kwargs
3426         # while ensuring there are no stray ones. When Python 2 support
3427         # is dropped, remove it and change the function signature to:
3428         #
3429         #     def _merge_subtitles(cls, *dicts, target=None):
3430
3431         if target is None:
3432             target = {}
3433         for d in dicts:
3434             for lang, subs in d.items():
3435                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3436         return target
3437
3438     def extract_automatic_captions(self, *args, **kwargs):
3439         if (self._downloader.params.get('writeautomaticsub', False)
3440                 or self._downloader.params.get('listsubtitles')):
3441             return self._get_automatic_captions(*args, **kwargs)
3442         return {}
3443
3444     def _get_automatic_captions(self, *args, **kwargs):
3445         raise NotImplementedError('This method must be implemented by subclasses')
3446
3447     def mark_watched(self, *args, **kwargs):
3448         if (self._downloader.params.get('mark_watched', False)
3449                 and (self._get_login_info()[0] is not None
3450                      or self._downloader.params.get('cookiefile') is not None)):
3451             self._mark_watched(*args, **kwargs)
3452
3453     def _mark_watched(self, *args, **kwargs):
3454         raise NotImplementedError('This method must be implemented by subclasses')
3455
3456     def geo_verification_headers(self):
3457         headers = {}
3458         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
3459         if geo_verification_proxy:
3460             headers['Ytdl-request-proxy'] = geo_verification_proxy
3461         return headers
3462
3463     def _generic_id(self, url):
3464         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3465
3466     def _generic_title(self, url):
3467         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3468
3469     @staticmethod
3470     def _availability(is_private, needs_premium, needs_subscription, needs_auth, is_unlisted):
3471         all_known = all(map(
3472             lambda x: x is not None,
3473             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3474         return (
3475             'private' if is_private
3476             else 'premium_only' if needs_premium
3477             else 'subscriber_only' if needs_subscription
3478             else 'needs_auth' if needs_auth
3479             else 'unlisted' if is_unlisted
3480             else 'public' if all_known
3481             else None)
3482
3483
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
    where an empty prefix requests one result, a number that many results
    and 'all' up to _MAX_RESULTS results.
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex (as a string) that search URLs must match"""
        url_template = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return url_template % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search URL of this extractor"""
        match = re.match(cls._make_valid_url(), url)
        return match is not None

    def _real_extract(self, query):
        """Parse the search URL and return the requested number of results"""
        match = re.match(self._make_valid_url(), query)
        if match is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = match.group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            # Clamp to the maximum instead of failing
            self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        """The _SEARCH_KEY of this extractor"""
        return self._SEARCH_KEY