yt_dlp/extractor/common.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import base64
   5 import datetime
   6 import hashlib
   7 import json
   8 import netrc
   9 import os
  10 import random
  11 import re
  12 import sys
  13 import time
  14 import math
  15
  16 from ..compat import (
  17     compat_cookiejar_Cookie,
  18     compat_cookies_SimpleCookie,
  19     compat_etree_Element,
  20     compat_etree_fromstring,
  21     compat_getpass,
  22     compat_integer_types,
  23     compat_http_client,
  24     compat_os_name,
  25     compat_str,
  26     compat_urllib_error,
  27     compat_urllib_parse_unquote,
  28     compat_urllib_parse_urlencode,
  29     compat_urllib_request,
  30     compat_urlparse,
  31     compat_xml_parse_error,
  32 )
  33 from ..downloader import FileDownloader
  34 from ..downloader.f4m import (
  35     get_base_url,
  36     remove_encrypted_media,
  37 )
  38 from ..utils import (
  39     NO_DEFAULT,
  40     age_restricted,
  41     base_url,
  42     bug_reports_message,
  43     clean_html,
  44     compiled_regex_type,
  45     determine_ext,
  46     determine_protocol,
  47     dict_get,
  48     error_to_compat_str,
  49     ExtractorError,
  50     extract_attributes,
  51     fix_xml_ampersands,
  52     float_or_none,
  53     GeoRestrictedError,
  54     GeoUtils,
  55     int_or_none,
  56     js_to_json,
  57     JSON_LD_RE,
  58     mimetype2ext,
  59     network_exceptions,
  60     orderedSet,
  61     parse_bitrate,
  62     parse_codecs,
  63     parse_duration,
  64     parse_iso8601,
  65     parse_m3u8_attributes,
  66     parse_resolution,
  67     RegexNotFoundError,
  68     sanitized_Request,
  69     sanitize_filename,
  70     str_or_none,
  71     str_to_int,
  72     strip_or_none,
  73     unescapeHTML,
  74     unified_strdate,
  75     unified_timestamp,
  76     update_Request,
  77     update_url_query,
  78     urljoin,
  79     url_basename,
  80     url_or_none,
  81     xpath_element,
  82     xpath_text,
  83     xpath_with_ns,
  84 )
  85
  86
  87 class InfoExtractor(object):
  88     """Information Extractor class.
  89
  90     Information extractors are the classes that, given a URL, extract
  91     information about the video (or videos) the URL refers to. This
  92     information includes the real video URL, the video title, author and
  93     others. The information is stored in a dictionary which is then
  94     passed to the YoutubeDL. The YoutubeDL processes this
  95     information possibly downloading the video to the file system, among
  96     other possible outcomes.
  97
  98     The type field determines the type of the result.
  99     By far the most common value (and the default if _type is missing) is
 100     "video", which indicates a single video.
 101
 102     For a video, the dictionaries must include the following fields:
 103
 104     id:             Video identifier.
 105     title:          Video title, unescaped.
 106
 107     Additionally, it must contain either a formats entry or a url one:
 108
 109     formats:        A list of dictionaries for each format available, ordered
 110                     from worst to best quality.
 111
 112                     Potential fields:
 113                     * url        The mandatory URL representing the media:
 114                                    for plain file media - HTTP URL of this file,
 115                                    for RTMP - RTMP URL,
 116                                    for HLS - URL of the M3U8 media playlist,
 117                                    for HDS - URL of the F4M manifest,
 118                                    for DASH
 119                                      - HTTP URL to plain file media (in case of
 120                                        unfragmented media)
 121                                      - URL of the MPD manifest or base URL
 122                                        representing the media if MPD manifest
 123                                        is parsed from a string (in case of
 124                                        fragmented media)
 125                                    for MSS - URL of the ISM manifest.
 126                     * manifest_url
 127                                  The URL of the manifest file in case of
 128                                  fragmented media:
 129                                    for HLS - URL of the M3U8 master playlist,
 130                                    for HDS - URL of the F4M manifest,
 131                                    for DASH - URL of the MPD manifest,
 132                                    for MSS - URL of the ISM manifest.
 133                     * ext        Will be calculated from URL if missing
 134                     * format     A human-readable description of the format
 135                                  ("mp4 container with h264/opus").
 136                                  Calculated from the format_id, width, height,
 137                                  and format_note fields if missing.
 138                     * format_id  A short description of the format
 139                                  ("mp4_h264_opus" or "19").
 140                                 Technically optional, but strongly recommended.
 141                     * format_note Additional info about the format
 142                                  ("3D" or "DASH video")
 143                     * width      Width of the video, if known
 144                     * height     Height of the video, if known
 145                     * resolution Textual description of width and height
 146                     * tbr        Average bitrate of audio and video in KBit/s
 147                     * abr        Average audio bitrate in KBit/s
 148                     * acodec     Name of the audio codec in use
 149                     * asr        Audio sampling rate in Hertz
 150                     * vbr        Average video bitrate in KBit/s
 151                     * fps        Frame rate
 152                     * vcodec     Name of the video codec in use
 153                     * container  Name of the container format
 154                     * filesize   The number of bytes, if known in advance
 155                     * filesize_approx  An estimate for the number of bytes
 156                     * player_url SWF Player URL (used for rtmpdump).
 157                     * protocol   The protocol that will be used for the actual
 158                                  download, lower-case.
 159                                  "http", "https", "rtsp", "rtmp", "rtmp_ffmpeg", "rtmpe",
 160                                  "m3u8", "m3u8_native" or "http_dash_segments".
 161                     * fragment_base_url
 162                                  Base URL for fragments. Each fragment's path
 163                                  value (if present) will be relative to
 164                                  this URL.
 165                     * fragments  A list of fragments of a fragmented media.
 166                                  Each fragment entry must contain either an url
 167                                  or a path. If an url is present it should be
 168                                  considered by a client. Otherwise both path and
 169                                  fragment_base_url must be present. Here is
 170                                  the list of all potential fields:
 171                                  * "url" - fragment's URL
 172                                  * "path" - fragment's path relative to
 173                                             fragment_base_url
 174                                  * "duration" (optional, int or float)
 175                                  * "filesize" (optional, int)
 176                     * preference Order number of this format. If this field is
 177                                  present and not None, the formats get sorted
 178                                  by this field, regardless of all other values.
 179                                  -1 for default (order by other properties),
 180                                  -2 or smaller for less than default.
 181                                  < -1000 to hide the format (if there is
 182                                     another one which is strictly better)
 183                     * language   Language code, e.g. "de" or "en-US".
 184                     * language_preference  Is this in the language mentioned in
 185                                  the URL?
 186                                  10 if it's what the URL is about,
 187                                  -1 for default (don't know),
 188                                  -10 otherwise, other values reserved for now.
 189                     * quality    Order number of the video quality of this
 190                                  format, irrespective of the file format.
 191                                  -1 for default (order by other properties),
 192                                  -2 or smaller for less than default.
 193                     * source_preference  Order number for this video source
 194                                   (quality takes higher priority)
 195                                  -1 for default (order by other properties),
 196                                  -2 or smaller for less than default.
 197                     * http_headers  A dictionary of additional HTTP headers
 198                                  to add to the request.
 199                     * stretched_ratio  If given and not 1, indicates that the
 200                                  video's pixels are not square.
 201                                  width : height ratio as float.
 202                     * no_resume  The server does not support resuming the
 203                                  (HTTP or RTMP) download. Boolean.
 204                     * downloader_options  A dictionary of downloader options as
 205                                  described in FileDownloader
 206
 207     url:            Final video URL.
 208     ext:            Video filename extension.
 209     format:         The video format, defaults to ext (used for --get-format)
 210     player_url:     SWF Player URL (used for rtmpdump).
 211
 212     The following fields are optional:
 213
 214     alt_title:      A secondary title of the video.
 215     display_id      An alternative identifier for the video, not necessarily
 216                     unique, but available before title. Typically, id is
 217                     something like "4234987", title "Dancing naked mole rats",
 218                     and display_id "dancing-naked-mole-rats"
 219     thumbnails:     A list of dictionaries, with the following entries:
 220                         * "id" (optional, string) - Thumbnail format ID
 221                         * "url"
 222                         * "preference" (optional, int) - quality of the image
 223                         * "width" (optional, int)
 224                         * "height" (optional, int)
 225                         * "resolution" (optional, string "{width}x{height}",
 226                                         deprecated)
 227                         * "filesize" (optional, int)
 228     thumbnail:      Full URL to a video thumbnail image.
 229     description:    Full video description.
 230     uploader:       Full name of the video uploader.
 231     license:        License name the video is licensed under.
 232     creator:        The creator of the video.
 233     release_timestamp: UNIX timestamp of the moment the video was released.
 234     release_date:   The date (YYYYMMDD) when the video was released.
 235     timestamp:      UNIX timestamp of the moment the video was uploaded
 236     upload_date:    Video upload date (YYYYMMDD).
 237                     If not explicitly set, calculated from timestamp.
 238     uploader_id:    Nickname or id of the video uploader.
 239     uploader_url:   Full URL to a personal webpage of the video uploader.
 240     channel:        Full name of the channel the video is uploaded on.
 241                     Note that channel fields may or may not repeat uploader
 242                     fields. This depends on a particular extractor.
 243     channel_id:     Id of the channel.
 244     channel_url:    Full URL to a channel webpage.
 245     location:       Physical location where the video was filmed.
 246     subtitles:      The available subtitles as a dictionary in the format
 247                     {tag: subformats}. "tag" is usually a language code, and
 248                     "subformats" is a list sorted from lower to higher
 249                     preference, each element is a dictionary with the "ext"
 250                     entry and one of:
 251                         * "data": The subtitles file contents
 252                         * "url": A URL pointing to the subtitles file
 253                     "ext" will be calculated from URL if missing
 254     automatic_captions: Like 'subtitles'; contains automatically generated
 255                     captions instead of normal subtitles
 256     duration:       Length of the video in seconds, as an integer or float.
 257     view_count:     How many users have watched the video on the platform.
 258     like_count:     Number of positive ratings of the video
 259     dislike_count:  Number of negative ratings of the video
 260     repost_count:   Number of reposts of the video
 261     average_rating: Average rating given by users, the scale used depends on the webpage
 262     comment_count:  Number of comments on the video
 263     comments:       A list of comments, each with one or more of the following
 264                     properties (all but one of text or html optional):
 265                         * "author" - human-readable name of the comment author
 266                         * "author_id" - user ID of the comment author
 267                         * "author_thumbnail" - The thumbnail of the comment author
 268                         * "id" - Comment ID
 269                         * "html" - Comment as HTML
 270                         * "text" - Plain text of the comment
 271                         * "timestamp" - UNIX timestamp of comment
 272                         * "parent" - ID of the comment this one is replying to.
 273                                      Set to "root" to indicate that this is a
 274                                      comment to the original video.
 275                         * "like_count" - Number of positive ratings of the comment
 276                         * "dislike_count" - Number of negative ratings of the comment
 277                         * "is_favorited" - Whether the comment is marked as
 278                                            favorite by the video uploader
 279                         * "author_is_uploader" - Whether the comment is made by
 280                                                  the video uploader
 281     age_limit:      Age restriction for the video, as an integer (years)
 282     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 283                     should allow to get the same result again. (It will be set
 284                     by YoutubeDL if it's missing)
 285     categories:     A list of categories that the video falls in, for example
 286                     ["Sports", "Berlin"]
 287     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 288     is_live:        True, False, or None (=unknown). Whether this video is a
 289                     live stream that goes on instead of a fixed-length video.
 290     was_live:       True, False, or None (=unknown). Whether this video was
 291                     originally a live stream.
 292     start_time:     Time in seconds where the reproduction should start, as
 293                     specified in the URL.
 294     end_time:       Time in seconds where the reproduction should end, as
 295                     specified in the URL.
 296     chapters:       A list of dictionaries, with the following entries:
 297                         * "start_time" - The start time of the chapter in seconds
 298                         * "end_time" - The end time of the chapter in seconds
 299                         * "title" (optional, string)
 300     playable_in_embed: Whether this video is allowed to play in embedded
 301                     players on other sites. Can be True (=always allowed),
 302                     False (=never allowed), None (=unknown), or a string
 303                     specifying the criteria for embeddability (e.g. 'whitelist')
 304     availability:   Under what condition the video is available. One of
 305                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 306                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 307                     to set it
 308     __post_extractor: A function to be called just before the metadata is
 309                     written to either disk, logger or console. The function
 310                     must return a dict which will be added to the info_dict.
 311                     This is useful for additional information that is
 312                     time-consuming to extract. Note that the fields thus
 313                     extracted will not be available to output template and
 314                     match_filter. So, only "comments" and "comment_count" are
 315                     currently allowed to be extracted via this method.
 316
 317     The following fields should only be used when the video belongs to some logical
 318     chapter or section:
 319
 320     chapter:        Name or title of the chapter the video belongs to.
 321     chapter_number: Number of the chapter the video belongs to, as an integer.
 322     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 323
 324     The following fields should only be used when the video is an episode of some
 325     series, programme or podcast:
 326
 327     series:         Title of the series or programme the video episode belongs to.
 328     season:         Title of the season the video episode belongs to.
 329     season_number:  Number of the season the video episode belongs to, as an integer.
 330     season_id:      Id of the season the video episode belongs to, as a unicode string.
 331     episode:        Title of the video episode. Unlike mandatory video title field,
 332                     this field should denote the exact title of the video episode
 333                     without any kind of decoration.
 334     episode_number: Number of the video episode within a season, as an integer.
 335     episode_id:     Id of the video episode, as a unicode string.
 336
 337     The following fields should only be used when the media is a track or a part of
 338     a music album:
 339
 340     track:          Title of the track.
 341     track_number:   Number of the track within an album or a disc, as an integer.
 342     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 343                     as a unicode string.
 344     artist:         Artist(s) of the track.
 345     genre:          Genre(s) of the track.
 346     album:          Title of the album the track belongs to.
 347     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 348     album_artist:   List of all artists appeared on the album (e.g.
 349                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 350                     and compilations).
 351     disc_number:    Number of the disc or other physical medium the track belongs to,
 352                     as an integer.
 353     release_year:   Year (YYYY) when the album was released.
 354
 355     Unless mentioned otherwise, the fields should be Unicode strings.
 356
 357     Unless mentioned otherwise, None is equivalent to absence of information.
 358
 359
 360     _type "playlist" indicates multiple videos.
 361     There must be a key "entries", which is a list, an iterable, or a PagedList
 362     object, each element of which is a valid dictionary by this specification.
 363
 364     Additionally, playlists can have "id", "title", and any other relevant
 365     attributes with the same semantics as videos (see above).
 366
 367
 368     _type "multi_video" indicates that there are multiple videos that
 369     form a single show, for example multiple acts of an opera or TV episode.
 370     It must have an entries key like a playlist and contain all the keys
 371     required for a video at the same time.
 372
 373
 374     _type "url" indicates that the video must be extracted from another
 375     location, possibly by a different extractor. Its only required key is:
 376     "url" - the next URL to extract.
 377     The key "ie_key" can be set to the class name (minus the trailing "IE",
 378     e.g. "Youtube") if the extractor class is known in advance.
 379     Additionally, the dictionary may have any properties of the resolved entity
 380     known in advance, for example "title" if the title of the referred video is
 381     known ahead of time.
 382
 383
 384     _type "url_transparent" entities have the same specification as "url", but
 385     indicate that the given additional information is more precise than the one
 386     associated with the resolved URL.
 387     This is useful when a site employs a video service that hosts the video and
 388     its technical metadata, but that video service does not embed a useful
 389     title, description etc.
 390
 391
 392     Subclasses of this one should re-define the _real_initialize() and
 393     _real_extract() methods and define a _VALID_URL regexp.
 394     Probably, they should also be added to the list of extractors.
 395
 396     _GEO_BYPASS attribute may be set to False in order to disable
 397     geo restriction bypass mechanisms for a particular extractor.
 398     Though it won't disable explicit geo restriction bypass based on
 399     country code provided with geo_bypass_country.
 400
 401     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 402     countries for this extractor. One of these countries will be used by
 403     geo restriction bypass mechanism right away in order to bypass
 404     geo restriction, of course, if the mechanism is not disabled.
 405
 406     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 407     IP blocks in CIDR notation for this extractor. One of these IP blocks
 408     will be used by geo restriction bypass mechanism similarly
 409     to _GEO_COUNTRIES.
 410
 411     Finally, the _WORKING attribute should be set to False for broken IEs
 412     in order to warn the users and skip the tests.
 413     """
 414
    # Set to True by initialize() once _real_initialize() has been run.
    _ready = False
    # Downloader assigned through set_downloader(); its params are consulted
    # for the geo bypass options.
    _downloader = None
    # Fake IP chosen for the X-Forwarded-For based geo bypass; None while no
    # IP has been picked.
    _x_forwarded_for_ip = None
    # Geo bypass configuration, described in the class docstring.
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    # Set to False for broken IEs (see the class docstring); read by working().
    _WORKING = True
 422
 423     def __init__(self, downloader=None):
 424         """Constructor. Receives an optional downloader."""
 425         self._ready = False
 426         self._x_forwarded_for_ip = None
 427         self.set_downloader(downloader)
 428
 429     @classmethod
 430     def suitable(cls, url):
 431         """Receives a URL and returns True if suitable for this IE."""
 432
 433         # This does not use has/getattr intentionally - we want to know whether
 434         # we have cached the regexp for *this* class, whereas getattr would also
 435         # match the superclass
 436         if '_VALID_URL_RE' not in cls.__dict__:
 437             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 438         return cls._VALID_URL_RE.match(url) is not None
 439
 440     @classmethod
 441     def _match_id(cls, url):
 442         if '_VALID_URL_RE' not in cls.__dict__:
 443             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 444         m = cls._VALID_URL_RE.match(url)
 445         assert m
 446         return compat_str(m.group('id'))
 447
 448     @classmethod
 449     def working(cls):
 450         """Getter method for _WORKING."""
 451         return cls._WORKING
 452
 453     def initialize(self):
 454         """Initializes an instance (authentication, etc)."""
 455         self._initialize_geo_bypass({
 456             'countries': self._GEO_COUNTRIES,
 457             'ip_blocks': self._GEO_IP_BLOCKS,
 458         })
 459         if not self._ready:
 460             self._real_initialize()
 461             self._ready = True
 462
 463     def _initialize_geo_bypass(self, geo_bypass_context):
 464         """
 465         Initialize geo restriction bypass mechanism.
 466
 467         This method is used to initialize geo bypass mechanism based on faking
 468         X-Forwarded-For HTTP header. A random country from provided country list
 469         is selected and a random IP belonging to this country is generated. This
 470         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 471         HTTP requests.
 472
 473         This method will be used for initial geo bypass mechanism initialization
 474         during the instance initialization with _GEO_COUNTRIES and
 475         _GEO_IP_BLOCKS.
 476
 477         You may also manually call it from extractor's code if geo bypass
 478         information is not available beforehand (e.g. obtained during
 479         extraction) or due to some other reason. In this case you should pass
 480         this information in geo bypass context passed as first argument. It may
 481         contain following fields:
 482
 483         countries:  List of geo unrestricted countries (similar
 484                     to _GEO_COUNTRIES)
 485         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 486                     (similar to _GEO_IP_BLOCKS)
 487
 488         """
 489         if not self._x_forwarded_for_ip:
 490
 491             # Geo bypass mechanism is explicitly disabled by user
 492             if not self._downloader.params.get('geo_bypass', True):
 493                 return
 494
 495             if not geo_bypass_context:
 496                 geo_bypass_context = {}
 497
 498             # Backward compatibility: previously _initialize_geo_bypass
 499             # expected a list of countries, some 3rd party code may still use
 500             # it this way
 501             if isinstance(geo_bypass_context, (list, tuple)):
 502                 geo_bypass_context = {
 503                     'countries': geo_bypass_context,
 504                 }
 505
 506             # The whole point of geo bypass mechanism is to fake IP
 507             # as X-Forwarded-For HTTP header based on some IP block or
 508             # country code.
 509
 510             # Path 1: bypassing based on IP block in CIDR notation
 511
 512             # Explicit IP block specified by user, use it right away
 513             # regardless of whether extractor is geo bypassable or not
 514             ip_block = self._downloader.params.get('geo_bypass_ip_block', None)
 515
 516             # Otherwise use random IP block from geo bypass context but only
 517             # if extractor is known as geo bypassable
 518             if not ip_block:
 519                 ip_blocks = geo_bypass_context.get('ip_blocks')
 520                 if self._GEO_BYPASS and ip_blocks:
 521                     ip_block = random.choice(ip_blocks)
 522
 523             if ip_block:
 524                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 525                 if self._downloader.params.get('verbose', False):
 526                     self._downloader.to_screen(
 527                         '[debug] Using fake IP %s as X-Forwarded-For.'
 528                         % self._x_forwarded_for_ip)
 529                 return
 530
 531             # Path 2: bypassing based on country code
 532
 533             # Explicit country code specified by user, use it right away
 534             # regardless of whether extractor is geo bypassable or not
 535             country = self._downloader.params.get('geo_bypass_country', None)
 536
 537             # Otherwise use random country code from geo bypass context but
 538             # only if extractor is known as geo bypassable
 539             if not country:
 540                 countries = geo_bypass_context.get('countries')
 541                 if self._GEO_BYPASS and countries:
 542                     country = random.choice(countries)
 543
 544             if country:
 545                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 546                 if self._downloader.params.get('verbose', False):
 547                     self._downloader.to_screen(
 548                         '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
 549                         % (self._x_forwarded_for_ip, country.upper()))
 550
 551     def extract(self, url):
 552         """Extracts URL information and returns it in list of dicts."""
 553         try:
 554             for _ in range(2):
 555                 try:
 556                     self.initialize()
 557                     ie_result = self._real_extract(url)
 558                     if self._x_forwarded_for_ip:
 559                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 560                     subtitles = ie_result.get('subtitles')
 561                     if (subtitles and 'live_chat' in subtitles
 562                             and 'no-live-chat' in self._downloader.params.get('compat_opts')):
 563                         del subtitles['live_chat']
 564                     return ie_result
 565                 except GeoRestrictedError as e:
 566                     if self.__maybe_fake_ip_and_retry(e.countries):
 567                         continue
 568                     raise
 569         except ExtractorError:
 570             raise
 571         except compat_http_client.IncompleteRead as e:
 572             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 573         except (KeyError, StopIteration) as e:
 574             raise ExtractorError('An extractor error has occurred.', cause=e)
 575
 576     def __maybe_fake_ip_and_retry(self, countries):
 577         if (not self._downloader.params.get('geo_bypass_country', None)
 578                 and self._GEO_BYPASS
 579                 and self._downloader.params.get('geo_bypass', True)
 580                 and not self._x_forwarded_for_ip
 581                 and countries):
 582             country_code = random.choice(countries)
 583             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 584             if self._x_forwarded_for_ip:
 585                 self.report_warning(
 586                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 587                     % (self._x_forwarded_for_ip, country_code.upper()))
 588                 return True
 589         return False
 590
 591     def set_downloader(self, downloader):
 592         """Sets the downloader for this IE."""
 593         self._downloader = downloader
 594
 595     def _real_initialize(self):
 596         """Real initialization process. Redefine in subclasses."""
 597         pass
 598
 599     def _real_extract(self, url):
 600         """Real extraction process. Redefine in subclasses."""
 601         pass
 602
 603     @classmethod
 604     def ie_key(cls):
 605         """A string for getting the InfoExtractor with get_info_extractor"""
 606         return compat_str(cls.__name__[:-2])
 607
 608     @property
 609     def IE_NAME(self):
 610         return compat_str(type(self).__name__[:-2])
 611
 612     @staticmethod
 613     def __can_accept_status_code(err, expected_status):
 614         assert isinstance(err, compat_urllib_error.HTTPError)
 615         if expected_status is None:
 616             return False
 617         if isinstance(expected_status, compat_integer_types):
 618             return err.code == expected_status
 619         elif isinstance(expected_status, (list, tuple)):
 620             return err.code in expected_status
 621         elif callable(expected_status):
 622             return expected_status(err.code) is True
 623         else:
 624             assert False
 625
 626     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
 627         """
 628         Return the response handle.
 629
 630         See _download_webpage docstring for arguments specification.
 631         """
 632         if not self._downloader._first_webpage_request:
 633             sleep_interval = float_or_none(self._downloader.params.get('sleep_interval_requests')) or 0
 634             if sleep_interval > 0:
 635                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 636                 time.sleep(sleep_interval)
 637         else:
 638             self._downloader._first_webpage_request = False
 639
 640         if note is None:
 641             self.report_download_webpage(video_id)
 642         elif note is not False:
 643             if video_id is None:
 644                 self.to_screen('%s' % (note,))
 645             else:
 646                 self.to_screen('%s: %s' % (video_id, note))
 647
 648         # Some sites check X-Forwarded-For HTTP header in order to figure out
 649         # the origin of the client behind proxy. This allows bypassing geo
 650         # restriction by faking this header's value to IP that belongs to some
 651         # geo unrestricted country. We will do so once we encounter any
 652         # geo restriction error.
 653         if self._x_forwarded_for_ip:
 654             if 'X-Forwarded-For' not in headers:
 655                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 656
 657         if isinstance(url_or_request, compat_urllib_request.Request):
 658             url_or_request = update_Request(
 659                 url_or_request, data=data, headers=headers, query=query)
 660         else:
 661             if query:
 662                 url_or_request = update_url_query(url_or_request, query)
 663             if data is not None or headers:
 664                 url_or_request = sanitized_Request(url_or_request, data, headers)
 665         try:
 666             return self._downloader.urlopen(url_or_request)
 667         except network_exceptions as err:
 668             if isinstance(err, compat_urllib_error.HTTPError):
 669                 if self.__can_accept_status_code(err, expected_status):
 670                     # Retain reference to error to prevent file object from
 671                     # being closed before it can be read. Works around the
 672                     # effects of <https://bugs.python.org/issue15002>
 673                     # introduced in Python 3.4.1.
 674                     err.fp._error = err
 675                     return err.fp
 676
 677             if errnote is False:
 678                 return False
 679             if errnote is None:
 680                 errnote = 'Unable to download webpage'
 681
 682             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 683             if fatal:
 684                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 685             else:
 686                 self.report_warning(errmsg)
 687                 return False
 688
 689     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
 690         """
 691         Return a tuple (page content as string, URL handle).
 692
 693         See _download_webpage docstring for arguments specification.
 694         """
 695         # Strip hashes from the URL (#1038)
 696         if isinstance(url_or_request, (compat_str, str)):
 697             url_or_request = url_or_request.partition('#')[0]
 698
 699         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 700         if urlh is False:
 701             assert not fatal
 702             return False
 703         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 704         return (content, urlh)
 705
 706     @staticmethod
 707     def _guess_encoding_from_content(content_type, webpage_bytes):
 708         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 709         if m:
 710             encoding = m.group(1)
 711         else:
 712             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 713                           webpage_bytes[:1024])
 714             if m:
 715                 encoding = m.group(1).decode('ascii')
 716             elif webpage_bytes.startswith(b'\xff\xfe'):
 717                 encoding = 'utf-16'
 718             else:
 719                 encoding = 'utf-8'
 720
 721         return encoding
 722
 723     def __check_blocked(self, content):
 724         first_block = content[:512]
 725         if ('<title>Access to this site is blocked</title>' in content
 726                 and 'Websense' in first_block):
 727             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 728             blocked_iframe = self._html_search_regex(
 729                 r'<iframe src="([^"]+)"', content,
 730                 'Websense information URL', default=None)
 731             if blocked_iframe:
 732                 msg += ' Visit %s for more details' % blocked_iframe
 733             raise ExtractorError(msg, expected=True)
 734         if '<title>The URL you requested has been blocked</title>' in first_block:
 735             msg = (
 736                 'Access to this webpage has been blocked by Indian censorship. '
 737                 'Use a VPN or proxy server (with --proxy) to route around it.')
 738             block_msg = self._html_search_regex(
 739                 r'</h1><p>(.*?)</p>',
 740                 content, 'block message', default=None)
 741             if block_msg:
 742                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 743             raise ExtractorError(msg, expected=True)
 744         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 745                 and 'blocklist.rkn.gov.ru' in content):
 746             raise ExtractorError(
 747                 'Access to this webpage has been blocked by decision of the Russian government. '
 748                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 749                 expected=True)
 750
 751     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 752         content_type = urlh.headers.get('Content-Type', '')
 753         webpage_bytes = urlh.read()
 754         if prefix is not None:
 755             webpage_bytes = prefix + webpage_bytes
 756         if not encoding:
 757             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 758         if self._downloader.params.get('dump_intermediate_pages', False):
 759             self.to_screen('Dumping request to ' + urlh.geturl())
 760             dump = base64.b64encode(webpage_bytes).decode('ascii')
 761             self._downloader.to_screen(dump)
 762         if self._downloader.params.get('write_pages', False):
 763             basen = '%s_%s' % (video_id, urlh.geturl())
 764             if len(basen) > 240:
 765                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 766                 basen = basen[:240 - len(h)] + h
 767             raw_filename = basen + '.dump'
 768             filename = sanitize_filename(raw_filename, restricted=True)
 769             self.to_screen('Saving request to ' + filename)
 770             # Working around MAX_PATH limitation on Windows (see
 771             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 772             if compat_os_name == 'nt':
 773                 absfilepath = os.path.abspath(filename)
 774                 if len(absfilepath) > 259:
 775                     filename = '\\\\?\\' + absfilepath
 776             with open(filename, 'wb') as outf:
 777                 outf.write(webpage_bytes)
 778
 779         try:
 780             content = webpage_bytes.decode(encoding, 'replace')
 781         except LookupError:
 782             content = webpage_bytes.decode('utf-8', 'replace')
 783
 784         self.__check_blocked(content)
 785
 786         return content
 787
 788     def _download_webpage(
 789             self, url_or_request, video_id, note=None, errnote=None,
 790             fatal=True, tries=1, timeout=5, encoding=None, data=None,
 791             headers={}, query={}, expected_status=None):
 792         """
 793         Return the data of the page as a string.
 794
 795         Arguments:
 796         url_or_request -- plain text URL as a string or
 797             a compat_urllib_request.Requestobject
 798         video_id -- Video/playlist/item identifier (string)
 799
 800         Keyword arguments:
 801         note -- note printed before downloading (string)
 802         errnote -- note printed in case of an error (string)
 803         fatal -- flag denoting whether error should be considered fatal,
 804             i.e. whether it should cause ExtractionError to be raised,
 805             otherwise a warning will be reported and extraction continued
 806         tries -- number of tries
 807         timeout -- sleep interval between tries
 808         encoding -- encoding for a page content decoding, guessed automatically
 809             when not explicitly specified
 810         data -- POST data (bytes)
 811         headers -- HTTP headers (dict)
 812         query -- URL query (dict)
 813         expected_status -- allows to accept failed HTTP requests (non 2xx
 814             status code) by explicitly specifying a set of accepted status
 815             codes. Can be any of the following entities:
 816                 - an integer type specifying an exact failed status code to
 817                   accept
 818                 - a list or a tuple of integer types specifying a list of
 819                   failed status codes to accept
 820                 - a callable accepting an actual failed status code and
 821                   returning True if it should be accepted
 822             Note that this argument does not affect success status codes (2xx)
 823             which are always accepted.
 824         """
 825
 826         success = False
 827         try_count = 0
 828         while success is False:
 829             try:
 830                 res = self._download_webpage_handle(
 831                     url_or_request, video_id, note, errnote, fatal,
 832                     encoding=encoding, data=data, headers=headers, query=query,
 833                     expected_status=expected_status)
 834                 success = True
 835             except compat_http_client.IncompleteRead as e:
 836                 try_count += 1
 837                 if try_count >= tries:
 838                     raise e
 839                 self._sleep(timeout, video_id)
 840         if res is False:
 841             return res
 842         else:
 843             content, _ = res
 844             return content
 845
 846     def _download_xml_handle(
 847             self, url_or_request, video_id, note='Downloading XML',
 848             errnote='Unable to download XML', transform_source=None,
 849             fatal=True, encoding=None, data=None, headers={}, query={},
 850             expected_status=None):
 851         """
 852         Return a tuple (xml as an compat_etree_Element, URL handle).
 853
 854         See _download_webpage docstring for arguments specification.
 855         """
 856         res = self._download_webpage_handle(
 857             url_or_request, video_id, note, errnote, fatal=fatal,
 858             encoding=encoding, data=data, headers=headers, query=query,
 859             expected_status=expected_status)
 860         if res is False:
 861             return res
 862         xml_string, urlh = res
 863         return self._parse_xml(
 864             xml_string, video_id, transform_source=transform_source,
 865             fatal=fatal), urlh
 866
 867     def _download_xml(
 868             self, url_or_request, video_id,
 869             note='Downloading XML', errnote='Unable to download XML',
 870             transform_source=None, fatal=True, encoding=None,
 871             data=None, headers={}, query={}, expected_status=None):
 872         """
 873         Return the xml as an compat_etree_Element.
 874
 875         See _download_webpage docstring for arguments specification.
 876         """
 877         res = self._download_xml_handle(
 878             url_or_request, video_id, note=note, errnote=errnote,
 879             transform_source=transform_source, fatal=fatal, encoding=encoding,
 880             data=data, headers=headers, query=query,
 881             expected_status=expected_status)
 882         return res if res is False else res[0]
 883
 884     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 885         if transform_source:
 886             xml_string = transform_source(xml_string)
 887         try:
 888             return compat_etree_fromstring(xml_string.encode('utf-8'))
 889         except compat_xml_parse_error as ve:
 890             errmsg = '%s: Failed to parse XML ' % video_id
 891             if fatal:
 892                 raise ExtractorError(errmsg, cause=ve)
 893             else:
 894                 self.report_warning(errmsg + str(ve))
 895
 896     def _download_json_handle(
 897             self, url_or_request, video_id, note='Downloading JSON metadata',
 898             errnote='Unable to download JSON metadata', transform_source=None,
 899             fatal=True, encoding=None, data=None, headers={}, query={},
 900             expected_status=None):
 901         """
 902         Return a tuple (JSON object, URL handle).
 903
 904         See _download_webpage docstring for arguments specification.
 905         """
 906         res = self._download_webpage_handle(
 907             url_or_request, video_id, note, errnote, fatal=fatal,
 908             encoding=encoding, data=data, headers=headers, query=query,
 909             expected_status=expected_status)
 910         if res is False:
 911             return res
 912         json_string, urlh = res
 913         return self._parse_json(
 914             json_string, video_id, transform_source=transform_source,
 915             fatal=fatal), urlh
 916
 917     def _download_json(
 918             self, url_or_request, video_id, note='Downloading JSON metadata',
 919             errnote='Unable to download JSON metadata', transform_source=None,
 920             fatal=True, encoding=None, data=None, headers={}, query={},
 921             expected_status=None):
 922         """
 923         Return the JSON object as a dict.
 924
 925         See _download_webpage docstring for arguments specification.
 926         """
 927         res = self._download_json_handle(
 928             url_or_request, video_id, note=note, errnote=errnote,
 929             transform_source=transform_source, fatal=fatal, encoding=encoding,
 930             data=data, headers=headers, query=query,
 931             expected_status=expected_status)
 932         return res if res is False else res[0]
 933
 934     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 935         if transform_source:
 936             json_string = transform_source(json_string)
 937         try:
 938             return json.loads(json_string)
 939         except ValueError as ve:
 940             errmsg = '%s: Failed to parse JSON ' % video_id
 941             if fatal:
 942                 raise ExtractorError(errmsg, cause=ve)
 943             else:
 944                 self.report_warning(errmsg + str(ve))
 945
 946     def report_warning(self, msg, video_id=None):
 947         idstr = '' if video_id is None else '%s: ' % video_id
 948         self._downloader.report_warning(
 949             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 950
 951     def to_screen(self, msg):
 952         """Print msg to screen, prefixing it with '[ie_name]'"""
 953         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 954
 955     def report_extraction(self, id_or_name):
 956         """Report information extraction."""
 957         self.to_screen('%s: Extracting information' % id_or_name)
 958
 959     def report_download_webpage(self, video_id):
 960         """Report webpage download."""
 961         self.to_screen('%s: Downloading webpage' % video_id)
 962
 963     def report_age_confirmation(self):
 964         """Report attempt to confirm age."""
 965         self.to_screen('Confirming age')
 966
 967     def report_login(self):
 968         """Report attempt to log in."""
 969         self.to_screen('Logging in')
 970
 971     def raise_login_required(
 972             self, msg='This video is only available for registered users', metadata_available=False):
 973         if metadata_available and self._downloader.params.get('ignore_no_formats_error'):
 974             self.report_warning(msg)
 975         raise ExtractorError(
 976             '%s. Use --cookies, --username and --password or --netrc to provide account credentials' % msg,
 977             expected=True)
 978
 979     def raise_geo_restricted(
 980             self, msg='This video is not available from your location due to geo restriction',
 981             countries=None, metadata_available=False):
 982         if metadata_available and self._downloader.params.get('ignore_no_formats_error'):
 983             self.report_warning(msg)
 984         else:
 985             raise GeoRestrictedError(msg, countries=countries)
 986
 987     def raise_no_formats(self, msg, expected=False, video_id=None):
 988         if expected and self._downloader.params.get('ignore_no_formats_error'):
 989             self.report_warning(msg, video_id)
 990         else:
 991             raise ExtractorError(msg, expected=expected, video_id=video_id)
 992
 993     # Methods for following #608
 994     @staticmethod
 995     def url_result(url, ie=None, video_id=None, video_title=None):
 996         """Returns a URL that points to a page that should be processed"""
 997         # TODO: ie should be the class used for getting the info
 998         video_info = {'_type': 'url',
 999                       'url': url,
1000                       'ie_key': ie}
1001         if video_id is not None:
1002             video_info['id'] = video_id
1003         if video_title is not None:
1004             video_info['title'] = video_title
1005         return video_info
1006
1007     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
1008         urls = orderedSet(
1009             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
1010             for m in matches)
1011         return self.playlist_result(
1012             urls, playlist_id=playlist_id, playlist_title=playlist_title)
1013
1014     @staticmethod
1015     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
1016         """Returns a playlist"""
1017         video_info = {'_type': 'playlist',
1018                       'entries': entries}
1019         video_info.update(kwargs)
1020         if playlist_id:
1021             video_info['id'] = playlist_id
1022         if playlist_title:
1023             video_info['title'] = playlist_title
1024         if playlist_description is not None:
1025             video_info['description'] = playlist_description
1026         return video_info
1027
1028     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1029         """
1030         Perform a regex search on the given string, using a single or a list of
1031         patterns returning the first matching group.
1032         In case of failure return a default value or raise a WARNING or a
1033         RegexNotFoundError, depending on fatal, specifying the field name.
1034         """
1035         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
1036             mobj = re.search(pattern, string, flags)
1037         else:
1038             for p in pattern:
1039                 mobj = re.search(p, string, flags)
1040                 if mobj:
1041                     break
1042
1043         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
1044             _name = '\033[0;34m%s\033[0m' % name
1045         else:
1046             _name = name
1047
1048         if mobj:
1049             if group is None:
1050                 # return the first matching group
1051                 return next(g for g in mobj.groups() if g is not None)
1052             else:
1053                 return mobj.group(group)
1054         elif default is not NO_DEFAULT:
1055             return default
1056         elif fatal:
1057             raise RegexNotFoundError('Unable to extract %s' % _name)
1058         else:
1059             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1060             return None
1061
1062     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1063         """
1064         Like _search_regex, but strips HTML tags and unescapes entities.
1065         """
1066         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1067         if res:
1068             return clean_html(res).strip()
1069         else:
1070             return res
1071
1072     def _get_netrc_login_info(self, netrc_machine=None):
1073         username = None
1074         password = None
1075         netrc_machine = netrc_machine or self._NETRC_MACHINE
1076
1077         if self._downloader.params.get('usenetrc', False):
1078             try:
1079                 info = netrc.netrc().authenticators(netrc_machine)
1080                 if info is not None:
1081                     username = info[0]
1082                     password = info[2]
1083                 else:
1084                     raise netrc.NetrcParseError(
1085                         'No authenticators for %s' % netrc_machine)
1086             except (IOError, netrc.NetrcParseError) as err:
1087                 self.report_warning(
1088                     'parsing .netrc: %s' % error_to_compat_str(err))
1089
1090         return username, password
1091
1092     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1093         """
1094         Get the login info as (username, password)
1095         First look for the manually specified credentials using username_option
1096         and password_option as keys in params dictionary. If no such credentials
1097         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1098         value.
1099         If there's no info available, return (None, None)
1100         """
1101         if self._downloader is None:
1102             return (None, None)
1103
1104         downloader_params = self._downloader.params
1105
1106         # Attempt to use provided username and password or .netrc data
1107         if downloader_params.get(username_option) is not None:
1108             username = downloader_params[username_option]
1109             password = downloader_params[password_option]
1110         else:
1111             username, password = self._get_netrc_login_info(netrc_machine)
1112
1113         return username, password
1114
1115     def _get_tfa_info(self, note='two-factor verification code'):
1116         """
1117         Get the two-factor authentication info
1118         TODO - asking the user will be required for sms/phone verify
1119         currently just uses the command line option
1120         If there's no info available, return None
1121         """
1122         if self._downloader is None:
1123             return None
1124         downloader_params = self._downloader.params
1125
1126         if downloader_params.get('twofactor') is not None:
1127             return downloader_params['twofactor']
1128
1129         return compat_getpass('Type %s and press [Return]: ' % note)
1130
1131     # Helper functions for extracting OpenGraph info
1132     @staticmethod
1133     def _og_regexes(prop):
1134         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1135         property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
1136                        % {'prop': re.escape(prop)})
1137         template = r'<meta[^>]+?%s[^>]+?%s'
1138         return [
1139             template % (property_re, content_re),
1140             template % (content_re, property_re),
1141         ]
1142
1143     @staticmethod
1144     def _meta_regex(prop):
1145         return r'''(?isx)<meta
1146                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1147                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1148
1149     def _og_search_property(self, prop, html, name=None, **kargs):
1150         if not isinstance(prop, (list, tuple)):
1151             prop = [prop]
1152         if name is None:
1153             name = 'OpenGraph %s' % prop[0]
1154         og_regexes = []
1155         for p in prop:
1156             og_regexes.extend(self._og_regexes(p))
1157         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1158         if escaped is None:
1159             return None
1160         return unescapeHTML(escaped)
1161
1162     def _og_search_thumbnail(self, html, **kargs):
1163         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1164
1165     def _og_search_description(self, html, **kargs):
1166         return self._og_search_property('description', html, fatal=False, **kargs)
1167
1168     def _og_search_title(self, html, **kargs):
1169         return self._og_search_property('title', html, **kargs)
1170
1171     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1172         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1173         if secure:
1174             regexes = self._og_regexes('video:secure_url') + regexes
1175         return self._html_search_regex(regexes, html, name, **kargs)
1176
1177     def _og_search_url(self, html, **kargs):
1178         return self._og_search_property('url', html, **kargs)
1179
1180     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1181         if not isinstance(name, (list, tuple)):
1182             name = [name]
1183         if display_name is None:
1184             display_name = name[0]
1185         return self._html_search_regex(
1186             [self._meta_regex(n) for n in name],
1187             html, display_name, fatal=fatal, group='content', **kwargs)
1188
1189     def _dc_search_uploader(self, html):
1190         return self._html_search_meta('dc.creator', html, 'uploader')
1191
1192     def _rta_search(self, html):
1193         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1194         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1195                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1196                      html):
1197             return 18
1198         return 0
1199
1200     def _media_rating_search(self, html):
1201         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1202         rating = self._html_search_meta('rating', html)
1203
1204         if not rating:
1205             return None
1206
1207         RATING_TABLE = {
1208             'safe for kids': 0,
1209             'general': 8,
1210             '14 years': 14,
1211             'mature': 17,
1212             'restricted': 19,
1213         }
1214         return RATING_TABLE.get(rating.lower())
1215
1216     def _family_friendly_search(self, html):
1217         # See http://schema.org/VideoObject
1218         family_friendly = self._html_search_meta(
1219             'isFamilyFriendly', html, default=None)
1220
1221         if not family_friendly:
1222             return None
1223
1224         RATING_TABLE = {
1225             '1': 0,
1226             'true': 0,
1227             '0': 18,
1228             'false': 18,
1229         }
1230         return RATING_TABLE.get(family_friendly.lower())
1231
1232     def _twitter_search_player(self, html):
1233         return self._html_search_meta('twitter:player', html,
1234                                       'twitter card player')
1235
1236     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1237         json_ld_list = list(re.finditer(JSON_LD_RE, html))
1238         default = kwargs.get('default', NO_DEFAULT)
1239         # JSON-LD may be malformed and thus `fatal` should be respected.
1240         # At the same time `default` may be passed that assumes `fatal=False`
1241         # for _search_regex. Let's simulate the same behavior here as well.
1242         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
1243         json_ld = []
1244         for mobj in json_ld_list:
1245             json_ld_item = self._parse_json(
1246                 mobj.group('json_ld'), video_id, fatal=fatal)
1247             if not json_ld_item:
1248                 continue
1249             if isinstance(json_ld_item, dict):
1250                 json_ld.append(json_ld_item)
1251             elif isinstance(json_ld_item, (list, tuple)):
1252                 json_ld.extend(json_ld_item)
1253         if json_ld:
1254             json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1255         if json_ld:
1256             return json_ld
1257         if default is not NO_DEFAULT:
1258             return default
1259         elif fatal:
1260             raise RegexNotFoundError('Unable to extract JSON-LD')
1261         else:
1262             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1263             return {}
1264
1265     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1266         if isinstance(json_ld, compat_str):
1267             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1268         if not json_ld:
1269             return {}
1270         info = {}
1271         if not isinstance(json_ld, (list, tuple, dict)):
1272             return info
1273         if isinstance(json_ld, dict):
1274             json_ld = [json_ld]
1275
1276         INTERACTION_TYPE_MAP = {
1277             'CommentAction': 'comment',
1278             'AgreeAction': 'like',
1279             'DisagreeAction': 'dislike',
1280             'LikeAction': 'like',
1281             'DislikeAction': 'dislike',
1282             'ListenAction': 'view',
1283             'WatchAction': 'view',
1284             'ViewAction': 'view',
1285         }
1286
1287         def extract_interaction_type(e):
1288             interaction_type = e.get('interactionType')
1289             if isinstance(interaction_type, dict):
1290                 interaction_type = interaction_type.get('@type')
1291             return str_or_none(interaction_type)
1292
1293         def extract_interaction_statistic(e):
1294             interaction_statistic = e.get('interactionStatistic')
1295             if isinstance(interaction_statistic, dict):
1296                 interaction_statistic = [interaction_statistic]
1297             if not isinstance(interaction_statistic, list):
1298                 return
1299             for is_e in interaction_statistic:
1300                 if not isinstance(is_e, dict):
1301                     continue
1302                 if is_e.get('@type') != 'InteractionCounter':
1303                     continue
1304                 interaction_type = extract_interaction_type(is_e)
1305                 if not interaction_type:
1306                     continue
1307                 # For interaction count some sites provide string instead of
1308                 # an integer (as per spec) with non digit characters (e.g. ",")
1309                 # so extracting count with more relaxed str_to_int
1310                 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1311                 if interaction_count is None:
1312                     continue
1313                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1314                 if not count_kind:
1315                     continue
1316                 count_key = '%s_count' % count_kind
1317                 if info.get(count_key) is not None:
1318                     continue
1319                 info[count_key] = interaction_count
1320
1321         def extract_video_object(e):
1322             assert e['@type'] == 'VideoObject'
1323             author = e.get('author')
1324             info.update({
1325                 'url': url_or_none(e.get('contentUrl')),
1326                 'title': unescapeHTML(e.get('name')),
1327                 'description': unescapeHTML(e.get('description')),
1328                 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
1329                 'duration': parse_duration(e.get('duration')),
1330                 'timestamp': unified_timestamp(e.get('uploadDate')),
1331                 # author can be an instance of 'Organization' or 'Person' types.
1332                 # both types can have 'name' property(inherited from 'Thing' type). [1]
1333                 # however some websites are using 'Text' type instead.
1334                 # 1. https://schema.org/VideoObject
1335                 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
1336                 'filesize': float_or_none(e.get('contentSize')),
1337                 'tbr': int_or_none(e.get('bitrate')),
1338                 'width': int_or_none(e.get('width')),
1339                 'height': int_or_none(e.get('height')),
1340                 'view_count': int_or_none(e.get('interactionCount')),
1341             })
1342             extract_interaction_statistic(e)
1343
1344         for e in json_ld:
1345             if '@context' in e:
1346                 item_type = e.get('@type')
1347                 if expected_type is not None and expected_type != item_type:
1348                     continue
1349                 if item_type in ('TVEpisode', 'Episode'):
1350                     episode_name = unescapeHTML(e.get('name'))
1351                     info.update({
1352                         'episode': episode_name,
1353                         'episode_number': int_or_none(e.get('episodeNumber')),
1354                         'description': unescapeHTML(e.get('description')),
1355                     })
1356                     if not info.get('title') and episode_name:
1357                         info['title'] = episode_name
1358                     part_of_season = e.get('partOfSeason')
1359                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1360                         info.update({
1361                             'season': unescapeHTML(part_of_season.get('name')),
1362                             'season_number': int_or_none(part_of_season.get('seasonNumber')),
1363                         })
1364                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1365                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1366                         info['series'] = unescapeHTML(part_of_series.get('name'))
1367                 elif item_type == 'Movie':
1368                     info.update({
1369                         'title': unescapeHTML(e.get('name')),
1370                         'description': unescapeHTML(e.get('description')),
1371                         'duration': parse_duration(e.get('duration')),
1372                         'timestamp': unified_timestamp(e.get('dateCreated')),
1373                     })
1374                 elif item_type in ('Article', 'NewsArticle'):
1375                     info.update({
1376                         'timestamp': parse_iso8601(e.get('datePublished')),
1377                         'title': unescapeHTML(e.get('headline')),
1378                         'description': unescapeHTML(e.get('articleBody')),
1379                     })
1380                 elif item_type == 'VideoObject':
1381                     extract_video_object(e)
1382                     if expected_type is None:
1383                         continue
1384                     else:
1385                         break
1386                 video = e.get('video')
1387                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1388                     extract_video_object(video)
1389                 if expected_type is None:
1390                     continue
1391                 else:
1392                     break
1393         return dict((k, v) for k, v in info.items() if v is not None)
1394
1395     @staticmethod
1396     def _hidden_inputs(html):
1397         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1398         hidden_inputs = {}
1399         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1400             attrs = extract_attributes(input)
1401             if not input:
1402                 continue
1403             if attrs.get('type') not in ('hidden', 'submit'):
1404                 continue
1405             name = attrs.get('name') or attrs.get('id')
1406             value = attrs.get('value')
1407             if name and value is not None:
1408                 hidden_inputs[name] = value
1409         return hidden_inputs
1410
1411     def _form_hidden_inputs(self, form_id, html):
1412         form = self._search_regex(
1413             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1414             html, '%s form' % form_id, group='form')
1415         return self._hidden_inputs(form)
1416
1417     class FormatSort:
1418         regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
1419
1420         default = ('hidden', 'hasvid', 'ie_pref', 'lang', 'quality',
1421                    'res', 'fps', 'codec:vp9.2', 'size', 'br', 'asr',
1422                    'proto', 'ext', 'hasaud', 'source', 'format_id')  # These must not be aliases
1423         ytdl_default = ('hasaud', 'quality', 'tbr', 'filesize', 'vbr',
1424                         'height', 'width', 'proto', 'vext', 'abr', 'aext',
1425                         'fps', 'fs_approx', 'source', 'format_id')
1426
1427         settings = {
1428             'vcodec': {'type': 'ordered', 'regex': True,
1429                        'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1430             'acodec': {'type': 'ordered', 'regex': True,
1431                        'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']},
1432             'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1433                       'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', 'm3u8', '.*dash', '', 'mms|rtsp', 'none', 'f4']},
1434             'vext': {'type': 'ordered', 'field': 'video_ext',
1435                      'order': ('mp4', 'webm', 'flv', '', 'none'),
1436                      'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1437             'aext': {'type': 'ordered', 'field': 'audio_ext',
1438                      'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1439                      'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1440             'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1441             'ie_pref': {'priority': True, 'type': 'extractor'},
1442             'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1443             'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1444             'lang': {'priority': True, 'convert': 'ignore', 'field': 'language_preference'},
1445             'quality': {'convert': 'float_none', 'default': -1},
1446             'filesize': {'convert': 'bytes'},
1447             'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1448             'id': {'convert': 'string', 'field': 'format_id'},
1449             'height': {'convert': 'float_none'},
1450             'width': {'convert': 'float_none'},
1451             'fps': {'convert': 'float_none'},
1452             'tbr': {'convert': 'float_none'},
1453             'vbr': {'convert': 'float_none'},
1454             'abr': {'convert': 'float_none'},
1455             'asr': {'convert': 'float_none'},
1456             'source': {'convert': 'ignore', 'field': 'source_preference'},
1457
1458             'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1459             'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1460             'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1461             'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1462             'res': {'type': 'multiple', 'field': ('height', 'width'), 'function': min},
1463
1464             # Most of these exist only for compatibility reasons
1465             'dimension': {'type': 'alias', 'field': 'res'},
1466             'resolution': {'type': 'alias', 'field': 'res'},
1467             'extension': {'type': 'alias', 'field': 'ext'},
1468             'bitrate': {'type': 'alias', 'field': 'br'},
1469             'total_bitrate': {'type': 'alias', 'field': 'tbr'},
1470             'video_bitrate': {'type': 'alias', 'field': 'vbr'},
1471             'audio_bitrate': {'type': 'alias', 'field': 'abr'},
1472             'framerate': {'type': 'alias', 'field': 'fps'},
1473             'language_preference': {'type': 'alias', 'field': 'lang'},  # not named as 'language' because such a field exists
1474             'protocol': {'type': 'alias', 'field': 'proto'},
1475             'source_preference': {'type': 'alias', 'field': 'source'},
1476             'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1477             'filesize_estimate': {'type': 'alias', 'field': 'size'},
1478             'samplerate': {'type': 'alias', 'field': 'asr'},
1479             'video_ext': {'type': 'alias', 'field': 'vext'},
1480             'audio_ext': {'type': 'alias', 'field': 'aext'},
1481             'video_codec': {'type': 'alias', 'field': 'vcodec'},
1482             'audio_codec': {'type': 'alias', 'field': 'acodec'},
1483             'video': {'type': 'alias', 'field': 'hasvid'},
1484             'has_video': {'type': 'alias', 'field': 'hasvid'},
1485             'audio': {'type': 'alias', 'field': 'hasaud'},
1486             'has_audio': {'type': 'alias', 'field': 'hasaud'},
1487             'extractor': {'type': 'alias', 'field': 'ie_pref'},
1488             'preference': {'type': 'alias', 'field': 'ie_pref'},
1489             'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
1490             'format_id': {'type': 'alias', 'field': 'id'},
1491         }
1492
1493         _order = []
1494
1495         def _get_field_setting(self, field, key):
1496             if field not in self.settings:
1497                 self.settings[field] = {}
1498             propObj = self.settings[field]
1499             if key not in propObj:
1500                 type = propObj.get('type')
1501                 if key == 'field':
1502                     default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1503                 elif key == 'convert':
1504                     default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1505                 else:
1506                     default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,), 'function': max}.get(key, None)
1507                 propObj[key] = default
1508             return propObj[key]
1509
1510         def _resolve_field_value(self, field, value, convertNone=False):
1511             if value is None:
1512                 if not convertNone:
1513                     return None
1514             else:
1515                 value = value.lower()
1516             conversion = self._get_field_setting(field, 'convert')
1517             if conversion == 'ignore':
1518                 return None
1519             if conversion == 'string':
1520                 return value
1521             elif conversion == 'float_none':
1522                 return float_or_none(value)
1523             elif conversion == 'bytes':
1524                 return FileDownloader.parse_bytes(value)
1525             elif conversion == 'order':
1526                 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1527                 use_regex = self._get_field_setting(field, 'regex')
1528                 list_length = len(order_list)
1529                 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1530                 if use_regex and value is not None:
1531                     for i, regex in enumerate(order_list):
1532                         if regex and re.match(regex, value):
1533                             return list_length - i
1534                     return list_length - empty_pos  # not in list
1535                 else:  # not regex or  value = None
1536                     return list_length - (order_list.index(value) if value in order_list else empty_pos)
1537             else:
1538                 if value.isnumeric():
1539                     return float(value)
1540                 else:
1541                     self.settings[field]['convert'] = 'string'
1542                     return value
1543
1544         def evaluate_params(self, params, sort_extractor):
1545             self._use_free_order = params.get('prefer_free_formats', False)
1546             self._sort_user = params.get('format_sort', [])
1547             self._sort_extractor = sort_extractor
1548
1549             def add_item(field, reverse, closest, limit_text):
1550                 field = field.lower()
1551                 if field in self._order:
1552                     return
1553                 self._order.append(field)
1554                 limit = self._resolve_field_value(field, limit_text)
1555                 data = {
1556                     'reverse': reverse,
1557                     'closest': False if limit is None else closest,
1558                     'limit_text': limit_text,
1559                     'limit': limit}
1560                 if field in self.settings:
1561                     self.settings[field].update(data)
1562                 else:
1563                     self.settings[field] = data
1564
1565             sort_list = (
1566                 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1567                 + (tuple() if params.get('format_sort_force', False)
1568                    else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1569                 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1570
1571             for item in sort_list:
1572                 match = re.match(self.regex, item)
1573                 if match is None:
1574                     raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1575                 field = match.group('field')
1576                 if field is None:
1577                     continue
1578                 if self._get_field_setting(field, 'type') == 'alias':
1579                     field = self._get_field_setting(field, 'field')
1580                 reverse = match.group('reverse') is not None
1581                 closest = match.group('separator') == '~'
1582                 limit_text = match.group('limit')
1583
1584                 has_limit = limit_text is not None
1585                 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1586                 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1587
1588                 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1589                 limits = limit_text.split(":") if has_multiple_limits else (limit_text,) if has_limit else tuple()
1590                 limit_count = len(limits)
1591                 for (i, f) in enumerate(fields):
1592                     add_item(f, reverse, closest,
1593                              limits[i] if i < limit_count
1594                              else limits[0] if has_limit and not has_multiple_limits
1595                              else None)
1596
1597         def print_verbose_info(self, to_screen):
1598             if self._sort_user:
1599                 to_screen('[debug] Sort order given by user: %s' % ','.join(self._sort_user))
1600             if self._sort_extractor:
1601                 to_screen('[debug] Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1602             to_screen('[debug] Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1603                 '+' if self._get_field_setting(field, 'reverse') else '', field,
1604                 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1605                               self._get_field_setting(field, 'limit_text'),
1606                               self._get_field_setting(field, 'limit'))
1607                 if self._get_field_setting(field, 'limit_text') is not None else '')
1608                 for field in self._order if self._get_field_setting(field, 'visible')]))
1609
1610         def _calculate_field_preference_from_value(self, format, field, type, value):
1611             reverse = self._get_field_setting(field, 'reverse')
1612             closest = self._get_field_setting(field, 'closest')
1613             limit = self._get_field_setting(field, 'limit')
1614
1615             if type == 'extractor':
1616                 maximum = self._get_field_setting(field, 'max')
1617                 if value is None or (maximum is not None and value >= maximum):
1618                     value = -1
1619             elif type == 'boolean':
1620                 in_list = self._get_field_setting(field, 'in_list')
1621                 not_in_list = self._get_field_setting(field, 'not_in_list')
1622                 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1623             elif type == 'ordered':
1624                 value = self._resolve_field_value(field, value, True)
1625
1626             # try to convert to number
1627             val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1628             is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1629             if is_num:
1630                 value = val_num
1631
1632             return ((-10, 0) if value is None
1633                     else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
1634                     else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1635                     else (0, value, 0) if not reverse and (limit is None or value <= limit)
1636                     else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1637                     else (-1, value, 0))
1638
1639         def _calculate_field_preference(self, format, field):
1640             type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
1641             get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1642             if type == 'multiple':
1643                 type = 'field'  # Only 'field' is allowed in multiple for now
1644                 actual_fields = self._get_field_setting(field, 'field')
1645
1646                 def wrapped_function(values):
1647                     values = tuple(filter(lambda x: x is not None, values))
1648                     return (self._get_field_setting(field, 'function')(*values) if len(values) > 1
1649                             else values[0] if values
1650                             else None)
1651
1652                 value = wrapped_function((get_value(f) for f in actual_fields))
1653             else:
1654                 value = get_value(field)
1655             return self._calculate_field_preference_from_value(format, field, type, value)
1656
1657         def calculate_preference(self, format):
1658             # Determine missing protocol
1659             if not format.get('protocol'):
1660                 format['protocol'] = determine_protocol(format)
1661
1662             # Determine missing ext
1663             if not format.get('ext') and 'url' in format:
1664                 format['ext'] = determine_ext(format['url'])
1665             if format.get('vcodec') == 'none':
1666                 format['audio_ext'] = format['ext']
1667                 format['video_ext'] = 'none'
1668             else:
1669                 format['video_ext'] = format['ext']
1670                 format['audio_ext'] = 'none'
1671             # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
1672             #    format['preference'] = -1000
1673
1674             # Determine missing bitrates
1675             if format.get('tbr') is None:
1676                 if format.get('vbr') is not None and format.get('abr') is not None:
1677                     format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1678             else:
1679                 if format.get('vcodec') != "none" and format.get('vbr') is None:
1680                     format['vbr'] = format.get('tbr') - format.get('abr', 0)
1681                 if format.get('acodec') != "none" and format.get('abr') is None:
1682                     format['abr'] = format.get('tbr') - format.get('vbr', 0)
1683
1684             return tuple(self._calculate_field_preference(format, field) for field in self._order)
1685
1686     def _sort_formats(self, formats, field_preference=[]):
1687         if not formats:
1688             if self._downloader.params.get('ignore_no_formats_error'):
1689                 return
1690             raise ExtractorError('No video formats found')
1691         format_sort = self.FormatSort()  # params and to_screen are taken from the downloader
1692         format_sort.evaluate_params(self._downloader.params, field_preference)
1693         if self._downloader.params.get('verbose', False):
1694             format_sort.print_verbose_info(self._downloader.to_screen)
1695         formats.sort(key=lambda f: format_sort.calculate_preference(f))
1696
1697     def _check_formats(self, formats, video_id):
1698         if formats:
1699             formats[:] = filter(
1700                 lambda f: self._is_valid_url(
1701                     f['url'], video_id,
1702                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1703                 formats)
1704
1705     @staticmethod
1706     def _remove_duplicate_formats(formats):
1707         format_urls = set()
1708         unique_formats = []
1709         for f in formats:
1710             if f['url'] not in format_urls:
1711                 format_urls.add(f['url'])
1712                 unique_formats.append(f)
1713         formats[:] = unique_formats
1714
1715     def _is_valid_url(self, url, video_id, item='video', headers={}):
1716         url = self._proto_relative_url(url, scheme='http:')
1717         # For now assume non HTTP(S) URLs always valid
1718         if not (url.startswith('http://') or url.startswith('https://')):
1719             return True
1720         try:
1721             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1722             return True
1723         except ExtractorError as e:
1724             self.to_screen(
1725                 '%s: %s URL is invalid, skipping: %s'
1726                 % (video_id, item, error_to_compat_str(e.cause)))
1727             return False
1728
1729     def http_scheme(self):
1730         """ Either "http:" or "https:", depending on the user's preferences """
1731         return (
1732             'http:'
1733             if self._downloader.params.get('prefer_insecure', False)
1734             else 'https:')
1735
1736     def _proto_relative_url(self, url, scheme=None):
1737         if url is None:
1738             return url
1739         if url.startswith('//'):
1740             if scheme is None:
1741                 scheme = self.http_scheme()
1742             return scheme + url
1743         else:
1744             return url
1745
1746     def _sleep(self, timeout, video_id, msg_template=None):
1747         if msg_template is None:
1748             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1749         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1750         self.to_screen(msg)
1751         time.sleep(timeout)
1752
1753     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1754                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1755                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1756         manifest = self._download_xml(
1757             manifest_url, video_id, 'Downloading f4m manifest',
1758             'Unable to download f4m manifest',
1759             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1760             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1761             transform_source=transform_source,
1762             fatal=fatal, data=data, headers=headers, query=query)
1763
1764         if manifest is False:
1765             return []
1766
1767         return self._parse_f4m_formats(
1768             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1769             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1770
1771     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1772                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1773                            fatal=True, m3u8_id=None):
1774         if not isinstance(manifest, compat_etree_Element) and not fatal:
1775             return []
1776
1777         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1778         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1779         if akamai_pv is not None and ';' in akamai_pv.text:
1780             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1781             if playerVerificationChallenge.strip() != '':
1782                 return []
1783
1784         formats = []
1785         manifest_version = '1.0'
1786         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1787         if not media_nodes:
1788             manifest_version = '2.0'
1789             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1790         # Remove unsupported DRM protected media from final formats
1791         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1792         media_nodes = remove_encrypted_media(media_nodes)
1793         if not media_nodes:
1794             return formats
1795
1796         manifest_base_url = get_base_url(manifest)
1797
1798         bootstrap_info = xpath_element(
1799             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1800             'bootstrap info', default=None)
1801
1802         vcodec = None
1803         mime_type = xpath_text(
1804             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1805             'base URL', default=None)
1806         if mime_type and mime_type.startswith('audio/'):
1807             vcodec = 'none'
1808
1809         for i, media_el in enumerate(media_nodes):
1810             tbr = int_or_none(media_el.attrib.get('bitrate'))
1811             width = int_or_none(media_el.attrib.get('width'))
1812             height = int_or_none(media_el.attrib.get('height'))
1813             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1814             # If <bootstrapInfo> is present, the specified f4m is a
1815             # stream-level manifest, and only set-level manifests may refer to
1816             # external resources.  See section 11.4 and section 4 of F4M spec
1817             if bootstrap_info is None:
1818                 media_url = None
1819                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1820                 if manifest_version == '2.0':
1821                     media_url = media_el.attrib.get('href')
1822                 if media_url is None:
1823                     media_url = media_el.attrib.get('url')
1824                 if not media_url:
1825                     continue
1826                 manifest_url = (
1827                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1828                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1829                 # If media_url is itself a f4m manifest do the recursive extraction
1830                 # since bitrates in parent manifest (this one) and media_url manifest
1831                 # may differ leading to inability to resolve the format by requested
1832                 # bitrate in f4m downloader
1833                 ext = determine_ext(manifest_url)
1834                 if ext == 'f4m':
1835                     f4m_formats = self._extract_f4m_formats(
1836                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1837                         transform_source=transform_source, fatal=fatal)
1838                     # Sometimes stream-level manifest contains single media entry that
1839                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1840                     # At the same time parent's media entry in set-level manifest may
1841                     # contain it. We will copy it from parent in such cases.
1842                     if len(f4m_formats) == 1:
1843                         f = f4m_formats[0]
1844                         f.update({
1845                             'tbr': f.get('tbr') or tbr,
1846                             'width': f.get('width') or width,
1847                             'height': f.get('height') or height,
1848                             'format_id': f.get('format_id') if not tbr else format_id,
1849                             'vcodec': vcodec,
1850                         })
1851                     formats.extend(f4m_formats)
1852                     continue
1853                 elif ext == 'm3u8':
1854                     formats.extend(self._extract_m3u8_formats(
1855                         manifest_url, video_id, 'mp4', preference=preference,
1856                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1857                     continue
1858             formats.append({
1859                 'format_id': format_id,
1860                 'url': manifest_url,
1861                 'manifest_url': manifest_url,
1862                 'ext': 'flv' if bootstrap_info is not None else None,
1863                 'protocol': 'f4m',
1864                 'tbr': tbr,
1865                 'width': width,
1866                 'height': height,
1867                 'vcodec': vcodec,
1868                 'preference': preference,
1869                 'quality': quality,
1870             })
1871         return formats
1872
1873     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1874         return {
1875             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1876             'url': m3u8_url,
1877             'ext': ext,
1878             'protocol': 'm3u8',
1879             'preference': preference - 100 if preference else -100,
1880             'quality': quality,
1881             'resolution': 'multiple',
1882             'format_note': 'Quality selection URL',
1883         }
1884
1885     def _extract_m3u8_formats(self, *args, **kwargs):
1886         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
1887         if subs:
1888             self.report_warning(bug_reports_message(
1889                 "Ignoring subtitle tracks found in the HLS manifest; "
1890                 "if any subtitle tracks are missing,"
1891             ))
1892         return fmts
1893
1894     def _extract_m3u8_formats_and_subtitles(
1895             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8',
1896             preference=None, quality=None, m3u8_id=None, note=None,
1897             errnote=None, fatal=True, live=False, data=None, headers={},
1898             query={}):
1899
1900         res = self._download_webpage_handle(
1901             m3u8_url, video_id,
1902             note=note or 'Downloading m3u8 information',
1903             errnote=errnote or 'Failed to download m3u8 information',
1904             fatal=fatal, data=data, headers=headers, query=query)
1905
1906         if res is False:
1907             return [], {}
1908
1909         m3u8_doc, urlh = res
1910         m3u8_url = urlh.geturl()
1911
1912         return self._parse_m3u8_formats_and_subtitles(
1913             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1914             preference=preference, quality=quality, m3u8_id=m3u8_id,
1915             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
1916             headers=headers, query=query, video_id=video_id)
1917
1918     def _parse_m3u8_formats_and_subtitles(
1919             self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8',
1920             preference=None, quality=None, m3u8_id=None, live=False, note=None,
1921             errnote=None, fatal=True, data=None, headers={}, query={},
1922             video_id=None):
1923
1924         if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
1925             return [], {}
1926
1927         if (not self._downloader.params.get('allow_unplayable_formats')
1928                 and re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc)):  # Apple FairPlay
1929             return [], {}
1930
1931         formats = []
1932
1933         subtitles = {}
1934
1935         format_url = lambda u: (
1936             u
1937             if re.match(r'^https?://', u)
1938             else compat_urlparse.urljoin(m3u8_url, u))
1939
1940         split_discontinuity = self._downloader.params.get('hls_split_discontinuity', False)
1941
1942         # References:
1943         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
1944         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
1945         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
1946
1947         # We should try extracting formats only from master playlists [1, 4.3.4],
1948         # i.e. playlists that describe available qualities. On the other hand
1949         # media playlists [1, 4.3.3] should be returned as is since they contain
1950         # just the media without qualities renditions.
1951         # Fortunately, master playlist can be easily distinguished from media
1952         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
1953         # master playlist tags MUST NOT appear in a media playlist and vice versa.
1954         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
1955         # media playlist and MUST NOT appear in master playlist thus we can
1956         # clearly detect media playlist with this criterion.
1957
1958         def _extract_m3u8_playlist_formats(format_url=None, m3u8_doc=None, video_id=None,
1959                                            fatal=True, data=None, headers={}):
1960             if not m3u8_doc:
1961                 if not format_url:
1962                     return []
1963                 res = self._download_webpage_handle(
1964                     format_url, video_id,
1965                     note=False,
1966                     errnote='Failed to download m3u8 playlist information',
1967                     fatal=fatal, data=data, headers=headers)
1968
1969                 if res is False:
1970                     return []
1971
1972                 m3u8_doc, urlh = res
1973                 format_url = urlh.geturl()
1974
1975             playlist_formats = []
1976             i = (
1977                 0
1978                 if split_discontinuity
1979                 else None)
1980             format_info = {
1981                 'index': i,
1982                 'key_data': None,
1983                 'files': [],
1984             }
1985             for line in m3u8_doc.splitlines():
1986                 if not line.startswith('#'):
1987                     format_info['files'].append(line)
1988                 elif split_discontinuity and line.startswith('#EXT-X-DISCONTINUITY'):
1989                     i += 1
1990                     playlist_formats.append(format_info)
1991                     format_info = {
1992                         'index': i,
1993                         'url': format_url,
1994                         'files': [],
1995                     }
1996             playlist_formats.append(format_info)
1997             return playlist_formats
1998
1999         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
2000
2001             playlist_formats = _extract_m3u8_playlist_formats(m3u8_doc=m3u8_doc)
2002
2003             for format in playlist_formats:
2004                 format_id = []
2005                 if m3u8_id:
2006                     format_id.append(m3u8_id)
2007                 format_index = format.get('index')
2008                 if format_index:
2009                     format_id.append(str(format_index))
2010                 f = {
2011                     'format_id': '-'.join(format_id),
2012                     'format_index': format_index,
2013                     'url': m3u8_url,
2014                     'ext': ext,
2015                     'protocol': entry_protocol,
2016                     'preference': preference,
2017                     'quality': quality,
2018                 }
2019                 formats.append(f)
2020
2021             return formats, subtitles
2022
2023         groups = {}
2024         last_stream_inf = {}
2025
2026         def extract_media(x_media_line):
2027             media = parse_m3u8_attributes(x_media_line)
2028             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2029             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2030             if not (media_type and group_id and name):
2031                 return
2032             groups.setdefault(group_id, []).append(media)
2033             # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2034             if media_type == 'SUBTITLES':
2035                 lang = media['LANGUAGE']  # XXX: normalise?
2036                 url = format_url(media['URI'])
2037                 sub_info = {
2038                     'url': url,
2039                     'ext': determine_ext(url),
2040                 }
2041                 if sub_info['ext'] == 'm3u8':
2042                     # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2043                     # files may contain is WebVTT:
2044                     # <https://tools.ietf.org/html/rfc8216#section-3.1>
2045                     sub_info['ext'] = 'vtt'
2046                     sub_info['protocol'] = 'm3u8_native'
2047                 subtitles.setdefault(lang, []).append(sub_info)
2048             if media_type not in ('VIDEO', 'AUDIO'):
2049                 return
2050             media_url = media.get('URI')
2051             if media_url:
2052                 manifest_url = format_url(media_url)
2053                 format_id = []
2054                 playlist_formats = _extract_m3u8_playlist_formats(manifest_url, video_id=video_id,
2055                                                                   fatal=fatal, data=data, headers=headers)
2056
2057                 for format in playlist_formats:
2058                     format_index = format.get('index')
2059                     for v in (m3u8_id, group_id, name):
2060                         if v:
2061                             format_id.append(v)
2062                     if format_index:
2063                         format_id.append(str(format_index))
2064                     f = {
2065                         'format_id': '-'.join(format_id),
2066                         'format_index': format_index,
2067                         'url': manifest_url,
2068                         'manifest_url': m3u8_url,
2069                         'language': media.get('LANGUAGE'),
2070                         'ext': ext,
2071                         'protocol': entry_protocol,
2072                         'preference': preference,
2073                         'quality': quality,
2074                     }
2075                     if media_type == 'AUDIO':
2076                         f['vcodec'] = 'none'
2077                     formats.append(f)
2078
2079         def build_stream_name():
2080             # Despite specification does not mention NAME attribute for
2081             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2082             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2083             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2084             stream_name = last_stream_inf.get('NAME')
2085             if stream_name:
2086                 return stream_name
2087             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2088             # from corresponding rendition group
2089             stream_group_id = last_stream_inf.get('VIDEO')
2090             if not stream_group_id:
2091                 return
2092             stream_group = groups.get(stream_group_id)
2093             if not stream_group:
2094                 return stream_group_id
2095             rendition = stream_group[0]
2096             return rendition.get('NAME') or stream_group_id
2097
2098         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2099         # chance to detect video only formats when EXT-X-STREAM-INF tags
2100         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2101         for line in m3u8_doc.splitlines():
2102             if line.startswith('#EXT-X-MEDIA:'):
2103                 extract_media(line)
2104
2105         for line in m3u8_doc.splitlines():
2106             if line.startswith('#EXT-X-STREAM-INF:'):
2107                 last_stream_inf = parse_m3u8_attributes(line)
2108             elif line.startswith('#') or not line.strip():
2109                 continue
2110             else:
2111                 tbr = float_or_none(
2112                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2113                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2114                 manifest_url = format_url(line.strip())
2115
2116                 playlist_formats = _extract_m3u8_playlist_formats(manifest_url, video_id=video_id,
2117                                                                   fatal=fatal, data=data, headers=headers)
2118
2119                 for frmt in playlist_formats:
2120                     format_id = []
2121                     if m3u8_id:
2122                         format_id.append(m3u8_id)
2123                     format_index = frmt.get('index')
2124                     stream_name = build_stream_name()
2125                     # Bandwidth of live streams may differ over time thus making
2126                     # format_id unpredictable. So it's better to keep provided
2127                     # format_id intact.
2128                     if not live:
2129                         format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
2130                     if format_index:
2131                         format_id.append(str(format_index))
2132                     f = {
2133                         'format_id': '-'.join(format_id),
2134                         'format_index': format_index,
2135                         'url': manifest_url,
2136                         'manifest_url': m3u8_url,
2137                         'tbr': tbr,
2138                         'ext': ext,
2139                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2140                         'protocol': entry_protocol,
2141                         'preference': preference,
2142                         'quality': quality,
2143                     }
2144                     resolution = last_stream_inf.get('RESOLUTION')
2145                     if resolution:
2146                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2147                         if mobj:
2148                             f['width'] = int(mobj.group('width'))
2149                             f['height'] = int(mobj.group('height'))
2150                     # Unified Streaming Platform
2151                     mobj = re.search(
2152                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2153                     if mobj:
2154                         abr, vbr = mobj.groups()
2155                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2156                         f.update({
2157                             'vbr': vbr,
2158                             'abr': abr,
2159                         })
2160                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2161                     f.update(codecs)
2162                     audio_group_id = last_stream_inf.get('AUDIO')
2163                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2164                     # references a rendition group MUST have a CODECS attribute.
2165                     # However, this is not always respected, for example, [2]
2166                     # contains EXT-X-STREAM-INF tag which references AUDIO
2167                     # rendition group but does not have CODECS and despite
2168                     # referencing an audio group it represents a complete
2169                     # (with audio and video) format. So, for such cases we will
2170                     # ignore references to rendition groups and treat them
2171                     # as complete formats.
2172                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2173                         audio_group = groups.get(audio_group_id)
2174                         if audio_group and audio_group[0].get('URI'):
2175                             # TODO: update acodec for audio only formats with
2176                             # the same GROUP-ID
2177                             f['acodec'] = 'none'
2178                     if not f.get('ext'):
2179                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2180                     formats.append(f)
2181
2182                     # for DailyMotion
2183                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2184                     if progressive_uri:
2185                         http_f = f.copy()
2186                         del http_f['manifest_url']
2187                         http_f.update({
2188                             'format_id': f['format_id'].replace('hls-', 'http-'),
2189                             'protocol': 'http',
2190                             'url': progressive_uri,
2191                         })
2192                         formats.append(http_f)
2193
2194                 last_stream_inf = {}
2195         return formats, subtitles
2196
2197     @staticmethod
2198     def _xpath_ns(path, namespace=None):
2199         if not namespace:
2200             return path
2201         out = []
2202         for c in path.split('/'):
2203             if not c or c == '.':
2204                 out.append(c)
2205             else:
2206                 out.append('{%s}%s' % (namespace, c))
2207         return '/'.join(out)
2208
2209     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2210         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2211
2212         if smil is False:
2213             assert not fatal
2214             return []
2215
2216         namespace = self._parse_smil_namespace(smil)
2217
2218         return self._parse_smil_formats(
2219             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2220
2221     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2222         smil = self._download_smil(smil_url, video_id, fatal=fatal)
2223         if smil is False:
2224             return {}
2225         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2226
2227     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2228         return self._download_xml(
2229             smil_url, video_id, 'Downloading SMIL file',
2230             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2231
2232     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2233         namespace = self._parse_smil_namespace(smil)
2234
2235         formats = self._parse_smil_formats(
2236             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2237         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2238
2239         video_id = os.path.splitext(url_basename(smil_url))[0]
2240         title = None
2241         description = None
2242         upload_date = None
2243         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2244             name = meta.attrib.get('name')
2245             content = meta.attrib.get('content')
2246             if not name or not content:
2247                 continue
2248             if not title and name == 'title':
2249                 title = content
2250             elif not description and name in ('description', 'abstract'):
2251                 description = content
2252             elif not upload_date and name == 'date':
2253                 upload_date = unified_strdate(content)
2254
2255         thumbnails = [{
2256             'id': image.get('type'),
2257             'url': image.get('src'),
2258             'width': int_or_none(image.get('width')),
2259             'height': int_or_none(image.get('height')),
2260         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2261
2262         return {
2263             'id': video_id,
2264             'title': title or video_id,
2265             'description': description,
2266             'upload_date': upload_date,
2267             'thumbnails': thumbnails,
2268             'formats': formats,
2269             'subtitles': subtitles,
2270         }
2271
2272     def _parse_smil_namespace(self, smil):
2273         return self._search_regex(
2274             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2275
2276     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2277         base = smil_url
2278         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2279             b = meta.get('base') or meta.get('httpBase')
2280             if b:
2281                 base = b
2282                 break
2283
2284         formats = []
2285         rtmp_count = 0
2286         http_count = 0
2287         m3u8_count = 0
2288
2289         srcs = []
2290         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2291         for medium in media:
2292             src = medium.get('src')
2293             if not src or src in srcs:
2294                 continue
2295             srcs.append(src)
2296
2297             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2298             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2299             width = int_or_none(medium.get('width'))
2300             height = int_or_none(medium.get('height'))
2301             proto = medium.get('proto')
2302             ext = medium.get('ext')
2303             src_ext = determine_ext(src)
2304             streamer = medium.get('streamer') or base
2305
2306             if proto == 'rtmp' or streamer.startswith('rtmp'):
2307                 rtmp_count += 1
2308                 formats.append({
2309                     'url': streamer,
2310                     'play_path': src,
2311                     'ext': 'flv',
2312                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2313                     'tbr': bitrate,
2314                     'filesize': filesize,
2315                     'width': width,
2316                     'height': height,
2317                 })
2318                 if transform_rtmp_url:
2319                     streamer, src = transform_rtmp_url(streamer, src)
2320                     formats[-1].update({
2321                         'url': streamer,
2322                         'play_path': src,
2323                     })
2324                 continue
2325
2326             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2327             src_url = src_url.strip()
2328
2329             if proto == 'm3u8' or src_ext == 'm3u8':
2330                 m3u8_formats = self._extract_m3u8_formats(
2331                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2332                 if len(m3u8_formats) == 1:
2333                     m3u8_count += 1
2334                     m3u8_formats[0].update({
2335                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2336                         'tbr': bitrate,
2337                         'width': width,
2338                         'height': height,
2339                     })
2340                 formats.extend(m3u8_formats)
2341             elif src_ext == 'f4m':
2342                 f4m_url = src_url
2343                 if not f4m_params:
2344                     f4m_params = {
2345                         'hdcore': '3.2.0',
2346                         'plugin': 'flowplayer-3.2.0.1',
2347                     }
2348                 f4m_url += '&' if '?' in f4m_url else '?'
2349                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2350                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2351             elif src_ext == 'mpd':
2352                 formats.extend(self._extract_mpd_formats(
2353                     src_url, video_id, mpd_id='dash', fatal=False))
2354             elif re.search(r'\.ism/[Mm]anifest', src_url):
2355                 formats.extend(self._extract_ism_formats(
2356                     src_url, video_id, ism_id='mss', fatal=False))
2357             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2358                 http_count += 1
2359                 formats.append({
2360                     'url': src_url,
2361                     'ext': ext or src_ext or 'flv',
2362                     'format_id': 'http-%d' % (bitrate or http_count),
2363                     'tbr': bitrate,
2364                     'filesize': filesize,
2365                     'width': width,
2366                     'height': height,
2367                 })
2368
2369         return formats
2370
2371     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2372         urls = []
2373         subtitles = {}
2374         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2375             src = textstream.get('src')
2376             if not src or src in urls:
2377                 continue
2378             urls.append(src)
2379             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2380             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2381             subtitles.setdefault(lang, []).append({
2382                 'url': src,
2383                 'ext': ext,
2384             })
2385         return subtitles
2386
2387     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2388         xspf = self._download_xml(
2389             xspf_url, playlist_id, 'Downloading xpsf playlist',
2390             'Unable to download xspf manifest', fatal=fatal)
2391         if xspf is False:
2392             return []
2393         return self._parse_xspf(
2394             xspf, playlist_id, xspf_url=xspf_url,
2395             xspf_base_url=base_url(xspf_url))
2396
2397     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2398         NS_MAP = {
2399             'xspf': 'http://xspf.org/ns/0/',
2400             's1': 'http://static.streamone.nl/player/ns/0',
2401         }
2402
2403         entries = []
2404         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2405             title = xpath_text(
2406                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2407             description = xpath_text(
2408                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2409             thumbnail = xpath_text(
2410                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2411             duration = float_or_none(
2412                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2413
2414             formats = []
2415             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2416                 format_url = urljoin(xspf_base_url, location.text)
2417                 if not format_url:
2418                     continue
2419                 formats.append({
2420                     'url': format_url,
2421                     'manifest_url': xspf_url,
2422                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2423                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2424                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2425                 })
2426             self._sort_formats(formats)
2427
2428             entries.append({
2429                 'id': playlist_id,
2430                 'title': title,
2431                 'description': description,
2432                 'thumbnail': thumbnail,
2433                 'duration': duration,
2434                 'formats': formats,
2435             })
2436         return entries
2437
2438     def _extract_mpd_formats(self, *args, **kwargs):
2439         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2440         if subs:
2441             self.report_warning(bug_reports_message(
2442                 "Ignoring subtitle tracks found in the DASH manifest; "
2443                 "if any subtitle tracks are missing,"
2444             ))
2445         return fmts
2446
2447     def _extract_mpd_formats_and_subtitles(
2448             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2449             fatal=True, data=None, headers={}, query={}):
2450         res = self._download_xml_handle(
2451             mpd_url, video_id,
2452             note=note or 'Downloading MPD manifest',
2453             errnote=errnote or 'Failed to download MPD manifest',
2454             fatal=fatal, data=data, headers=headers, query=query)
2455         if res is False:
2456             return [], {}
2457         mpd_doc, urlh = res
2458         if mpd_doc is None:
2459             return [], {}
2460         mpd_base_url = base_url(urlh.geturl())
2461
2462         return self._parse_mpd_formats_and_subtitles(
2463             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2464
2465     def _parse_mpd_formats(self, *args, **kwargs):
2466         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2467         if subs:
2468             self.report_warning(bug_reports_message(
2469                 "Ignoring subtitle tracks found in the DASH manifest; "
2470                 "if any subtitle tracks are missing,"
2471             ))
2472         return fmts
2473
2474     def _parse_mpd_formats_and_subtitles(
2475             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2476         """
2477         Parse formats from MPD manifest.
2478         References:
2479          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2480             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2481          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2482         """
2483         if not self._downloader.params.get('dynamic_mpd', True):
2484             if mpd_doc.get('type') == 'dynamic':
2485                 return [], {}
2486
2487         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2488
2489         def _add_ns(path):
2490             return self._xpath_ns(path, namespace)
2491
2492         def is_drm_protected(element):
2493             return element.find(_add_ns('ContentProtection')) is not None
2494
2495         def extract_multisegment_info(element, ms_parent_info):
2496             ms_info = ms_parent_info.copy()
2497
2498             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2499             # common attributes and elements.  We will only extract relevant
2500             # for us.
2501             def extract_common(source):
2502                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2503                 if segment_timeline is not None:
2504                     s_e = segment_timeline.findall(_add_ns('S'))
2505                     if s_e:
2506                         ms_info['total_number'] = 0
2507                         ms_info['s'] = []
2508                         for s in s_e:
2509                             r = int(s.get('r', 0))
2510                             ms_info['total_number'] += 1 + r
2511                             ms_info['s'].append({
2512                                 't': int(s.get('t', 0)),
2513                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2514                                 'd': int(s.attrib['d']),
2515                                 'r': r,
2516                             })
2517                 start_number = source.get('startNumber')
2518                 if start_number:
2519                     ms_info['start_number'] = int(start_number)
2520                 timescale = source.get('timescale')
2521                 if timescale:
2522                     ms_info['timescale'] = int(timescale)
2523                 segment_duration = source.get('duration')
2524                 if segment_duration:
2525                     ms_info['segment_duration'] = float(segment_duration)
2526
2527             def extract_Initialization(source):
2528                 initialization = source.find(_add_ns('Initialization'))
2529                 if initialization is not None:
2530                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2531
2532             segment_list = element.find(_add_ns('SegmentList'))
2533             if segment_list is not None:
2534                 extract_common(segment_list)
2535                 extract_Initialization(segment_list)
2536                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2537                 if segment_urls_e:
2538                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2539             else:
2540                 segment_template = element.find(_add_ns('SegmentTemplate'))
2541                 if segment_template is not None:
2542                     extract_common(segment_template)
2543                     media = segment_template.get('media')
2544                     if media:
2545                         ms_info['media'] = media
2546                     initialization = segment_template.get('initialization')
2547                     if initialization:
2548                         ms_info['initialization'] = initialization
2549                     else:
2550                         extract_Initialization(segment_template)
2551             return ms_info
2552
2553         skip_unplayable = not self._downloader.params.get('allow_unplayable_formats')
2554
2555         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2556         formats = []
2557         subtitles = {}
2558         for period in mpd_doc.findall(_add_ns('Period')):
2559             period_duration = parse_duration(period.get('duration')) or mpd_duration
2560             period_ms_info = extract_multisegment_info(period, {
2561                 'start_number': 1,
2562                 'timescale': 1,
2563             })
2564             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2565                 if skip_unplayable and is_drm_protected(adaptation_set):
2566                     continue
2567                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2568                 for representation in adaptation_set.findall(_add_ns('Representation')):
2569                     if skip_unplayable and is_drm_protected(representation):
2570                         continue
2571                     representation_attrib = adaptation_set.attrib.copy()
2572                     representation_attrib.update(representation.attrib)
2573                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2574                     mime_type = representation_attrib['mimeType']
2575                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2576
2577                     if content_type in ('video', 'audio', 'text'):
2578                         base_url = ''
2579                         for element in (representation, adaptation_set, period, mpd_doc):
2580                             base_url_e = element.find(_add_ns('BaseURL'))
2581                             if base_url_e is not None:
2582                                 base_url = base_url_e.text + base_url
2583                                 if re.match(r'^https?://', base_url):
2584                                     break
2585                         if mpd_base_url and not re.match(r'^https?://', base_url):
2586                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
2587                                 mpd_base_url += '/'
2588                             base_url = mpd_base_url + base_url
2589                         representation_id = representation_attrib.get('id')
2590                         lang = representation_attrib.get('lang')
2591                         url_el = representation.find(_add_ns('BaseURL'))
2592                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2593                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2594                         if content_type in ('video', 'audio'):
2595                             f = {
2596                                 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
2597                                 'manifest_url': mpd_url,
2598                                 'ext': mimetype2ext(mime_type),
2599                                 'width': int_or_none(representation_attrib.get('width')),
2600                                 'height': int_or_none(representation_attrib.get('height')),
2601                                 'tbr': float_or_none(bandwidth, 1000),
2602                                 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2603                                 'fps': int_or_none(representation_attrib.get('frameRate')),
2604                                 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2605                                 'format_note': 'DASH %s' % content_type,
2606                                 'filesize': filesize,
2607                                 'container': mimetype2ext(mime_type) + '_dash',
2608                             }
2609                             f.update(parse_codecs(representation_attrib.get('codecs')))
2610                         elif content_type == 'text':
2611                             f = {
2612                                 'ext': mimetype2ext(mime_type),
2613                                 'manifest_url': mpd_url,
2614                                 'filesize': filesize,
2615                             }
2616                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2617
2618                         def prepare_template(template_name, identifiers):
2619                             tmpl = representation_ms_info[template_name]
2620                             # First of, % characters outside $...$ templates
2621                             # must be escaped by doubling for proper processing
2622                             # by % operator string formatting used further (see
2623                             # https://github.com/ytdl-org/youtube-dl/issues/16867).
2624                             t = ''
2625                             in_template = False
2626                             for c in tmpl:
2627                                 t += c
2628                                 if c == '$':
2629                                     in_template = not in_template
2630                                 elif c == '%' and not in_template:
2631                                     t += c
2632                             # Next, $...$ templates are translated to their
2633                             # %(...) counterparts to be used with % operator
2634                             t = t.replace('$RepresentationID$', representation_id)
2635                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2636                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2637                             t.replace('$$', '$')
2638                             return t
2639
2640                         # @initialization is a regular template like @media one
2641                         # so it should be handled just the same way (see
2642                         # https://github.com/ytdl-org/youtube-dl/issues/11605)
2643                         if 'initialization' in representation_ms_info:
2644                             initialization_template = prepare_template(
2645                                 'initialization',
2646                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2647                                 # $Time$ shall not be included for @initialization thus
2648                                 # only $Bandwidth$ remains
2649                                 ('Bandwidth', ))
2650                             representation_ms_info['initialization_url'] = initialization_template % {
2651                                 'Bandwidth': bandwidth,
2652                             }
2653
2654                         def location_key(location):
2655                             return 'url' if re.match(r'^https?://', location) else 'path'
2656
2657                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2658
2659                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2660                             media_location_key = location_key(media_template)
2661
2662                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2663                             # can't be used at the same time
2664                             if '%(Number' in media_template and 's' not in representation_ms_info:
2665                                 segment_duration = None
2666                                 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2667                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2668                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2669                                 representation_ms_info['fragments'] = [{
2670                                     media_location_key: media_template % {
2671                                         'Number': segment_number,
2672                                         'Bandwidth': bandwidth,
2673                                     },
2674                                     'duration': segment_duration,
2675                                 } for segment_number in range(
2676                                     representation_ms_info['start_number'],
2677                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2678                             else:
2679                                 # $Number*$ or $Time$ in media template with S list available
2680                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2681                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2682                                 representation_ms_info['fragments'] = []
2683                                 segment_time = 0
2684                                 segment_d = None
2685                                 segment_number = representation_ms_info['start_number']
2686
2687                                 def add_segment_url():
2688                                     segment_url = media_template % {
2689                                         'Time': segment_time,
2690                                         'Bandwidth': bandwidth,
2691                                         'Number': segment_number,
2692                                     }
2693                                     representation_ms_info['fragments'].append({
2694                                         media_location_key: segment_url,
2695                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2696                                     })
2697
2698                                 for num, s in enumerate(representation_ms_info['s']):
2699                                     segment_time = s.get('t') or segment_time
2700                                     segment_d = s['d']
2701                                     add_segment_url()
2702                                     segment_number += 1
2703                                     for r in range(s.get('r', 0)):
2704                                         segment_time += segment_d
2705                                         add_segment_url()
2706                                         segment_number += 1
2707                                     segment_time += segment_d
2708                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2709                             # No media template
2710                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2711                             # or any YouTube dashsegments video
2712                             fragments = []
2713                             segment_index = 0
2714                             timescale = representation_ms_info['timescale']
2715                             for s in representation_ms_info['s']:
2716                                 duration = float_or_none(s['d'], timescale)
2717                                 for r in range(s.get('r', 0) + 1):
2718                                     segment_uri = representation_ms_info['segment_urls'][segment_index]
2719                                     fragments.append({
2720                                         location_key(segment_uri): segment_uri,
2721                                         'duration': duration,
2722                                     })
2723                                     segment_index += 1
2724                             representation_ms_info['fragments'] = fragments
2725                         elif 'segment_urls' in representation_ms_info:
2726                             # Segment URLs with no SegmentTimeline
2727                             # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2728                             # https://github.com/ytdl-org/youtube-dl/pull/14844
2729                             fragments = []
2730                             segment_duration = float_or_none(
2731                                 representation_ms_info['segment_duration'],
2732                                 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2733                             for segment_url in representation_ms_info['segment_urls']:
2734                                 fragment = {
2735                                     location_key(segment_url): segment_url,
2736                                 }
2737                                 if segment_duration:
2738                                     fragment['duration'] = segment_duration
2739                                 fragments.append(fragment)
2740                             representation_ms_info['fragments'] = fragments
2741                         # If there is a fragments key available then we correctly recognized fragmented media.
2742                         # Otherwise we will assume unfragmented media with direct access. Technically, such
2743                         # assumption is not necessarily correct since we may simply have no support for
2744                         # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2745                         if 'fragments' in representation_ms_info:
2746                             f.update({
2747                                 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2748                                 'url': mpd_url or base_url,
2749                                 'fragment_base_url': base_url,
2750                                 'fragments': [],
2751                                 'protocol': 'http_dash_segments',
2752                             })
2753                             if 'initialization_url' in representation_ms_info:
2754                                 initialization_url = representation_ms_info['initialization_url']
2755                                 if not f.get('url'):
2756                                     f['url'] = initialization_url
2757                                 f['fragments'].append({location_key(initialization_url): initialization_url})
2758                             f['fragments'].extend(representation_ms_info['fragments'])
2759                         else:
2760                             # Assuming direct URL to unfragmented media.
2761                             f['url'] = base_url
2762                         if content_type in ('video', 'audio'):
2763                             formats.append(f)
2764                         elif content_type == 'text':
2765                             subtitles.setdefault(lang or 'und', []).append(f)
2766                     else:
2767                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2768         return formats, subtitles
2769
2770     def _extract_ism_formats(self, *args, **kwargs):
2771         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2772         if subs:
2773             self.report_warning(bug_reports_message(
2774                 "Ignoring subtitle tracks found in the ISM manifest; "
2775                 "if any subtitle tracks are missing,"
2776             ))
2777         return fmts
2778
2779     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2780         res = self._download_xml_handle(
2781             ism_url, video_id,
2782             note=note or 'Downloading ISM manifest',
2783             errnote=errnote or 'Failed to download ISM manifest',
2784             fatal=fatal, data=data, headers=headers, query=query)
2785         if res is False:
2786             return [], {}
2787         ism_doc, urlh = res
2788         if ism_doc is None:
2789             return [], {}
2790
2791         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
2792
2793     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
2794         """
2795         Parse formats from ISM manifest.
2796         References:
2797          1. [MS-SSTR]: Smooth Streaming Protocol,
2798             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2799         """
2800         if ism_doc.get('IsLive') == 'TRUE':
2801             return [], {}
2802         if (not self._downloader.params.get('allow_unplayable_formats')
2803                 and ism_doc.find('Protection') is not None):
2804             return [], {}
2805
2806         duration = int(ism_doc.attrib['Duration'])
2807         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2808
2809         formats = []
2810         subtitles = {}
2811         for stream in ism_doc.findall('StreamIndex'):
2812             stream_type = stream.get('Type')
2813             if stream_type not in ('video', 'audio', 'text'):
2814                 continue
2815             url_pattern = stream.attrib['Url']
2816             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2817             stream_name = stream.get('Name')
2818             stream_language = stream.get('Language', 'und')
2819             for track in stream.findall('QualityLevel'):
2820                 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
2821                 # TODO: add support for WVC1 and WMAP
2822                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
2823                     self.report_warning('%s is not a supported codec' % fourcc)
2824                     continue
2825                 tbr = int(track.attrib['Bitrate']) // 1000
2826                 # [1] does not mention Width and Height attributes. However,
2827                 # they're often present while MaxWidth and MaxHeight are
2828                 # missing, so should be used as fallbacks
2829                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2830                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2831                 sampling_rate = int_or_none(track.get('SamplingRate'))
2832
2833                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2834                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2835
2836                 fragments = []
2837                 fragment_ctx = {
2838                     'time': 0,
2839                 }
2840                 stream_fragments = stream.findall('c')
2841                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2842                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2843                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2844                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2845                     if not fragment_ctx['duration']:
2846                         try:
2847                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2848                         except IndexError:
2849                             next_fragment_time = duration
2850                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2851                     for _ in range(fragment_repeat):
2852                         fragments.append({
2853                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2854                             'duration': fragment_ctx['duration'] / stream_timescale,
2855                         })
2856                         fragment_ctx['time'] += fragment_ctx['duration']
2857
2858                 format_id = []
2859                 if ism_id:
2860                     format_id.append(ism_id)
2861                 if stream_name:
2862                     format_id.append(stream_name)
2863                 format_id.append(compat_str(tbr))
2864
2865                 if stream_type == 'text':
2866                     subtitles.setdefault(stream_language, []).append({
2867                         'ext': 'ismt',
2868                         'protocol': 'ism',
2869                         'url': ism_url,
2870                         'manifest_url': ism_url,
2871                         'fragments': fragments,
2872                         '_download_params': {
2873                             'stream_type': stream_type,
2874                             'duration': duration,
2875                             'timescale': stream_timescale,
2876                             'fourcc': fourcc,
2877                             'language': stream_language,
2878                             'codec_private_data': track.get('CodecPrivateData'),
2879                         }
2880                     })
2881                 elif stream_type in ('video', 'audio'):
2882                     formats.append({
2883                         'format_id': '-'.join(format_id),
2884                         'url': ism_url,
2885                         'manifest_url': ism_url,
2886                         'ext': 'ismv' if stream_type == 'video' else 'isma',
2887                         'width': width,
2888                         'height': height,
2889                         'tbr': tbr,
2890                         'asr': sampling_rate,
2891                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
2892                         'acodec': 'none' if stream_type == 'video' else fourcc,
2893                         'protocol': 'ism',
2894                         'fragments': fragments,
2895                         '_download_params': {
2896                             'stream_type': stream_type,
2897                             'duration': duration,
2898                             'timescale': stream_timescale,
2899                             'width': width or 0,
2900                             'height': height or 0,
2901                             'fourcc': fourcc,
2902                             'language': stream_language,
2903                             'codec_private_data': track.get('CodecPrivateData'),
2904                             'sampling_rate': sampling_rate,
2905                             'channels': int_or_none(track.get('Channels', 2)),
2906                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2907                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2908                         },
2909                     })
2910         return formats, subtitles
2911
2912     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
2913         def absolute_url(item_url):
2914             return urljoin(base_url, item_url)
2915
2916         def parse_content_type(content_type):
2917             if not content_type:
2918                 return {}
2919             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2920             if ctr:
2921                 mimetype, codecs = ctr.groups()
2922                 f = parse_codecs(codecs)
2923                 f['ext'] = mimetype2ext(mimetype)
2924                 return f
2925             return {}
2926
2927         def _media_formats(src, cur_media_type, type_info={}):
2928             full_url = absolute_url(src)
2929             ext = type_info.get('ext') or determine_ext(full_url)
2930             if ext == 'm3u8':
2931                 is_plain_url = False
2932                 formats = self._extract_m3u8_formats(
2933                     full_url, video_id, ext='mp4',
2934                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2935                     preference=preference, quality=quality, fatal=False)
2936             elif ext == 'mpd':
2937                 is_plain_url = False
2938                 formats = self._extract_mpd_formats(
2939                     full_url, video_id, mpd_id=mpd_id, fatal=False)
2940             else:
2941                 is_plain_url = True
2942                 formats = [{
2943                     'url': full_url,
2944                     'vcodec': 'none' if cur_media_type == 'audio' else None,
2945                 }]
2946             return is_plain_url, formats
2947
2948         entries = []
2949         # amp-video and amp-audio are very similar to their HTML5 counterparts
2950         # so we wll include them right here (see
2951         # https://www.ampproject.org/docs/reference/components/amp-video)
2952         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
2953         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
2954         media_tags = [(media_tag, media_tag_name, media_type, '')
2955                       for media_tag, media_tag_name, media_type
2956                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
2957         media_tags.extend(re.findall(
2958             # We only allow video|audio followed by a whitespace or '>'.
2959             # Allowing more characters may end up in significant slow down (see
2960             # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
2961             # http://www.porntrex.com/maps/videositemap.xml).
2962             r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
2963         for media_tag, _, media_type, media_content in media_tags:
2964             media_info = {
2965                 'formats': [],
2966                 'subtitles': {},
2967             }
2968             media_attributes = extract_attributes(media_tag)
2969             src = strip_or_none(media_attributes.get('src'))
2970             if src:
2971                 _, formats = _media_formats(src, media_type)
2972                 media_info['formats'].extend(formats)
2973             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
2974             if media_content:
2975                 for source_tag in re.findall(r'<source[^>]+>', media_content):
2976                     s_attr = extract_attributes(source_tag)
2977                     # data-video-src and data-src are non standard but seen
2978                     # several times in the wild
2979                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
2980                     if not src:
2981                         continue
2982                     f = parse_content_type(s_attr.get('type'))
2983                     is_plain_url, formats = _media_formats(src, media_type, f)
2984                     if is_plain_url:
2985                         # width, height, res, label and title attributes are
2986                         # all not standard but seen several times in the wild
2987                         labels = [
2988                             s_attr.get(lbl)
2989                             for lbl in ('label', 'title')
2990                             if str_or_none(s_attr.get(lbl))
2991                         ]
2992                         width = int_or_none(s_attr.get('width'))
2993                         height = (int_or_none(s_attr.get('height'))
2994                                   or int_or_none(s_attr.get('res')))
2995                         if not width or not height:
2996                             for lbl in labels:
2997                                 resolution = parse_resolution(lbl)
2998                                 if not resolution:
2999                                     continue
3000                                 width = width or resolution.get('width')
3001                                 height = height or resolution.get('height')
3002                         for lbl in labels:
3003                             tbr = parse_bitrate(lbl)
3004                             if tbr:
3005                                 break
3006                         else:
3007                             tbr = None
3008                         f.update({
3009                             'width': width,
3010                             'height': height,
3011                             'tbr': tbr,
3012                             'format_id': s_attr.get('label') or s_attr.get('title'),
3013                         })
3014                         f.update(formats[0])
3015                         media_info['formats'].append(f)
3016                     else:
3017                         media_info['formats'].extend(formats)
3018                 for track_tag in re.findall(r'<track[^>]+>', media_content):
3019                     track_attributes = extract_attributes(track_tag)
3020                     kind = track_attributes.get('kind')
3021                     if not kind or kind in ('subtitles', 'captions'):
3022                         src = strip_or_none(track_attributes.get('src'))
3023                         if not src:
3024                             continue
3025                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3026                         media_info['subtitles'].setdefault(lang, []).append({
3027                             'url': absolute_url(src),
3028                         })
3029             for f in media_info['formats']:
3030                 f.setdefault('http_headers', {})['Referer'] = base_url
3031             if media_info['formats'] or media_info['subtitles']:
3032                 entries.append(media_info)
3033         return entries
3034
3035     def _extract_akamai_formats(self, *args, **kwargs):
3036         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3037         if subs:
3038             self.report_warning(bug_reports_message(
3039                 "Ignoring subtitle tracks found in the manifests; "
3040                 "if any subtitle tracks are missing,"
3041             ))
3042         return fmts
3043
3044     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3045         signed = 'hdnea=' in manifest_url
3046         if not signed:
3047             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3048             manifest_url = re.sub(
3049                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3050                 '', manifest_url).strip('?')
3051
3052         formats = []
3053         subtitles = {}
3054
3055         hdcore_sign = 'hdcore=3.7.0'
3056         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3057         hds_host = hosts.get('hds')
3058         if hds_host:
3059             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3060         if 'hdcore=' not in f4m_url:
3061             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3062         f4m_formats = self._extract_f4m_formats(
3063             f4m_url, video_id, f4m_id='hds', fatal=False)
3064         for entry in f4m_formats:
3065             entry.update({'extra_param_to_segment_url': hdcore_sign})
3066         formats.extend(f4m_formats)
3067
3068         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3069         hls_host = hosts.get('hls')
3070         if hls_host:
3071             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3072         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3073             m3u8_url, video_id, 'mp4', 'm3u8_native',
3074             m3u8_id='hls', fatal=False)
3075         formats.extend(m3u8_formats)
3076         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3077
3078         http_host = hosts.get('http')
3079         if http_host and m3u8_formats and not signed:
3080             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3081             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3082             qualities_length = len(qualities)
3083             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3084                 i = 0
3085                 for f in m3u8_formats:
3086                     if f['vcodec'] != 'none':
3087                         for protocol in ('http', 'https'):
3088                             http_f = f.copy()
3089                             del http_f['manifest_url']
3090                             http_url = re.sub(
3091                                 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
3092                             http_f.update({
3093                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3094                                 'url': http_url,
3095                                 'protocol': protocol,
3096                             })
3097                             formats.append(http_f)
3098                         i += 1
3099
3100         return formats, subtitles
3101
3102     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3103         query = compat_urlparse.urlparse(url).query
3104         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3105         mobj = re.search(
3106             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3107         url_base = mobj.group('url')
3108         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3109         formats = []
3110
3111         def manifest_url(manifest):
3112             m_url = '%s/%s' % (http_base_url, manifest)
3113             if query:
3114                 m_url += '?%s' % query
3115             return m_url
3116
3117         if 'm3u8' not in skip_protocols:
3118             formats.extend(self._extract_m3u8_formats(
3119                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3120                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3121         if 'f4m' not in skip_protocols:
3122             formats.extend(self._extract_f4m_formats(
3123                 manifest_url('manifest.f4m'),
3124                 video_id, f4m_id='hds', fatal=False))
3125         if 'dash' not in skip_protocols:
3126             formats.extend(self._extract_mpd_formats(
3127                 manifest_url('manifest.mpd'),
3128                 video_id, mpd_id='dash', fatal=False))
3129         if re.search(r'(?:/smil:|\.smil)', url_base):
3130             if 'smil' not in skip_protocols:
3131                 rtmp_formats = self._extract_smil_formats(
3132                     manifest_url('jwplayer.smil'),
3133                     video_id, fatal=False)
3134                 for rtmp_format in rtmp_formats:
3135                     rtsp_format = rtmp_format.copy()
3136                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3137                     del rtsp_format['play_path']
3138                     del rtsp_format['ext']
3139                     rtsp_format.update({
3140                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3141                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3142                         'protocol': 'rtsp',
3143                     })
3144                     formats.extend([rtmp_format, rtsp_format])
3145         else:
3146             for protocol in ('rtmp', 'rtsp'):
3147                 if protocol not in skip_protocols:
3148                     formats.append({
3149                         'url': '%s:%s' % (protocol, url_base),
3150                         'format_id': protocol,
3151                         'protocol': protocol,
3152                     })
3153         return formats
3154
3155     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3156         mobj = re.search(
3157             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3158             webpage)
3159         if mobj:
3160             try:
3161                 jwplayer_data = self._parse_json(mobj.group('options'),
3162                                                  video_id=video_id,
3163                                                  transform_source=transform_source)
3164             except ExtractorError:
3165                 pass
3166             else:
3167                 if isinstance(jwplayer_data, dict):
3168                     return jwplayer_data
3169
3170     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3171         jwplayer_data = self._find_jwplayer_data(
3172             webpage, video_id, transform_source=js_to_json)
3173         return self._parse_jwplayer_data(
3174             jwplayer_data, video_id, *args, **kwargs)
3175
3176     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3177                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3178         # JWPlayer backward compatibility: flattened playlists
3179         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3180         if 'playlist' not in jwplayer_data:
3181             jwplayer_data = {'playlist': [jwplayer_data]}
3182
3183         entries = []
3184
3185         # JWPlayer backward compatibility: single playlist item
3186         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3187         if not isinstance(jwplayer_data['playlist'], list):
3188             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3189
3190         for video_data in jwplayer_data['playlist']:
3191             # JWPlayer backward compatibility: flattened sources
3192             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3193             if 'sources' not in video_data:
3194                 video_data['sources'] = [video_data]
3195
3196             this_video_id = video_id or video_data['mediaid']
3197
3198             formats = self._parse_jwplayer_formats(
3199                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3200                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3201
3202             subtitles = {}
3203             tracks = video_data.get('tracks')
3204             if tracks and isinstance(tracks, list):
3205                 for track in tracks:
3206                     if not isinstance(track, dict):
3207                         continue
3208                     track_kind = track.get('kind')
3209                     if not track_kind or not isinstance(track_kind, compat_str):
3210                         continue
3211                     if track_kind.lower() not in ('captions', 'subtitles'):
3212                         continue
3213                     track_url = urljoin(base_url, track.get('file'))
3214                     if not track_url:
3215                         continue
3216                     subtitles.setdefault(track.get('label') or 'en', []).append({
3217                         'url': self._proto_relative_url(track_url)
3218                     })
3219
3220             entry = {
3221                 'id': this_video_id,
3222                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3223                 'description': clean_html(video_data.get('description')),
3224                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3225                 'timestamp': int_or_none(video_data.get('pubdate')),
3226                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3227                 'subtitles': subtitles,
3228             }
3229             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3230             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3231                 entry.update({
3232                     '_type': 'url_transparent',
3233                     'url': formats[0]['url'],
3234                 })
3235             else:
3236                 self._sort_formats(formats)
3237                 entry['formats'] = formats
3238             entries.append(entry)
3239         if len(entries) == 1:
3240             return entries[0]
3241         else:
3242             return self.playlist_result(entries)
3243
3244     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3245                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3246         urls = []
3247         formats = []
3248         for source in jwplayer_sources_data:
3249             if not isinstance(source, dict):
3250                 continue
3251             source_url = urljoin(
3252                 base_url, self._proto_relative_url(source.get('file')))
3253             if not source_url or source_url in urls:
3254                 continue
3255             urls.append(source_url)
3256             source_type = source.get('type') or ''
3257             ext = mimetype2ext(source_type) or determine_ext(source_url)
3258             if source_type == 'hls' or ext == 'm3u8':
3259                 formats.extend(self._extract_m3u8_formats(
3260                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3261                     m3u8_id=m3u8_id, fatal=False))
3262             elif source_type == 'dash' or ext == 'mpd':
3263                 formats.extend(self._extract_mpd_formats(
3264                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3265             elif ext == 'smil':
3266                 formats.extend(self._extract_smil_formats(
3267                     source_url, video_id, fatal=False))
3268             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3269             elif source_type.startswith('audio') or ext in (
3270                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3271                 formats.append({
3272                     'url': source_url,
3273                     'vcodec': 'none',
3274                     'ext': ext,
3275                 })
3276             else:
3277                 height = int_or_none(source.get('height'))
3278                 if height is None:
3279                     # Often no height is provided but there is a label in
3280                     # format like "1080p", "720p SD", or 1080.
3281                     height = int_or_none(self._search_regex(
3282                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3283                         'height', default=None))
3284                 a_format = {
3285                     'url': source_url,
3286                     'width': int_or_none(source.get('width')),
3287                     'height': height,
3288                     'tbr': int_or_none(source.get('bitrate')),
3289                     'ext': ext,
3290                 }
3291                 if source_url.startswith('rtmp'):
3292                     a_format['ext'] = 'flv'
3293                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3294                     # of jwplayer.flash.swf
3295                     rtmp_url_parts = re.split(
3296                         r'((?:mp4|mp3|flv):)', source_url, 1)
3297                     if len(rtmp_url_parts) == 3:
3298                         rtmp_url, prefix, play_path = rtmp_url_parts
3299                         a_format.update({
3300                             'url': rtmp_url,
3301                             'play_path': prefix + play_path,
3302                         })
3303                     if rtmp_params:
3304                         a_format.update(rtmp_params)
3305                 formats.append(a_format)
3306         return formats
3307
3308     def _live_title(self, name):
3309         """ Generate the title for a live video """
3310         now = datetime.datetime.now()
3311         now_str = now.strftime('%Y-%m-%d %H:%M')
3312         return name + ' ' + now_str
3313
3314     def _int(self, v, name, fatal=False, **kwargs):
3315         res = int_or_none(v, **kwargs)
3316         if 'get_attr' in kwargs:
3317             print(getattr(v, kwargs['get_attr']))
3318         if res is None:
3319             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3320             if fatal:
3321                 raise ExtractorError(msg)
3322             else:
3323                 self.report_warning(msg)
3324         return res
3325
3326     def _float(self, v, name, fatal=False, **kwargs):
3327         res = float_or_none(v, **kwargs)
3328         if res is None:
3329             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3330             if fatal:
3331                 raise ExtractorError(msg)
3332             else:
3333                 self.report_warning(msg)
3334         return res
3335
3336     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3337                     path='/', secure=False, discard=False, rest={}, **kwargs):
3338         cookie = compat_cookiejar_Cookie(
3339             0, name, value, port, port is not None, domain, True,
3340             domain.startswith('.'), path, True, secure, expire_time,
3341             discard, None, None, rest)
3342         self._downloader.cookiejar.set_cookie(cookie)
3343
3344     def _get_cookies(self, url):
3345         """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3346         req = sanitized_Request(url)
3347         self._downloader.cookiejar.add_cookie_header(req)
3348         return compat_cookies_SimpleCookie(req.get_header('Cookie'))
3349
3350     def _apply_first_set_cookie_header(self, url_handle, cookie):
3351         """
3352         Apply first Set-Cookie header instead of the last. Experimental.
3353
3354         Some sites (e.g. [1-3]) may serve two cookies under the same name
3355         in Set-Cookie header and expect the first (old) one to be set rather
3356         than second (new). However, as of RFC6265 the newer one cookie
3357         should be set into cookie store what actually happens.
3358         We will workaround this issue by resetting the cookie to
3359         the first one manually.
3360         1. https://new.vk.com/
3361         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3362         3. https://learning.oreilly.com/
3363         """
3364         for header, cookies in url_handle.headers.items():
3365             if header.lower() != 'set-cookie':
3366                 continue
3367             if sys.version_info[0] >= 3:
3368                 cookies = cookies.encode('iso-8859-1')
3369             cookies = cookies.decode('utf-8')
3370             cookie_value = re.search(
3371                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3372             if cookie_value:
3373                 value, domain = cookie_value.groups()
3374                 self._set_cookie(domain, cookie, value)
3375                 break
3376
3377     def get_testcases(self, include_onlymatching=False):
3378         t = getattr(self, '_TEST', None)
3379         if t:
3380             assert not hasattr(self, '_TESTS'), \
3381                 '%s has _TEST and _TESTS' % type(self).__name__
3382             tests = [t]
3383         else:
3384             tests = getattr(self, '_TESTS', [])
3385         for t in tests:
3386             if not include_onlymatching and t.get('only_matching', False):
3387                 continue
3388             t['name'] = type(self).__name__[:-len('IE')]
3389             yield t
3390
3391     def is_suitable(self, age_limit):
3392         """ Test whether the extractor is generally suitable for the given
3393         age limit (i.e. pornographic sites are not, all others usually are) """
3394
3395         any_restricted = False
3396         for tc in self.get_testcases(include_onlymatching=False):
3397             if tc.get('playlist', []):
3398                 tc = tc['playlist'][0]
3399             is_restricted = age_restricted(
3400                 tc.get('info_dict', {}).get('age_limit'), age_limit)
3401             if not is_restricted:
3402                 return True
3403             any_restricted = any_restricted or is_restricted
3404         return not any_restricted
3405
3406     def extract_subtitles(self, *args, **kwargs):
3407         if (self._downloader.params.get('writesubtitles', False)
3408                 or self._downloader.params.get('listsubtitles')):
3409             return self._get_subtitles(*args, **kwargs)
3410         return {}
3411
3412     def _get_subtitles(self, *args, **kwargs):
3413         raise NotImplementedError('This method must be implemented by subclasses')
3414
3415     @staticmethod
3416     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3417         """ Merge subtitle items for one language. Items with duplicated URLs
3418         will be dropped. """
3419         list1_urls = set([item['url'] for item in subtitle_list1])
3420         ret = list(subtitle_list1)
3421         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
3422         return ret
3423
3424     @classmethod
3425     def _merge_subtitles(cls, *dicts, **kwargs):
3426         """ Merge subtitle dictionaries, language by language. """
3427
3428         target = (lambda target=None: target)(**kwargs)
3429         # The above lambda extracts the keyword argument 'target' from kwargs
3430         # while ensuring there are no stray ones. When Python 2 support
3431         # is dropped, remove it and change the function signature to:
3432         #
3433         #     def _merge_subtitles(cls, *dicts, target=None):
3434
3435         if target is None:
3436             target = {}
3437         for d in dicts:
3438             for lang, subs in d.items():
3439                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3440         return target
3441
3442     def extract_automatic_captions(self, *args, **kwargs):
3443         if (self._downloader.params.get('writeautomaticsub', False)
3444                 or self._downloader.params.get('listsubtitles')):
3445             return self._get_automatic_captions(*args, **kwargs)
3446         return {}
3447
3448     def _get_automatic_captions(self, *args, **kwargs):
3449         raise NotImplementedError('This method must be implemented by subclasses')
3450
3451     def mark_watched(self, *args, **kwargs):
3452         if (self._downloader.params.get('mark_watched', False)
3453                 and (self._get_login_info()[0] is not None
3454                      or self._downloader.params.get('cookiefile') is not None)):
3455             self._mark_watched(*args, **kwargs)
3456
3457     def _mark_watched(self, *args, **kwargs):
3458         raise NotImplementedError('This method must be implemented by subclasses')
3459
3460     def geo_verification_headers(self):
3461         headers = {}
3462         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
3463         if geo_verification_proxy:
3464             headers['Ytdl-request-proxy'] = geo_verification_proxy
3465         return headers
3466
3467     def _generic_id(self, url):
3468         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3469
3470     def _generic_title(self, url):
3471         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3472
3473     @staticmethod
3474     def _availability(is_private, needs_premium, needs_subscription, needs_auth, is_unlisted):
3475         all_known = all(map(
3476             lambda x: x is not None,
3477             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3478         return (
3479             'private' if is_private
3480             else 'premium_only' if needs_premium
3481             else 'subscriber_only' if needs_subscription
3482             else 'needs_auth' if needs_auth
3483             else 'unlisted' if is_unlisted
3484             else 'public' if all_known
3485             else None)
3486
3487
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search query extractors.

    Accepted "URLs" have the form _SEARCH_KEY<prefix>:<query>, where the
    prefix is empty (one result), a positive number (that many results) or
    'all' (_MAX_RESULTS results).
    Subclasses should define _SEARCH_KEY and _MAX_RESULTS and implement
    _get_n_results().
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex matching this extractor's search queries."""
        pattern = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
        return pattern

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query for this extractor."""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the search query and fetch the requested number of results."""
        matched = re.match(self._make_valid_url(), query)
        if matched is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = matched.group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            # Requests above the maximum are capped, with a warning
            self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        """The search key prefix (_SEARCH_KEY) of this extractor."""
        return self._SEARCH_KEY