# coding: utf-8
from __future__ import unicode_literals

import base64
import datetime
import hashlib
import json
import netrc
import os
import random
import re
import sys
import time
import math

from ..compat import (
    compat_cookiejar_Cookie,
    compat_cookies_SimpleCookie,
    compat_etree_Element,
    compat_etree_fromstring,
    compat_getpass,
    compat_integer_types,
    compat_http_client,
    compat_os_name,
    compat_str,
    compat_urllib_error,
    compat_urllib_parse_unquote,
    compat_urllib_parse_urlencode,
    compat_urllib_request,
    compat_urlparse,
    compat_xml_parse_error,
)
from ..downloader import FileDownloader
from ..downloader.f4m import (
    get_base_url,
    remove_encrypted_media,
)
from ..utils import (
    NO_DEFAULT,
    age_restricted,
    base_url,
    bug_reports_message,
    clean_html,
    compiled_regex_type,
    determine_ext,
    determine_protocol,
    dict_get,
    error_to_compat_str,
    ExtractorError,
    extract_attributes,
    fix_xml_ampersands,
    float_or_none,
    GeoRestrictedError,
    GeoUtils,
    int_or_none,
    js_to_json,
    JSON_LD_RE,
    mimetype2ext,
    network_exceptions,
    orderedSet,
    parse_bitrate,
    parse_codecs,
    parse_duration,
    parse_iso8601,
    parse_m3u8_attributes,
    parse_resolution,
    RegexNotFoundError,
    sanitized_Request,
    sanitize_filename,
    str_or_none,
    str_to_int,
    strip_or_none,
    unescapeHTML,
    unified_strdate,
    unified_timestamp,
    update_Request,
    update_url_query,
    urljoin,
    url_basename,
    url_or_none,
    xpath_element,
    xpath_text,
    xpath_with_ns,
)


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the YoutubeDL. The YoutubeDL processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The type field determines the type of the result.
    By far the most common value (and the default if _type is missing) is
    "video", which indicates a single video.

    For a video, the dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        The mandatory URL representing the media:
                                   for plain file media - HTTP URL of this file,
                                   for RTMP - RTMP URL,
                                   for HLS - URL of the M3U8 media playlist,
                                   for HDS - URL of the F4M manifest,
                                   for DASH
                                     - HTTP URL to plain file media (in case of
                                       unfragmented media)
                                     - URL of the MPD manifest or base URL
                                       representing the media if MPD manifest
                                       is parsed from a string (in case of
                                       fragmented media)
                                   for MSS - URL of the ISM manifest.
                    * manifest_url
                                 The URL of the manifest file in case of
                                 fragmented media:
                                   for HLS - URL of the M3U8 master playlist,
                                   for HDS - URL of the F4M manifest,
                                   for DASH - URL of the MPD manifest,
                                   for MSS - URL of the ISM manifest.
                    * ext        Will be calculated from URL if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note
                                 Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * fps        Frame rate
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx
                                 An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp", "rtmp_ffmpeg",
                                 "rtmpe", "m3u8", "m3u8_native" or
                                 "http_dash_segments".
                    * fragment_base_url
                                 Base URL for fragments. Each fragment's path
                                 value (if present) will be relative to
                                 this URL.
                    * fragments  A list of fragments of a fragmented media.
                                 Each fragment entry must contain either a URL
                                 or a path. If a URL is present it should be
                                 considered by a client. Otherwise both path and
                                 fragment_base_url must be present. Here is
                                 the list of all potential fields:
                                 * "url" - fragment's URL
                                 * "path" - fragment's path relative to
                                            fragment_base_url
                                 * "duration" (optional, int or float)
                                 * "filesize" (optional, int)
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                                 < -1000 to hide the format (if there is
                                 another one which is strictly better)
                    * language   Language code, e.g. "de" or "en-US".
                    * language_preference
                                 Is this in the language mentioned in
                                 the URL?
                                 10 if it's what the URL is about,
                                 -1 for default (don't know),
                                 -10 otherwise, other values reserved for now.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * source_preference
                                 Order number for this video source
                                 (quality takes higher priority)
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * http_headers
                                 A dictionary of additional HTTP headers
                                 to add to the request.
                    * stretched_ratio
                                 If given and not 1, indicates that the
                                 video's pixels are not square.
                                 width : height ratio as float.
                    * no_resume  The server does not support resuming the
                                 (HTTP or RTMP) download. Boolean.
                    * downloader_options
                                 A dictionary of downloader options as
                                 described in FileDownloader
                    RTMP formats can also have the additional fields: page_url,
                    app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
                    rtmp_protocol, rtmp_real_time

    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

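    As an illustration, a result carrying a single hypothetical format entry
    might look like this (all values are made up):

        {
            'id': '4234987',
            'title': 'Dancing naked mole rats',
            'formats': [{
                'url': 'https://example.com/video_480p.mp4',
                'format_id': '480p',
                'ext': 'mp4',
                'width': 854,
                'height': 480,
                'vcodec': 'avc1.4d401f',
                'acodec': 'mp4a.40.2',
                'protocol': 'https',
            }],
        }
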
    The following fields are optional:

    alt_title:      A secondary title of the video.
    display_id:     An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "id" (optional, string) - Thumbnail format ID
                        * "url"
                        * "preference" (optional, int) - quality of the image
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
                        * "filesize" (optional, int)
    thumbnail:      Full URL to a video thumbnail image.
    description:    Full video description.
    uploader:       Full name of the video uploader.
    license:        License name the video is licensed under.
    creator:        The creator of the video.
    release_timestamp: UNIX timestamp of the moment the video was released.
    release_date:   The date (YYYYMMDD) when the video was released.
    timestamp:      UNIX timestamp of the moment the video was uploaded.
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp.
    uploader_id:    Nickname or id of the video uploader.
    uploader_url:   Full URL to a personal webpage of the video uploader.
    channel:        Full name of the channel the video is uploaded on.
                    Note that channel fields may or may not repeat uploader
                    fields. This depends on a particular extractor.
    channel_id:     Id of the channel.
    channel_url:    Full URL to a channel webpage.
    location:       Physical location where the video was filmed.
    subtitles:      The available subtitles as a dictionary in the format
                    {tag: subformats}. "tag" is usually a language code, and
                    "subformats" is a list sorted from lower to higher
                    preference, each element is a dictionary with the "ext"
                    entry and one of:
                        * "data": The subtitles file contents
                        * "url": A URL pointing to the subtitles file
                    It can optionally also have:
                        * "name": Name or description of the subtitles
                    "ext" will be calculated from URL if missing
    automatic_captions: Like 'subtitles'; contains automatically generated
                    captions instead of normal subtitles
    duration:       Length of the video in seconds, as an integer or float.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends
                    on the webpage
    comment_count:  Number of comments on the video
    comments:       A list of comments, each with one or more of the following
                    properties (all but one of text or html optional):
                        * "author" - human-readable name of the comment author
                        * "author_id" - user ID of the comment author
                        * "author_thumbnail" - The thumbnail of the comment author
                        * "id" - Comment ID
                        * "html" - Comment as HTML
                        * "text" - Plain text of the comment
                        * "timestamp" - UNIX timestamp of comment
                        * "parent" - ID of the comment this one is replying to.
                                     Set to "root" to indicate that this is a
                                     comment to the original video.
                        * "like_count" - Number of positive ratings of the comment
                        * "dislike_count" - Number of negative ratings of the comment
                        * "is_favorited" - Whether the comment is marked as
                                           favorite by the video uploader
                        * "author_is_uploader" - Whether the comment is made by
                                                 the video uploader
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage, if given to yt-dlp it
                    should allow getting the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]
    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.
    was_live:       True, False, or None (=unknown). Whether this video was
                    originally a live stream.
    start_time:     Time in seconds where the reproduction should start, as
                    specified in the URL.
    end_time:       Time in seconds where the reproduction should end, as
                    specified in the URL.
    chapters:       A list of dictionaries, with the following entries:
                        * "start_time" - The start time of the chapter in seconds
                        * "end_time" - The end time of the chapter in seconds
                        * "title" (optional, string)
    playable_in_embed: Whether this video is allowed to play in embedded
                    players on other sites. Can be True (=always allowed),
                    False (=never allowed), None (=unknown), or a string
                    specifying the criteria for embedability (e.g. 'whitelist')
    availability:   Under what condition the video is available. One of
                    'private', 'premium_only', 'subscriber_only', 'needs_auth',
                    'unlisted' or 'public'. Use 'InfoExtractor._availability'
                    to set it
    __post_extractor: A function to be called just before the metadata is
                    written to either disk, logger or console. The function
                    must return a dict which will be added to the info_dict.
                    This is useful for additional information that is
                    time-consuming to extract. Note that the fields thus
                    extracted will not be available to the output template and
                    match_filter. So, only "comments" and "comment_count" are
                    currently allowed to be extracted via this method.

    The following fields should only be used when the video belongs to some logical
    chapter or section:

    chapter:        Name or title of the chapter the video belongs to.
    chapter_number: Number of the chapter the video belongs to, as an integer.
    chapter_id:     Id of the chapter the video belongs to, as a unicode string.

    The following fields should only be used when the video is an episode of some
    series, programme or podcast:

    series:         Title of the series or programme the video episode belongs to.
    season:         Title of the season the video episode belongs to.
    season_number:  Number of the season the video episode belongs to, as an integer.
    season_id:      Id of the season the video episode belongs to, as a unicode string.
    episode:        Title of the video episode. Unlike the mandatory video title field,
                    this field should denote the exact title of the video episode
                    without any kind of decoration.
    episode_number: Number of the video episode within a season, as an integer.
    episode_id:     Id of the video episode, as a unicode string.

    The following fields should only be used when the media is a track or a part of
    a music album:

    track:          Title of the track.
    track_number:   Number of the track within an album or a disc, as an integer.
    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
                    as a unicode string.
    artist:         Artist(s) of the track.
    genre:          Genre(s) of the track.
    album:          Title of the album the track belongs to.
    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
    album_artist:   List of all artists who appeared on the album (e.g.
                    "Ash Borer / Fell Voices" or "Various Artists", useful for splits
                    and compilations).
    disc_number:    Number of the disc or other physical medium the track belongs to,
                    as an integer.
    release_year:   Year (YYYY) when the album was released.

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.
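
    As an illustration of the structures above, "thumbnails" and "subtitles"
    entries might look like this in practice (hypothetical values):

        'thumbnails': [{
            'url': 'https://example.com/thumb_1280x720.jpg',
            'width': 1280,
            'height': 720,
        }],
        'subtitles': {
            'en': [{'url': 'https://example.com/subs.en.vtt', 'ext': 'vtt'}],
        },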


    _type "playlist" indicates multiple videos.
    There must be a key "entries", which is a list, an iterable, or a PagedList
    object, each element of which is a valid dictionary by this specification.

    Additionally, playlists can have "id", "title", and any other relevant
    attributes with the same semantics as videos (see above).
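
    For example, a hypothetical playlist result might look like:

        {
            '_type': 'playlist',
            'id': 'channel-123',
            'title': 'Uploads from Example Channel',
            'entries': [...],  # video dicts, or "url"/"url_transparent" dicts
        }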


    _type "multi_video" indicates that there are multiple videos that
    form a single show, for example, multiple acts of an opera or TV episode.
    It must have an entries key like a playlist and contain all the keys
    required for a video at the same time.


    _type "url" indicates that the video must be extracted from another
    location, possibly by a different extractor. Its only required key is:
    "url" - the next URL to extract.
    The key "ie_key" can be set to the class name (minus the trailing "IE",
    e.g. "Youtube") if the extractor class is known in advance.
    Additionally, the dictionary may have any properties of the resolved entity
    known in advance, for example "title" if the title of the referred video is
    known ahead of time.
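
    For example (using the well-known youtube-dl test video):

        {
            '_type': 'url',
            'url': 'https://www.youtube.com/watch?v=BaW_jenozKc',
            'ie_key': 'Youtube',
        }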


    _type "url_transparent" entities have the same specification as "url", but
    indicate that the given additional information is more precise than the one
    associated with the resolved URL.
    This is useful when a site employs a video service that hosts the video and
    its technical metadata, but that video service does not embed a useful
    title, description etc.
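
    For example, a page that embeds a third-party player but knows the real
    title could return (hypothetical values):

        {
            '_type': 'url_transparent',
            'url': 'https://videoservice.example/embed/abc123',
            'title': 'Title taken from the embedding page',
        }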


    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _GEO_BYPASS attribute may be set to False in order to disable
    geo restriction bypass mechanisms for a particular extractor.
    Though it won't disable explicit geo restriction bypass based on
    country code provided with geo_bypass_country.

    _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
    countries for this extractor. One of these countries will be used by
    the geo restriction bypass mechanism right away in order to bypass
    geo restriction, of course, if the mechanism is not disabled.

    _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
    IP blocks in CIDR notation for this extractor. One of these IP blocks
    will be used by the geo restriction bypass mechanism similarly
    to _GEO_COUNTRIES.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _x_forwarded_for_ip = None
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    _WORKING = True

    _LOGIN_HINTS = {
        'any': 'Use --cookies, --username and --password or --netrc to provide account credentials',
        'cookies': (
            'Use --cookies for the authentication. '
            'See https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl for how to pass cookies'),
        'password': 'Use --username and --password or --netrc to provide account credentials',
    }

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self._x_forwarded_for_ip = None
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def _match_id(cls, url):
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        m = cls._VALID_URL_RE.match(url)
        assert m
        return compat_str(m.group('id'))

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._initialize_geo_bypass({
            'countries': self._GEO_COUNTRIES,
            'ip_blocks': self._GEO_IP_BLOCKS,
        })
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def _initialize_geo_bypass(self, geo_bypass_context):
        """
        Initialize geo restriction bypass mechanism.

        This method is used to initialize the geo bypass mechanism based on
        faking the X-Forwarded-For HTTP header. A random country from the
        provided country list is selected and a random IP belonging to this
        country is generated. This IP will be passed as the X-Forwarded-For
        HTTP header in all subsequent HTTP requests.

        This method will be used for initial geo bypass mechanism initialization
        during the instance initialization with _GEO_COUNTRIES and
        _GEO_IP_BLOCKS.

        You may also manually call it from the extractor's code if geo bypass
        information is not available beforehand (e.g. obtained during
        extraction) or due to some other reason. In this case you should pass
        this information in the geo bypass context passed as the first
        argument. It may contain the following fields:

        countries:  List of geo unrestricted countries (similar
                    to _GEO_COUNTRIES)
        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                    (similar to _GEO_IP_BLOCKS)

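        For example, an extractor that only learns the unrestricted countries
        during extraction might call (hypothetical values):

            self._initialize_geo_bypass({
                'countries': ['DE', 'FR'],
                'ip_blocks': ['203.0.113.0/24'],
            })
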
        """
        if not self._x_forwarded_for_ip:

            # Geo bypass mechanism is explicitly disabled by user
            if not self.get_param('geo_bypass', True):
                return

            if not geo_bypass_context:
                geo_bypass_context = {}

            # Backward compatibility: previously _initialize_geo_bypass
            # expected a list of countries, some 3rd party code may still use
            # it this way
            if isinstance(geo_bypass_context, (list, tuple)):
                geo_bypass_context = {
                    'countries': geo_bypass_context,
                }

            # The whole point of geo bypass mechanism is to fake IP
            # as X-Forwarded-For HTTP header based on some IP block or
            # country code.

            # Path 1: bypassing based on IP block in CIDR notation

            # Explicit IP block specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            ip_block = self.get_param('geo_bypass_ip_block', None)

            # Otherwise use random IP block from geo bypass context but only
            # if extractor is known as geo bypassable
            if not ip_block:
                ip_blocks = geo_bypass_context.get('ip_blocks')
                if self._GEO_BYPASS and ip_blocks:
                    ip_block = random.choice(ip_blocks)

            if ip_block:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
                self._downloader.write_debug(
                    'Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
                return

            # Path 2: bypassing based on country code

            # Explicit country code specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            country = self.get_param('geo_bypass_country', None)

            # Otherwise use random country code from geo bypass context but
            # only if extractor is known as geo bypassable
            if not country:
                countries = geo_bypass_context.get('countries')
                if self._GEO_BYPASS and countries:
                    country = random.choice(countries)

            if country:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
                self._downloader.write_debug(
                    'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))

    def extract(self, url):
        """Extracts URL information and returns it as a dict."""
        try:
            for _ in range(2):
                try:
                    self.initialize()
                    self.write_debug('Extracting URL: %s' % url)
                    ie_result = self._real_extract(url)
                    if ie_result is None:
                        return None
                    if self._x_forwarded_for_ip:
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    subtitles = ie_result.get('subtitles')
                    if (subtitles and 'live_chat' in subtitles
                            and 'no-live-chat' in self.get_param('compat_opts', [])):
                        del subtitles['live_chat']
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except ExtractorError:
            raise
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True)
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e)

    def __maybe_fake_ip_and_retry(self, countries):
        if (not self.get_param('geo_bypass_country', None)
                and self._GEO_BYPASS
                and self.get_param('geo_bypass', True)
                and not self._x_forwarded_for_ip
                and countries):
            country_code = random.choice(countries)
            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
            if self._x_forwarded_for_ip:
                self.report_warning(
                    'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
                    % (self._x_forwarded_for_ip, country_code.upper()))
                return True
        return False

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return compat_str(cls.__name__[:-2])

    @property
    def IE_NAME(self):
        return compat_str(type(self).__name__[:-2])

    @staticmethod
    def __can_accept_status_code(err, expected_status):
        assert isinstance(err, compat_urllib_error.HTTPError)
        if expected_status is None:
            return False
        if isinstance(expected_status, compat_integer_types):
            return err.code == expected_status
        elif isinstance(expected_status, (list, tuple)):
            return err.code in expected_status
        elif callable(expected_status):
            return expected_status(err.code) is True
        else:
            assert False

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
        """
        Return the response handle.

        See _download_webpage docstring for arguments specification.
        """
        if not self._downloader._first_webpage_request:
            sleep_interval = float_or_none(self.get_param('sleep_interval_requests')) or 0
            if sleep_interval > 0:
                self.to_screen('Sleeping %s seconds ...' % sleep_interval)
                time.sleep(sleep_interval)
        else:
            self._downloader._first_webpage_request = False

        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen('%s' % (note,))
            else:
                self.to_screen('%s: %s' % (video_id, note))

        # Some sites check X-Forwarded-For HTTP header in order to figure out
        # the origin of the client behind proxy. This allows bypassing geo
        # restriction by faking this header's value to an IP that belongs to
        # some geo unrestricted country. We will do so once we encounter any
        # geo restriction error.
        if self._x_forwarded_for_ip:
            if 'X-Forwarded-For' not in headers:
                headers['X-Forwarded-For'] = self._x_forwarded_for_ip

        if isinstance(url_or_request, compat_urllib_request.Request):
            url_or_request = update_Request(
                url_or_request, data=data, headers=headers, query=query)
        else:
            if query:
                url_or_request = update_url_query(url_or_request, query)
            if data is not None or headers:
                url_or_request = sanitized_Request(url_or_request, data, headers)
        try:
            return self._downloader.urlopen(url_or_request)
        except network_exceptions as err:
            if isinstance(err, compat_urllib_error.HTTPError):
                if self.__can_accept_status_code(err, expected_status):
                    # Retain reference to error to prevent file object from
                    # being closed before it can be read. Works around the
                    # effects of <https://bugs.python.org/issue15002>
                    # introduced in Python 3.4.1.
                    err.fp._error = err
                    return err.fp

            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
        """
        Return a tuple (page content as string, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
        if urlh is False:
            assert not fatal
            return False
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
        return (content, urlh)

    @staticmethod
    def _guess_encoding_from_content(content_type, webpage_bytes):
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'

        return encoding

    def __check_blocked(self, content):
        first_block = content[:512]
        if ('<title>Access to this site is blocked</title>' in content
                and 'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
                and 'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)

    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self.get_param('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self.get_param('write_pages', False):
            basen = '%s_%s' % (video_id, urlh.geturl())
            if len(basen) > 240:
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content

    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=5, encoding=None, data=None,
            headers={}, query={}, expected_status=None):
        """
        Return the data of the page as a string.

        Arguments:
        url_or_request -- plain text URL as a string or
            a compat_urllib_request.Request object
        video_id -- Video/playlist/item identifier (string)

        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractorError to be raised,
            otherwise a warning will be reported and extraction continued
        tries -- number of tries
        timeout -- sleep interval between tries
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows accepting failed HTTP requests (non-2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                  accept
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx)
            which are always accepted.
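
        For example, to treat an HTTP 404 response as valid content rather
        than an error, a hypothetical caller could do:

            webpage = self._download_webpage(
                url, video_id, expected_status=404)

        or accept any 4xx status:

            webpage = self._download_webpage(
                url, video_id, expected_status=lambda x: 400 <= x < 500)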
        """

        success = False
        try_count = 0
        while success is False:
            try:
                res = self._download_webpage_handle(
                    url_or_request, video_id, note, errnote, fatal,
                    encoding=encoding, data=data, headers=headers, query=query,
                    expected_status=expected_status)
                success = True
            except compat_http_client.IncompleteRead as e:
                try_count += 1
                if try_count >= tries:
                    raise e
                self._sleep(timeout, video_id)
        if res is False:
            return res
        else:
            content, _ = res
            return content

    def _download_xml_handle(
            self, url_or_request, video_id, note='Downloading XML',
            errnote='Unable to download XML', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (xml as a compat_etree_Element, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        xml_string, urlh = res
        return self._parse_xml(
            xml_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_xml(
            self, url_or_request, video_id,
            note='Downloading XML', errnote='Unable to download XML',
            transform_source=None, fatal=True, encoding=None,
            data=None, headers={}, query={}, expected_status=None):
        """
        Return the xml as a compat_etree_Element.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_xml_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            xml_string = transform_source(xml_string)
        try:
            return compat_etree_fromstring(xml_string.encode('utf-8'))
        except compat_xml_parse_error as ve:
            errmsg = '%s: Failed to parse XML ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

    def _download_json_handle(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (JSON object, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        json_string, urlh = res
        return self._parse_json(
            json_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_json(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return the JSON object as a dict.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_json_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            json_string = transform_source(json_string)
        try:
            return json.loads(json_string)
        except ValueError as ve:
            errmsg = '%s: Failed to parse JSON ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

    def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
        return self._parse_json(
            data[data.find('{'):data.rfind('}') + 1],
            video_id, transform_source, fatal)

    def _download_socket_json_handle(
            self, url_or_request, video_id, note='Polling socket',
            errnote='Unable to poll socket', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (JSON object, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        webpage, urlh = res
        return self._parse_socket_response_as_json(
            webpage, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_socket_json(
            self, url_or_request, video_id, note='Polling socket',
            errnote='Unable to poll socket', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return the JSON object as a dict.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_socket_json_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def report_warning(self, msg, video_id=None, *args, **kwargs):
        idstr = '' if video_id is None else '%s: ' % video_id
        self._downloader.report_warning(
            '[%s] %s%s' % (self.IE_NAME, idstr, msg), *args, **kwargs)

    def to_screen(self, msg, *args, **kwargs):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)

    def write_debug(self, msg, *args, **kwargs):
        self._downloader.write_debug('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)

    def get_param(self, name, default=None, *args, **kwargs):
        if self._downloader:
            return self._downloader.params.get(name, default, *args, **kwargs)
        return default

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen('%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen('%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')

    def raise_login_required(
            self, msg='This video is only available for registered users',
            metadata_available=False, method='any'):
        if metadata_available and self.get_param('ignore_no_formats_error'):
            self.report_warning(msg)
            return
        raise ExtractorError('%s. %s' % (msg, self._LOGIN_HINTS[method]), expected=True)

    def raise_geo_restricted(
            self, msg='This video is not available from your location due to geo restriction',
            countries=None, metadata_available=False):
        if metadata_available and self.get_param('ignore_no_formats_error'):
            self.report_warning(msg)
        else:
            raise GeoRestrictedError(msg, countries=countries)

    def raise_no_formats(self, msg, expected=False, video_id=None):
        if expected and self.get_param('ignore_no_formats_error'):
            self.report_warning(msg, video_id)
        else:
            raise ExtractorError(msg, expected=expected, video_id=video_id)

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None, video_title=None):
        """Returns a URL that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        if video_title is not None:
            video_info['title'] = video_title
        return video_info

    def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
        urls = orderedSet(
            self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
            for m in matches)
        return self.playlist_result(
            urls, playlist_id=playlist_id, playlist_title=playlist_title)

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        video_info.update(kwargs)
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        if playlist_description is not None:
            video_info['description'] = playlist_description
        return video_info

    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single pattern or
        a list of patterns, returning the first matching group.
        On failure, return a default value, report a warning, or raise a
        RegexNotFoundError, depending on fatal and default, naming the
        searched-for field with the given name.
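
        For example (hypothetical pattern and page content):

            video_id = self._search_regex(
                r'data-video-id="([0-9]+)"', webpage, 'video id', default=None)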
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        if not self.get_param('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
            _name = '\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            if group is None:
                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
            else:
                return mobj.group(group)
        elif default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract %s' % _name)
        else:
            self.report_warning('unable to extract %s' % _name + bug_reports_message())
            return None

    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_netrc_login_info(self, netrc_machine=None):
        username = None
        password = None
        netrc_machine = netrc_machine or self._NETRC_MACHINE

        if self.get_param('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(netrc_machine)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError(
                        'No authenticators for %s' % netrc_machine)
            except (IOError, netrc.NetrcParseError) as err:
                self.report_warning(
                    'parsing .netrc: %s' % error_to_compat_str(err))

        return username, password

    def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
        """
        Get the login info as (username, password).
        First look for the manually specified credentials using username_option
        and password_option as keys in the params dictionary. If no such
        credentials are available, look in the netrc file using the
        netrc_machine or _NETRC_MACHINE value.
        If there's no info available, return (None, None)
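
        A matching entry in the ~/.netrc file would look like (placeholder
        credentials):

            machine <netrc_machine> login myusername password mypassword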
        """

        # Attempt to use provided username and password or .netrc data
        username = self.get_param(username_option)
        if username is not None:
            password = self.get_param(password_option)
        else:
            username, password = self._get_netrc_login_info(netrc_machine)

        return username, password

    def _get_tfa_info(self, note='two-factor verification code'):
        """
        Get the two-factor authentication info.
        TODO - asking the user will be required for sms/phone verify;
        currently it just uses the command line option.
        If there's no info available, return None
        """

        tfa = self.get_param('twofactor')
        if tfa is not None:
            return tfa

        return compat_getpass('Type %s and press [Return]: ' % note)

    # Helper functions for extracting OpenGraph info
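    # These helpers match OpenGraph markup such as (illustrative example):
    #   <meta property="og:title" content="Video title" />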
    @staticmethod
    def _og_regexes(prop):
        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
        property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
                       % {'prop': re.escape(prop)})
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    @staticmethod
    def _meta_regex(prop):
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        if not isinstance(prop, (list, tuple)):
            prop = [prop]
        if name is None:
            name = 'OpenGraph %s' % prop[0]
        og_regexes = []
        for p in prop:
            og_regexes.extend(self._og_regexes(p))
        escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _og_search_url(self, html, **kargs):
        return self._og_search_property('url', html, **kargs)

    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        if not isinstance(name, (list, tuple)):
            name = [name]
        if display_name is None:
            display_name = name[0]
        return self._html_search_regex(
            [self._meta_regex(n) for n in name],
            html, display_name, fatal=fatal, group='content', **kwargs)

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower())

    def _family_friendly_search(self, html):
        # See http://schema.org/VideoObject
        family_friendly = self._html_search_meta(
            'isFamilyFriendly', html, default=None)

        if not family_friendly:
            return None

        RATING_TABLE = {
            '1': 0,
            'true': 0,
            '0': 18,
            'false': 18,
        }
        return RATING_TABLE.get(family_friendly.lower())

    def _twitter_search_player(self, html):
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')

    def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
        json_ld_list = list(re.finditer(JSON_LD_RE, html))
        default = kwargs.get('default', NO_DEFAULT)
        # JSON-LD may be malformed and thus `fatal` should be respected.
        # At the same time `default` may be passed that assumes `fatal=False`
        # for _search_regex. Let's simulate the same behavior here as well.
        fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
        json_ld = []
        for mobj in json_ld_list:
            json_ld_item = self._parse_json(
                mobj.group('json_ld'), video_id, fatal=fatal)
            if not json_ld_item:
                continue
            if isinstance(json_ld_item, dict):
                json_ld.append(json_ld_item)
            elif isinstance(json_ld_item, (list, tuple)):
                json_ld.extend(json_ld_item)
        if json_ld:
            json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
        if json_ld:
            return json_ld
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract JSON-LD')
        else:
            self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
            return {}

    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            json_ld = [json_ld]

        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def extract_interaction_type(e):
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            interaction_statistic = e.get('interactionStatistic')
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not isinstance(is_e, dict):
                    continue
                if is_e.get('@type') != 'InteractionCounter':
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For the interaction count some sites provide a string instead
                # of an integer (as per the spec), with non-digit characters
                # (e.g. ","), so extract the count with the more relaxed
                # str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_video_object(e):
            assert e['@type'] == 'VideoObject'
            author = e.get('author')
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of the 'Organization' or 'Person'
                # types. Both types can have a 'name' property (inherited from
                # the 'Thing' type) [1]; however, some websites use the 'Text'
                # type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
                'filesize': float_or_none(e.get('contentSize')),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })
            extract_interaction_statistic(e)

        for e in json_ld:
            if '@context' in e:
                item_type = e.get('@type')
                if expected_type is not None and expected_type != item_type:
                    continue
                if item_type in ('TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type == 'Movie':
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif item_type in ('Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody')),
                    })
                elif item_type == 'VideoObject':
                    extract_video_object(e)
                    if expected_type is None:
                        continue
                    else:
                        break
                video = e.get('video')
                if isinstance(video, dict) and video.get('@type') == 'VideoObject':
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break
        return dict((k, v) for k, v in info.items() if v is not None)

    @staticmethod
    def _hidden_inputs(html):
        html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
        hidden_inputs = {}
        for input in re.findall(r'(?i)(<input[^>]+>)', html):
            attrs = extract_attributes(input)
            if not attrs:
                continue
            if attrs.get('type') not in ('hidden', 'submit'):
                continue
            name = attrs.get('name') or attrs.get('id')
            value = attrs.get('value')
            if name and value is not None:
                hidden_inputs[name] = value
        return hidden_inputs

    def _form_hidden_inputs(self, form_id, html):
        form = self._search_regex(
            r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
            html, '%s form' % form_id, group='form')
        return self._hidden_inputs(form)

    class FormatSort:
        regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
1475
1476 default = ('hidden', 'hasvid', 'ie_pref', 'lang', 'quality',
1477 'res', 'fps', 'codec:vp9.2', 'size', 'br', 'asr',
1478 'proto', 'ext', 'hasaud', 'source', 'format_id') # These must not be aliases
1479 ytdl_default = ('hasaud', 'quality', 'tbr', 'filesize', 'vbr',
1480 'height', 'width', 'proto', 'vext', 'abr', 'aext',
1481 'fps', 'fs_approx', 'source', 'format_id')
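# Illustrative sort strings accepted by the regex above (values made up):
#   'res:1080'  - prefer resolutions up to 1080p (':' sets a limit)
#   'br~2000'   - prefer the bitrate closest to 2000 ('~' means closest-to)
#   '+size'     - prefer smaller files ('+' reverses the order)
# e.g. yt-dlp -S 'res:1080,+size' on the command line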
1482
1483 settings = {
1484 'vcodec': {'type': 'ordered', 'regex': True,
1485 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1486 'acodec': {'type': 'ordered', 'regex': True,
1487 'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']},
1488 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1489 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', 'm3u8', '.*dash', '', 'mms|rtsp', 'none', 'f4']},
1490 'vext': {'type': 'ordered', 'field': 'video_ext',
1491 'order': ('mp4', 'webm', 'flv', '', 'none'),
1492 'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1493 'aext': {'type': 'ordered', 'field': 'audio_ext',
1494 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1495 'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1496 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1497 'ie_pref': {'priority': True, 'type': 'extractor'},
1498 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1499 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1500 'lang': {'priority': True, 'convert': 'ignore', 'field': 'language_preference'},
1501 'quality': {'convert': 'float_none', 'default': -1},
1502 'filesize': {'convert': 'bytes'},
1503 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1504 'id': {'convert': 'string', 'field': 'format_id'},
1505 'height': {'convert': 'float_none'},
1506 'width': {'convert': 'float_none'},
1507 'fps': {'convert': 'float_none'},
1508 'tbr': {'convert': 'float_none'},
1509 'vbr': {'convert': 'float_none'},
1510 'abr': {'convert': 'float_none'},
1511 'asr': {'convert': 'float_none'},
1512 'source': {'convert': 'ignore', 'field': 'source_preference'},
1513
1514 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1515 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1516 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1517 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1518 'res': {'type': 'multiple', 'field': ('height', 'width'), 'function': min},
1519
1520 # Most of these exist only for compatibility reasons
1521 'dimension': {'type': 'alias', 'field': 'res'},
1522 'resolution': {'type': 'alias', 'field': 'res'},
1523 'extension': {'type': 'alias', 'field': 'ext'},
1524 'bitrate': {'type': 'alias', 'field': 'br'},
1525 'total_bitrate': {'type': 'alias', 'field': 'tbr'},
1526 'video_bitrate': {'type': 'alias', 'field': 'vbr'},
1527 'audio_bitrate': {'type': 'alias', 'field': 'abr'},
1528 'framerate': {'type': 'alias', 'field': 'fps'},
1529 'language_preference': {'type': 'alias', 'field': 'lang'}, # not named as 'language' because such a field exists
1530 'protocol': {'type': 'alias', 'field': 'proto'},
1531 'source_preference': {'type': 'alias', 'field': 'source'},
1532 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1533 'filesize_estimate': {'type': 'alias', 'field': 'size'},
1534 'samplerate': {'type': 'alias', 'field': 'asr'},
1535 'video_ext': {'type': 'alias', 'field': 'vext'},
1536 'audio_ext': {'type': 'alias', 'field': 'aext'},
1537 'video_codec': {'type': 'alias', 'field': 'vcodec'},
1538 'audio_codec': {'type': 'alias', 'field': 'acodec'},
1539 'video': {'type': 'alias', 'field': 'hasvid'},
1540 'has_video': {'type': 'alias', 'field': 'hasvid'},
1541 'audio': {'type': 'alias', 'field': 'hasaud'},
1542 'has_audio': {'type': 'alias', 'field': 'hasaud'},
1543 'extractor': {'type': 'alias', 'field': 'ie_pref'},
1544 'preference': {'type': 'alias', 'field': 'ie_pref'},
1545 'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
1546 'format_id': {'type': 'alias', 'field': 'id'},
1547 }
1548
1549 _order = []
1550
1551 def _get_field_setting(self, field, key):
1552 if field not in self.settings:
1553 self.settings[field] = {}
1554 propObj = self.settings[field]
1555 if key not in propObj:
1556 type = propObj.get('type')
1557 if key == 'field':
1558 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1559 elif key == 'convert':
1560 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1561 else:
1562 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,), 'function': max}.get(key, None)
1563 propObj[key] = default
1564 return propObj[key]
1565
1566 def _resolve_field_value(self, field, value, convertNone=False):
1567 if value is None:
1568 if not convertNone:
1569 return None
1570 else:
1571 value = value.lower()
1572 conversion = self._get_field_setting(field, 'convert')
1573 if conversion == 'ignore':
1574 return None
1575 if conversion == 'string':
1576 return value
1577 elif conversion == 'float_none':
1578 return float_or_none(value)
1579 elif conversion == 'bytes':
1580 return FileDownloader.parse_bytes(value)
1581 elif conversion == 'order':
1582 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1583 use_regex = self._get_field_setting(field, 'regex')
1584 list_length = len(order_list)
1585 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1586 if use_regex and value is not None:
1587 for i, regex in enumerate(order_list):
1588 if regex and re.match(regex, value):
1589 return list_length - i
1590 return list_length - empty_pos # not in list
1591 else: # not regex or value = None
1592 return list_length - (order_list.index(value) if value in order_list else empty_pos)
1593 else:
1594 if value.isnumeric():
1595 return float(value)
1596 else:
1597 self.settings[field]['convert'] = 'string'
1598 return value
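# Illustrative ranking (not part of the module): with the 'vcodec' order
# list above, the 'order' conversion returns larger numbers for better codecs:
#   _resolve_field_value('vcodec', 'vp9')  -> matched by 'vp0?9'
#   _resolve_field_value('vcodec', 'h264') -> matched by '[hx]264|avc', ranked lower
#   _resolve_field_value('vcodec', 'xyz')  -> no match, ranked at the '' position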
1599
1600 def evaluate_params(self, params, sort_extractor):
1601 self._use_free_order = params.get('prefer_free_formats', False)
1602 self._sort_user = params.get('format_sort', [])
1603 self._sort_extractor = sort_extractor
1604
1605 def add_item(field, reverse, closest, limit_text):
1606 field = field.lower()
1607 if field in self._order:
1608 return
1609 self._order.append(field)
1610 limit = self._resolve_field_value(field, limit_text)
1611 data = {
1612 'reverse': reverse,
1613 'closest': False if limit is None else closest,
1614 'limit_text': limit_text,
1615 'limit': limit}
1616 if field in self.settings:
1617 self.settings[field].update(data)
1618 else:
1619 self.settings[field] = data
1620
1621 sort_list = (
1622 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1623 + (tuple() if params.get('format_sort_force', False)
1624 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1625 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1626
1627 for item in sort_list:
1628 match = re.match(self.regex, item)
1629 if match is None:
1630 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1631 field = match.group('field')
1632 if field is None:
1633 continue
1634 if self._get_field_setting(field, 'type') == 'alias':
1635 field = self._get_field_setting(field, 'field')
1636 reverse = match.group('reverse') is not None
1637 closest = match.group('separator') == '~'
1638 limit_text = match.group('limit')
1639
1640 has_limit = limit_text is not None
1641 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1642 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1643
1644 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1645 limits = limit_text.split(":") if has_multiple_limits else (limit_text,) if has_limit else tuple()
1646 limit_count = len(limits)
1647 for (i, f) in enumerate(fields):
1648 add_item(f, reverse, closest,
1649 limits[i] if i < limit_count
1650 else limits[0] if has_limit and not has_multiple_limits
1651 else None)
1652
1653 def print_verbose_info(self, write_debug):
1654 if self._sort_user:
1655 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1656 if self._sort_extractor:
1657 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1658 write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1659 '+' if self._get_field_setting(field, 'reverse') else '', field,
1660 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1661 self._get_field_setting(field, 'limit_text'),
1662 self._get_field_setting(field, 'limit'))
1663 if self._get_field_setting(field, 'limit_text') is not None else '')
1664 for field in self._order if self._get_field_setting(field, 'visible')]))
1665
1666 def _calculate_field_preference_from_value(self, format, field, type, value):
1667 reverse = self._get_field_setting(field, 'reverse')
1668 closest = self._get_field_setting(field, 'closest')
1669 limit = self._get_field_setting(field, 'limit')
1670
1671 if type == 'extractor':
1672 maximum = self._get_field_setting(field, 'max')
1673 if value is None or (maximum is not None and value >= maximum):
1674 value = -1
1675 elif type == 'boolean':
1676 in_list = self._get_field_setting(field, 'in_list')
1677 not_in_list = self._get_field_setting(field, 'not_in_list')
1678 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1679 elif type == 'ordered':
1680 value = self._resolve_field_value(field, value, True)
1681
1682 # try to convert to number
1683 val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1684 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1685 if is_num:
1686 value = val_num
1687
1688 return ((-10, 0) if value is None
1689 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
1690 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1691 else (0, value, 0) if not reverse and (limit is None or value <= limit)
1692 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1693 else (-1, value, 0))
1694
1695 def _calculate_field_preference(self, format, field):
1696 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
1697 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1698 if type == 'multiple':
1699 type = 'field' # Only 'field' is allowed in multiple for now
1700 actual_fields = self._get_field_setting(field, 'field')
1701
1702 def wrapped_function(values):
1703 values = tuple(filter(lambda x: x is not None, values))
1704 return (self._get_field_setting(field, 'function')(*values) if len(values) > 1
1705 else values[0] if values
1706 else None)
1707
1708 value = wrapped_function((get_value(f) for f in actual_fields))
1709 else:
1710 value = get_value(field)
1711 return self._calculate_field_preference_from_value(format, field, type, value)
1712
1713 def calculate_preference(self, format):
1714 # Determine missing protocol
1715 if not format.get('protocol'):
1716 format['protocol'] = determine_protocol(format)
1717
1718 # Determine missing ext
1719 if not format.get('ext') and 'url' in format:
1720 format['ext'] = determine_ext(format['url'])
1721 if format.get('vcodec') == 'none':
1722 format['audio_ext'] = format['ext']
1723 format['video_ext'] = 'none'
1724 else:
1725 format['video_ext'] = format['ext']
1726 format['audio_ext'] = 'none'
1727 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
1728 # format['preference'] = -1000
1729
1730 # Determine missing bitrates
1731 if format.get('tbr') is None:
1732 if format.get('vbr') is not None and format.get('abr') is not None:
1733 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1734 else:
1735 if format.get('vcodec') != "none" and format.get('vbr') is None:
1736 format['vbr'] = format.get('tbr') - format.get('abr', 0)
1737 if format.get('acodec') != "none" and format.get('abr') is None:
1738 format['abr'] = format.get('tbr') - format.get('vbr', 0)
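# e.g. vbr=1000 with abr=128 yields tbr=1128 above; conversely, a format
# with tbr=1128 and abr=128 gets vbr=1000 derived here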
1739
1740 return tuple(self._calculate_field_preference(format, field) for field in self._order)
1741
1742 def _sort_formats(self, formats, field_preference=[]):
1743 if not formats:
1744 if self.get_param('ignore_no_formats_error'):
1745 return
1746 raise ExtractorError('No video formats found')
1747 format_sort = self.FormatSort() # params and to_screen are taken from the downloader
1748 format_sort.evaluate_params(self._downloader.params, field_preference)
1749 if self.get_param('verbose', False):
1750 format_sort.print_verbose_info(self._downloader.write_debug)
1751 formats.sort(key=lambda f: format_sort.calculate_preference(f))
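# Illustrative usage from a hypothetical extractor (names made up):
#   formats = [{'url': src['file'], 'height': src.get('height')}
#              for src in sources]
#   self._sort_formats(formats)
# Sorting is ascending, so the best format ends up last, matching the
# "worst to best" ordering expected for the formats list.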
1752
1753 def _check_formats(self, formats, video_id):
1754 if formats:
1755 formats[:] = filter(
1756 lambda f: self._is_valid_url(
1757 f['url'], video_id,
1758 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1759 formats)
1760
1761 @staticmethod
1762 def _remove_duplicate_formats(formats):
1763 format_urls = set()
1764 unique_formats = []
1765 for f in formats:
1766 if f['url'] not in format_urls:
1767 format_urls.add(f['url'])
1768 unique_formats.append(f)
1769 formats[:] = unique_formats
1770
1771 def _is_valid_url(self, url, video_id, item='video', headers={}):
1772 url = self._proto_relative_url(url, scheme='http:')
1773 # For now, assume non-HTTP(S) URLs are always valid
1774 if not (url.startswith('http://') or url.startswith('https://')):
1775 return True
1776 try:
1777 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1778 return True
1779 except ExtractorError as e:
1780 self.to_screen(
1781 '%s: %s URL is invalid, skipping: %s'
1782 % (video_id, item, error_to_compat_str(e.cause)))
1783 return False
1784
1785 def http_scheme(self):
1786 """ Either "http:" or "https:", depending on the user's preferences """
1787 return (
1788 'http:'
1789 if self.get_param('prefer_insecure', False)
1790 else 'https:')
1791
1792 def _proto_relative_url(self, url, scheme=None):
1793 if url is None:
1794 return url
1795 if url.startswith('//'):
1796 if scheme is None:
1797 scheme = self.http_scheme()
1798 return scheme + url
1799 else:
1800 return url
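# Illustrative behaviour (made-up URLs), assuming default settings:
#   _proto_relative_url('//cdn.example.com/v.mp4')           -> 'https://cdn.example.com/v.mp4'
#   _proto_relative_url('//cdn.example.com/v.mp4', 'http:')  -> 'http://cdn.example.com/v.mp4'
#   _proto_relative_url('https://a/b.mp4')                   -> unchanged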
1801
1802 def _sleep(self, timeout, video_id, msg_template=None):
1803 if msg_template is None:
1804 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1805 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1806 self.to_screen(msg)
1807 time.sleep(timeout)
1808
1809 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1810 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1811 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1812 manifest = self._download_xml(
1813 manifest_url, video_id, 'Downloading f4m manifest',
1814 'Unable to download f4m manifest',
1815 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1816 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1817 transform_source=transform_source,
1818 fatal=fatal, data=data, headers=headers, query=query)
1819
1820 if manifest is False:
1821 return []
1822
1823 return self._parse_f4m_formats(
1824 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1825 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1826
1827 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1828 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1829 fatal=True, m3u8_id=None):
1830 if not isinstance(manifest, compat_etree_Element) and not fatal:
1831 return []
1832
1833 # Currently, yt-dlp cannot decode the playerVerificationChallenge, as Akamai uses Adobe Alchemy
1834 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1835 if akamai_pv is not None and ';' in akamai_pv.text:
1836 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1837 if playerVerificationChallenge.strip() != '':
1838 return []
1839
1840 formats = []
1841 manifest_version = '1.0'
1842 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1843 if not media_nodes:
1844 manifest_version = '2.0'
1845 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1846 # Remove unsupported DRM protected media from final formats
1847 # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1848 media_nodes = remove_encrypted_media(media_nodes)
1849 if not media_nodes:
1850 return formats
1851
1852 manifest_base_url = get_base_url(manifest)
1853
1854 bootstrap_info = xpath_element(
1855 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1856 'bootstrap info', default=None)
1857
1858 vcodec = None
1859 mime_type = xpath_text(
1860 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1861 'mime type', default=None)
1862 if mime_type and mime_type.startswith('audio/'):
1863 vcodec = 'none'
1864
1865 for i, media_el in enumerate(media_nodes):
1866 tbr = int_or_none(media_el.attrib.get('bitrate'))
1867 width = int_or_none(media_el.attrib.get('width'))
1868 height = int_or_none(media_el.attrib.get('height'))
1869 format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1870 # If <bootstrapInfo> is present, the specified f4m is a
1871 # stream-level manifest, and only set-level manifests may refer to
1872 # external resources. See section 11.4 and section 4 of F4M spec
1873 if bootstrap_info is None:
1874 media_url = None
1875 # @href is introduced in 2.0, see section 11.6 of F4M spec
1876 if manifest_version == '2.0':
1877 media_url = media_el.attrib.get('href')
1878 if media_url is None:
1879 media_url = media_el.attrib.get('url')
1880 if not media_url:
1881 continue
1882 manifest_url = (
1883 media_url if media_url.startswith('http://') or media_url.startswith('https://')
1884 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1885 # If media_url is itself an f4m manifest, do the recursive extraction,
1886 # since bitrates in the parent manifest (this one) and the media_url
1887 # manifest may differ, making it impossible to resolve the format by the
1888 # requested bitrate in the f4m downloader
1889 ext = determine_ext(manifest_url)
1890 if ext == 'f4m':
1891 f4m_formats = self._extract_f4m_formats(
1892 manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1893 transform_source=transform_source, fatal=fatal)
1894 # Sometimes a stream-level manifest contains a single media entry that
1895 # does not carry any quality metadata (e.g. http://matchtv.ru/#live-player).
1896 # At the same time, the parent's media entry in the set-level manifest
1897 # may contain it, so we copy it from the parent in such cases.
1898 if len(f4m_formats) == 1:
1899 f = f4m_formats[0]
1900 f.update({
1901 'tbr': f.get('tbr') or tbr,
1902 'width': f.get('width') or width,
1903 'height': f.get('height') or height,
1904 'format_id': f.get('format_id') if not tbr else format_id,
1905 'vcodec': vcodec,
1906 })
1907 formats.extend(f4m_formats)
1908 continue
1909 elif ext == 'm3u8':
1910 formats.extend(self._extract_m3u8_formats(
1911 manifest_url, video_id, 'mp4', preference=preference,
1912 quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1913 continue
1914 formats.append({
1915 'format_id': format_id,
1916 'url': manifest_url,
1917 'manifest_url': manifest_url,
1918 'ext': 'flv' if bootstrap_info is not None else None,
1919 'protocol': 'f4m',
1920 'tbr': tbr,
1921 'width': width,
1922 'height': height,
1923 'vcodec': vcodec,
1924 'preference': preference,
1925 'quality': quality,
1926 })
1927 return formats
1928
1929 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1930 return {
1931 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1932 'url': m3u8_url,
1933 'ext': ext,
1934 'protocol': 'm3u8',
1935 'preference': preference - 100 if preference else -100,
1936 'quality': quality,
1937 'resolution': 'multiple',
1938 'format_note': 'Quality selection URL',
1939 }
1940
1941 def _extract_m3u8_formats(self, *args, **kwargs):
1942 fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
1943 if subs:
1944 self.report_warning(bug_reports_message(
1945 "Ignoring subtitle tracks found in the HLS manifest; "
1946 "if any subtitle tracks are missing,"
1947 ))
1948 return fmts
1949
1950 def _extract_m3u8_formats_and_subtitles(
1951 self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
1952 preference=None, quality=None, m3u8_id=None, note=None,
1953 errnote=None, fatal=True, live=False, data=None, headers={},
1954 query={}):
1955
1956 res = self._download_webpage_handle(
1957 m3u8_url, video_id,
1958 note='Downloading m3u8 information' if note is None else note,
1959 errnote='Failed to download m3u8 information' if errnote is None else errnote,
1960 fatal=fatal, data=data, headers=headers, query=query)
1961
1962 if res is False:
1963 return [], {}
1964
1965 m3u8_doc, urlh = res
1966 m3u8_url = urlh.geturl()
1967
1968 return self._parse_m3u8_formats_and_subtitles(
1969 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1970 preference=preference, quality=quality, m3u8_id=m3u8_id,
1971 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
1972 headers=headers, query=query, video_id=video_id)
1973
1974 def _parse_m3u8_formats_and_subtitles(
1975 self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8_native',
1976 preference=None, quality=None, m3u8_id=None, live=False, note=None,
1977 errnote=None, fatal=True, data=None, headers={}, query={},
1978 video_id=None):
1979
1980 if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
1981 return [], {}
1982
1983 if (not self.get_param('allow_unplayable_formats')
1984 and re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc)): # Apple FairPlay
1985 return [], {}
1986
1987 formats = []
1988
1989 subtitles = {}
1990
1991 format_url = lambda u: (
1992 u
1993 if re.match(r'^https?://', u)
1994 else compat_urlparse.urljoin(m3u8_url, u))
1995
1996 split_discontinuity = self.get_param('hls_split_discontinuity', False)
1997
1998 # References:
1999 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2000 # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2001 # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2002
2003 # We should try extracting formats only from master playlists [1, 4.3.4],
2004 # i.e. playlists that describe the available qualities. On the other hand,
2005 # media playlists [1, 4.3.3] should be returned as is since they contain
2006 # just the media without quality renditions.
2007 # Fortunately, a master playlist can easily be distinguished from a media
2008 # playlist based on the availability of particular tags. As per [1, 4.3.3, 4.3.4],
2009 # master playlist tags MUST NOT appear in a media playlist and vice versa.
2010 # As per [1, 4.3.3.1], the #EXT-X-TARGETDURATION tag is REQUIRED for every
2011 # media playlist and MUST NOT appear in a master playlist, thus we can
2012 # reliably detect a media playlist with this criterion.
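# Illustrative sketch (made-up playlists): a master playlist looks like
#   #EXTM3U
#   #EXT-X-STREAM-INF:BANDWIDTH=1280000,RESOLUTION=640x360
#   low/index.m3u8
# while a media playlist carries the segments themselves:
#   #EXTM3U
#   #EXT-X-TARGETDURATION:10
#   #EXTINF:9.009,
#   seg0.ts
# hence the #EXT-X-TARGETDURATION check below.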
2013
2014 def _extract_m3u8_playlist_formats(format_url=None, m3u8_doc=None, video_id=None,
2015 fatal=True, data=None, headers={}):
2016 if not m3u8_doc:
2017 if not format_url:
2018 return []
2019 res = self._download_webpage_handle(
2020 format_url, video_id,
2021 note=False,
2022 errnote='Failed to download m3u8 playlist information',
2023 fatal=fatal, data=data, headers=headers)
2024
2025 if res is False:
2026 return []
2027
2028 m3u8_doc, urlh = res
2029 format_url = urlh.geturl()
2030
2031 playlist_formats = []
2032 i = (
2033 0
2034 if split_discontinuity
2035 else None)
2036 format_info = {
2037 'index': i,
2038 'key_data': None,
2039 'files': [],
2040 }
2041 for line in m3u8_doc.splitlines():
2042 if not line.startswith('#'):
2043 format_info['files'].append(line)
2044 elif split_discontinuity and line.startswith('#EXT-X-DISCONTINUITY'):
2045 i += 1
2046 playlist_formats.append(format_info)
2047 format_info = {
2048 'index': i,
2049 'url': format_url,
2050 'files': [],
2051 }
2052 playlist_formats.append(format_info)
2053 return playlist_formats
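# Illustrative sketch (made-up playlist): with hls_split_discontinuity
# enabled, a media playlist containing
#   seg0.ts
#   seg1.ts
#   #EXT-X-DISCONTINUITY
#   seg2.ts
# yields {'index': 0, 'files': ['seg0.ts', 'seg1.ts'], ...} and
# {'index': 1, 'files': ['seg2.ts'], ...}, each becoming its own format.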
2054
2055 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
2056
2057 playlist_formats = _extract_m3u8_playlist_formats(m3u8_doc=m3u8_doc)
2058
2059 for format in playlist_formats:
2060 format_id = []
2061 if m3u8_id:
2062 format_id.append(m3u8_id)
2063 format_index = format.get('index')
2064 if format_index:
2065 format_id.append(str(format_index))
2066 f = {
2067 'format_id': '-'.join(format_id),
2068 'format_index': format_index,
2069 'url': m3u8_url,
2070 'ext': ext,
2071 'protocol': entry_protocol,
2072 'preference': preference,
2073 'quality': quality,
2074 }
2075 formats.append(f)
2076
2077 return formats, subtitles
2078
2079 groups = {}
2080 last_stream_inf = {}
2081
2082 def extract_media(x_media_line):
2083 media = parse_m3u8_attributes(x_media_line)
2084 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
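# e.g. #EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="aud",NAME="English",
#      LANGUAGE="en",URI="audio-en.m3u8"  (illustrative line, made up)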
2085 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2086 if not (media_type and group_id and name):
2087 return
2088 groups.setdefault(group_id, []).append(media)
2089 # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2090 if media_type == 'SUBTITLES':
2091 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2092 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2093 # However, lack of URI has been spotted in the wild.
2094 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2095 if not media.get('URI'):
2096 return
2097 url = format_url(media['URI'])
2098 sub_info = {
2099 'url': url,
2100 'ext': determine_ext(url),
2101 }
2102 if sub_info['ext'] == 'm3u8':
2103 # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2104 # files may contain is WebVTT:
2105 # <https://tools.ietf.org/html/rfc8216#section-3.1>
2106 sub_info['ext'] = 'vtt'
2107 sub_info['protocol'] = 'm3u8_native'
2108 lang = media.get('LANGUAGE') or 'und'
2109 subtitles.setdefault(lang, []).append(sub_info)
2110 if media_type not in ('VIDEO', 'AUDIO'):
2111 return
2112 media_url = media.get('URI')
2113 if media_url:
2114 manifest_url = format_url(media_url)
2115 format_id = []
2116 playlist_formats = _extract_m3u8_playlist_formats(manifest_url, video_id=video_id,
2117 fatal=fatal, data=data, headers=headers)
2118
2119 for format in playlist_formats:
2120 format_index = format.get('index')
2121 for v in (m3u8_id, group_id, name):
2122 if v:
2123 format_id.append(v)
2124 if format_index:
2125 format_id.append(str(format_index))
2126 f = {
2127 'format_id': '-'.join(format_id),
2128 'format_index': format_index,
2129 'url': manifest_url,
2130 'manifest_url': m3u8_url,
2131 'language': media.get('LANGUAGE'),
2132 'ext': ext,
2133 'protocol': entry_protocol,
2134 'preference': preference,
2135 'quality': quality,
2136 }
2137 if media_type == 'AUDIO':
2138 f['vcodec'] = 'none'
2139 formats.append(f)
2140
2141 def build_stream_name():
2142 # Although the specification does not mention the NAME attribute for
2143 # the EXT-X-STREAM-INF tag, it may still sometimes be present (see [1]
2144 # or the vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2145 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2146 stream_name = last_stream_inf.get('NAME')
2147 if stream_name:
2148 return stream_name
2149 # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2150 # from corresponding rendition group
2151 stream_group_id = last_stream_inf.get('VIDEO')
2152 if not stream_group_id:
2153 return
2154 stream_group = groups.get(stream_group_id)
2155 if not stream_group:
2156 return stream_group_id
2157 rendition = stream_group[0]
2158 return rendition.get('NAME') or stream_group_id
2159
2160 # Parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have a
2161 # chance to detect video-only formats when EXT-X-STREAM-INF tags
2162 # precede EXT-X-MEDIA tags in an HLS manifest such as [3].
2163 for line in m3u8_doc.splitlines():
2164 if line.startswith('#EXT-X-MEDIA:'):
2165 extract_media(line)
2166
2167 for line in m3u8_doc.splitlines():
2168 if line.startswith('#EXT-X-STREAM-INF:'):
2169 last_stream_inf = parse_m3u8_attributes(line)
2170 elif line.startswith('#') or not line.strip():
2171 continue
2172 else:
2173 tbr = float_or_none(
2174 last_stream_inf.get('AVERAGE-BANDWIDTH')
2175 or last_stream_inf.get('BANDWIDTH'), scale=1000)
2176 manifest_url = format_url(line.strip())
2177
2178 playlist_formats = _extract_m3u8_playlist_formats(manifest_url, video_id=video_id,
2179 fatal=fatal, data=data, headers=headers)
2180
2181 for frmt in playlist_formats:
2182 format_id = []
2183 if m3u8_id:
2184 format_id.append(m3u8_id)
2185 format_index = frmt.get('index')
2186 stream_name = build_stream_name()
2187 # The bandwidth of live streams may differ over time, making
2188 # format_id unpredictable, so it's better to keep the provided
2189 # format_id intact.
2190 if not live:
2191 format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
2192 if format_index:
2193 format_id.append(str(format_index))
2194 f = {
2195 'format_id': '-'.join(format_id),
2196 'format_index': format_index,
2197 'url': manifest_url,
2198 'manifest_url': m3u8_url,
2199 'tbr': tbr,
2200 'ext': ext,
2201 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2202 'protocol': entry_protocol,
2203 'preference': preference,
2204 'quality': quality,
2205 }
2206 resolution = last_stream_inf.get('RESOLUTION')
2207 if resolution:
2208 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2209 if mobj:
2210 f['width'] = int(mobj.group('width'))
2211 f['height'] = int(mobj.group('height'))
2212 # Unified Streaming Platform
2213 mobj = re.search(
2214 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
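# e.g. '.../video.ism/manifest(format=m3u8-aapl,audio=128000-video=800000).m3u8'
# (made-up URL) would yield abr=128 and vbr=800 below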
2215 if mobj:
2216 abr, vbr = mobj.groups()
2217 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2218 f.update({
2219 'vbr': vbr,
2220 'abr': abr,
2221 })
2222 codecs = parse_codecs(last_stream_inf.get('CODECS'))
2223 f.update(codecs)
2224 audio_group_id = last_stream_inf.get('AUDIO')
2225 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2226 # references a rendition group MUST have a CODECS attribute.
2227 # However, this is not always respected, for example, [2]
2228 # contains EXT-X-STREAM-INF tag which references AUDIO
2229 # rendition group but does not have CODECS and despite
2230 # referencing an audio group it represents a complete
2231 # (with audio and video) format. So, for such cases we will
2232 # ignore references to rendition groups and treat them
2233 # as complete formats.
2234 if audio_group_id and codecs and f.get('vcodec') != 'none':
2235 audio_group = groups.get(audio_group_id)
2236 if audio_group and audio_group[0].get('URI'):
2237 # TODO: update acodec for audio only formats with
2238 # the same GROUP-ID
2239 f['acodec'] = 'none'
2240 if not f.get('ext'):
2241 f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2242 formats.append(f)
2243
2244 # for DailyMotion
2245 progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2246 if progressive_uri:
2247 http_f = f.copy()
2248 del http_f['manifest_url']
2249 http_f.update({
2250 'format_id': f['format_id'].replace('hls-', 'http-'),
2251 'protocol': 'http',
2252 'url': progressive_uri,
2253 })
2254 formats.append(http_f)
2255
2256 last_stream_inf = {}
2257 return formats, subtitles
2258
2259 @staticmethod
2260 def _xpath_ns(path, namespace=None):
2261 if not namespace:
2262 return path
2263 out = []
2264 for c in path.split('/'):
2265 if not c or c == '.':
2266 out.append(c)
2267 else:
2268 out.append('{%s}%s' % (namespace, c))
2269 return '/'.join(out)
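# Illustrative behaviour (namespace made up):
#   _xpath_ns('./head/meta', 'http://www.w3.org/ns/SMIL')
#   -> './{http://www.w3.org/ns/SMIL}head/{http://www.w3.org/ns/SMIL}meta'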
2270
2271 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2272 smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2273
2274 if smil is False:
2275 assert not fatal
2276 return []
2277
2278 namespace = self._parse_smil_namespace(smil)
2279
2280 return self._parse_smil_formats(
2281 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2282
2283 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2284 smil = self._download_smil(smil_url, video_id, fatal=fatal)
2285 if smil is False:
2286 return {}
2287 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2288
2289 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2290 return self._download_xml(
2291 smil_url, video_id, 'Downloading SMIL file',
2292 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2293
2294 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2295 namespace = self._parse_smil_namespace(smil)
2296
2297 formats = self._parse_smil_formats(
2298 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2299 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2300
2301 video_id = os.path.splitext(url_basename(smil_url))[0]
2302 title = None
2303 description = None
2304 upload_date = None
2305 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2306 name = meta.attrib.get('name')
2307 content = meta.attrib.get('content')
2308 if not name or not content:
2309 continue
2310 if not title and name == 'title':
2311 title = content
2312 elif not description and name in ('description', 'abstract'):
2313 description = content
2314 elif not upload_date and name == 'date':
2315 upload_date = unified_strdate(content)
2316
2317 thumbnails = [{
2318 'id': image.get('type'),
2319 'url': image.get('src'),
2320 'width': int_or_none(image.get('width')),
2321 'height': int_or_none(image.get('height')),
2322 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2323
2324 return {
2325 'id': video_id,
2326 'title': title or video_id,
2327 'description': description,
2328 'upload_date': upload_date,
2329 'thumbnails': thumbnails,
2330 'formats': formats,
2331 'subtitles': subtitles,
2332 }
2333
2334 def _parse_smil_namespace(self, smil):
2335 return self._search_regex(
2336 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2337
2338 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2339 base = smil_url
2340 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2341 b = meta.get('base') or meta.get('httpBase')
2342 if b:
2343 base = b
2344 break
2345
2346 formats = []
2347 rtmp_count = 0
2348 http_count = 0
2349 m3u8_count = 0
2350
2351 srcs = []
2352 media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2353 for medium in media:
2354 src = medium.get('src')
2355 if not src or src in srcs:
2356 continue
2357 srcs.append(src)
2358
2359 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2360 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2361 width = int_or_none(medium.get('width'))
2362 height = int_or_none(medium.get('height'))
2363 proto = medium.get('proto')
2364 ext = medium.get('ext')
2365 src_ext = determine_ext(src)
2366 streamer = medium.get('streamer') or base
2367
2368 if proto == 'rtmp' or streamer.startswith('rtmp'):
2369 rtmp_count += 1
2370 formats.append({
2371 'url': streamer,
2372 'play_path': src,
2373 'ext': 'flv',
2374 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2375 'tbr': bitrate,
2376 'filesize': filesize,
2377 'width': width,
2378 'height': height,
2379 })
2380 if transform_rtmp_url:
2381 streamer, src = transform_rtmp_url(streamer, src)
2382 formats[-1].update({
2383 'url': streamer,
2384 'play_path': src,
2385 })
2386 continue
2387
2388 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2389 src_url = src_url.strip()
2390
2391 if proto == 'm3u8' or src_ext == 'm3u8':
2392 m3u8_formats = self._extract_m3u8_formats(
2393 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2394 if len(m3u8_formats) == 1:
2395 m3u8_count += 1
2396 m3u8_formats[0].update({
2397 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2398 'tbr': bitrate,
2399 'width': width,
2400 'height': height,
2401 })
2402 formats.extend(m3u8_formats)
2403 elif src_ext == 'f4m':
2404 f4m_url = src_url
2405 if not f4m_params:
2406 f4m_params = {
2407 'hdcore': '3.2.0',
2408 'plugin': 'flowplayer-3.2.0.1',
2409 }
2410 f4m_url += '&' if '?' in f4m_url else '?'
2411 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2412 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2413 elif src_ext == 'mpd':
2414 formats.extend(self._extract_mpd_formats(
2415 src_url, video_id, mpd_id='dash', fatal=False))
2416 elif re.search(r'\.ism/[Mm]anifest', src_url):
2417 formats.extend(self._extract_ism_formats(
2418 src_url, video_id, ism_id='mss', fatal=False))
2419 elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2420 http_count += 1
2421 formats.append({
2422 'url': src_url,
2423 'ext': ext or src_ext or 'flv',
2424 'format_id': 'http-%d' % (bitrate or http_count),
2425 'tbr': bitrate,
2426 'filesize': filesize,
2427 'width': width,
2428 'height': height,
2429 })
2430
2431 return formats
2432
2433 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2434 urls = []
2435 subtitles = {}
2436 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2437 src = textstream.get('src')
2438 if not src or src in urls:
2439 continue
2440 urls.append(src)
2441 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2442 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2443 subtitles.setdefault(lang, []).append({
2444 'url': src,
2445 'ext': ext,
2446 })
2447 return subtitles
2448
2449 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2450 xspf = self._download_xml(
2451 xspf_url, playlist_id, 'Downloading xspf playlist',
2452 'Unable to download xspf playlist', fatal=fatal)
2453 if xspf is False:
2454 return []
2455 return self._parse_xspf(
2456 xspf, playlist_id, xspf_url=xspf_url,
2457 xspf_base_url=base_url(xspf_url))
2458
2459 def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2460 NS_MAP = {
2461 'xspf': 'http://xspf.org/ns/0/',
2462 's1': 'http://static.streamone.nl/player/ns/0',
2463 }
2464
2465 entries = []
2466 for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2467 title = xpath_text(
2468 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2469 description = xpath_text(
2470 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2471 thumbnail = xpath_text(
2472 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2473 duration = float_or_none(
2474 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2475
2476 formats = []
2477 for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2478 format_url = urljoin(xspf_base_url, location.text)
2479 if not format_url:
2480 continue
2481 formats.append({
2482 'url': format_url,
2483 'manifest_url': xspf_url,
2484 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2485 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2486 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2487 })
2488 self._sort_formats(formats)
2489
2490 entries.append({
2491 'id': playlist_id,
2492 'title': title,
2493 'description': description,
2494 'thumbnail': thumbnail,
2495 'duration': duration,
2496 'formats': formats,
2497 })
2498 return entries
2499
2500 def _extract_mpd_formats(self, *args, **kwargs):
2501 fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2502 if subs:
2503 self.report_warning(bug_reports_message(
2504 "Ignoring subtitle tracks found in the DASH manifest; "
2505 "if any subtitle tracks are missing,"
2506 ))
2507 return fmts
2508
2509 def _extract_mpd_formats_and_subtitles(
2510 self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2511 fatal=True, data=None, headers={}, query={}):
2512 res = self._download_xml_handle(
2513 mpd_url, video_id,
2514 note='Downloading MPD manifest' if note is None else note,
2515 errnote='Failed to download MPD manifest' if errnote is None else errnote,
2516 fatal=fatal, data=data, headers=headers, query=query)
2517 if res is False:
2518 return [], {}
2519 mpd_doc, urlh = res
2520 if mpd_doc is None:
2521 return [], {}
2522 mpd_base_url = base_url(urlh.geturl())
2523
2524 return self._parse_mpd_formats_and_subtitles(
2525 mpd_doc, mpd_id, mpd_base_url, mpd_url)
2526
2527 def _parse_mpd_formats(self, *args, **kwargs):
2528 fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2529 if subs:
2530 self.report_warning(bug_reports_message(
2531 "Ignoring subtitle tracks found in the DASH manifest; "
2532 "if any subtitle tracks are missing,"
2533 ))
2534 return fmts
2535
2536 def _parse_mpd_formats_and_subtitles(
2537 self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2538 """
2539 Parse formats from MPD manifest.
2540 References:
2541 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2542 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2543 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2544 """
2545 if not self.get_param('dynamic_mpd', True):
2546 if mpd_doc.get('type') == 'dynamic':
2547 return [], {}
2548
2549 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2550
2551 def _add_ns(path):
2552 return self._xpath_ns(path, namespace)
2553
2554 def is_drm_protected(element):
2555 return element.find(_add_ns('ContentProtection')) is not None
2556
2557 def extract_multisegment_info(element, ms_parent_info):
2558 ms_info = ms_parent_info.copy()
2559
2560 # As per [1, 5.3.9.2.2], SegmentList and SegmentTemplate share some
2561 # common attributes and elements. We will only extract the ones
2562 # relevant to us.
2563 def extract_common(source):
2564 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2565 if segment_timeline is not None:
2566 s_e = segment_timeline.findall(_add_ns('S'))
2567 if s_e:
2568 ms_info['total_number'] = 0
2569 ms_info['s'] = []
2570 for s in s_e:
2571 r = int(s.get('r', 0))
2572 ms_info['total_number'] += 1 + r
2573 ms_info['s'].append({
2574 't': int(s.get('t', 0)),
2575 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2576 'd': int(s.attrib['d']),
2577 'r': r,
2578 })
2579 start_number = source.get('startNumber')
2580 if start_number:
2581 ms_info['start_number'] = int(start_number)
2582 timescale = source.get('timescale')
2583 if timescale:
2584 ms_info['timescale'] = int(timescale)
2585 segment_duration = source.get('duration')
2586 if segment_duration:
2587 ms_info['segment_duration'] = float(segment_duration)
2588
2589 def extract_Initialization(source):
2590 initialization = source.find(_add_ns('Initialization'))
2591 if initialization is not None:
2592 ms_info['initialization_url'] = initialization.attrib['sourceURL']
2593
2594 segment_list = element.find(_add_ns('SegmentList'))
2595 if segment_list is not None:
2596 extract_common(segment_list)
2597 extract_Initialization(segment_list)
2598 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2599 if segment_urls_e:
2600 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2601 else:
2602 segment_template = element.find(_add_ns('SegmentTemplate'))
2603 if segment_template is not None:
2604 extract_common(segment_template)
2605 media = segment_template.get('media')
2606 if media:
2607 ms_info['media'] = media
2608 initialization = segment_template.get('initialization')
2609 if initialization:
2610 ms_info['initialization'] = initialization
2611 else:
2612 extract_Initialization(segment_template)
2613 return ms_info
2614
2615 skip_unplayable = not self.get_param('allow_unplayable_formats')
2616
2617 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2618 formats = []
2619 subtitles = {}
2620 for period in mpd_doc.findall(_add_ns('Period')):
2621 period_duration = parse_duration(period.get('duration')) or mpd_duration
2622 period_ms_info = extract_multisegment_info(period, {
2623 'start_number': 1,
2624 'timescale': 1,
2625 })
2626 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2627 if skip_unplayable and is_drm_protected(adaptation_set):
2628 continue
2629 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2630 for representation in adaptation_set.findall(_add_ns('Representation')):
2631 if skip_unplayable and is_drm_protected(representation):
2632 continue
2633 representation_attrib = adaptation_set.attrib.copy()
2634 representation_attrib.update(representation.attrib)
2635 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2636 mime_type = representation_attrib['mimeType']
2637 content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2638
2639 if content_type in ('video', 'audio', 'text'):
2640 base_url = ''
2641 for element in (representation, adaptation_set, period, mpd_doc):
2642 base_url_e = element.find(_add_ns('BaseURL'))
2643 if base_url_e is not None:
2644 base_url = base_url_e.text + base_url
2645 if re.match(r'^https?://', base_url):
2646 break
2647 if mpd_base_url and not re.match(r'^https?://', base_url):
2648 if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
2649 mpd_base_url += '/'
2650 base_url = mpd_base_url + base_url
2651 representation_id = representation_attrib.get('id')
2652 lang = representation_attrib.get('lang')
2653 url_el = representation.find(_add_ns('BaseURL'))
2654 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2655 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2656 if content_type in ('video', 'audio'):
2657 f = {
2658 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
2659 'manifest_url': mpd_url,
2660 'ext': mimetype2ext(mime_type),
2661 'width': int_or_none(representation_attrib.get('width')),
2662 'height': int_or_none(representation_attrib.get('height')),
2663 'tbr': float_or_none(bandwidth, 1000),
2664 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2665 'fps': int_or_none(representation_attrib.get('frameRate')),
2666 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2667 'format_note': 'DASH %s' % content_type,
2668 'filesize': filesize,
2669 'container': mimetype2ext(mime_type) + '_dash',
2670 }
2671 f.update(parse_codecs(representation_attrib.get('codecs')))
2672 elif content_type == 'text':
2673 f = {
2674 'ext': mimetype2ext(mime_type),
2675 'manifest_url': mpd_url,
2676 'filesize': filesize,
2677 }
2678 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2679
2680 def prepare_template(template_name, identifiers):
2681 tmpl = representation_ms_info[template_name]
2682 # First off, % characters outside $...$ templates
2683 # must be escaped by doubling for proper processing
2684 # by the % operator string formatting used further below (see
2685 # https://github.com/ytdl-org/youtube-dl/issues/16867).
2686 t = ''
2687 in_template = False
2688 for c in tmpl:
2689 t += c
2690 if c == '$':
2691 in_template = not in_template
2692 elif c == '%' and not in_template:
2693 t += c
2694 # Next, $...$ templates are translated to their
2695 # %(...) counterparts to be used with % operator
2696 t = t.replace('$RepresentationID$', representation_id)
2697 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2698 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2699 t = t.replace('$$', '$')
2700 return t
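# Illustrative expansion (made-up template): with identifiers
# ('Number', 'Bandwidth', 'Time') and representation_id 'video=1',
#   '$RepresentationID$/seg-$Number%05d$.m4s'
# becomes
#   'video=1/seg-%(Number)05d.m4s'
# ready for %-formatting with the fragment's Number below.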
2701
2702 # @initialization is a regular template like the @media one,
2703 # so it should be handled in just the same way (see
2704 # https://github.com/ytdl-org/youtube-dl/issues/11605)
2705 if 'initialization' in representation_ms_info:
2706 initialization_template = prepare_template(
2707 'initialization',
2708 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2709 # $Time$ shall not be included for @initialization thus
2710 # only $Bandwidth$ remains
2711 ('Bandwidth', ))
2712 representation_ms_info['initialization_url'] = initialization_template % {
2713 'Bandwidth': bandwidth,
2714 }
2715
2716 def location_key(location):
2717 return 'url' if re.match(r'^https?://', location) else 'path'
2718
2719 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2720
2721 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2722 media_location_key = location_key(media_template)
2723
2724 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2725 # can't be used at the same time
2726 if '%(Number' in media_template and 's' not in representation_ms_info:
2727 segment_duration = None
2728 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2729 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2730 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2731 representation_ms_info['fragments'] = [{
2732 media_location_key: media_template % {
2733 'Number': segment_number,
2734 'Bandwidth': bandwidth,
2735 },
2736 'duration': segment_duration,
2737 } for segment_number in range(
2738 representation_ms_info['start_number'],
2739 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2740 else:
2741 # $Number*$ or $Time$ in media template with S list available
2742 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2743 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
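# Illustrative expansion (made-up values): an S list such as
#   [{'t': 0, 'd': 4000, 'r': 1}, {'t': 8000, 'd': 2000, 'r': 0}]
# with timescale 1000 yields fragments at Time 0, 4000 and 8000,
# lasting 4s, 4s and 2s respectively.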
2744 representation_ms_info['fragments'] = []
2745 segment_time = 0
2746 segment_d = None
2747 segment_number = representation_ms_info['start_number']
2748
2749 def add_segment_url():
2750 segment_url = media_template % {
2751 'Time': segment_time,
2752 'Bandwidth': bandwidth,
2753 'Number': segment_number,
2754 }
2755 representation_ms_info['fragments'].append({
2756 media_location_key: segment_url,
2757 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2758 })
2759
2760 for num, s in enumerate(representation_ms_info['s']):
2761 segment_time = s.get('t') or segment_time
2762 segment_d = s['d']
2763 add_segment_url()
2764 segment_number += 1
2765 for r in range(s.get('r', 0)):
2766 segment_time += segment_d
2767 add_segment_url()
2768 segment_number += 1
2769 segment_time += segment_d
2770 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2771 # No media template
2772 # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2773 # or any YouTube dashsegments video
2774 fragments = []
2775 segment_index = 0
2776 timescale = representation_ms_info['timescale']
2777 for s in representation_ms_info['s']:
2778 duration = float_or_none(s['d'], timescale)
2779 for r in range(s.get('r', 0) + 1):
2780 segment_uri = representation_ms_info['segment_urls'][segment_index]
2781 fragments.append({
2782 location_key(segment_uri): segment_uri,
2783 'duration': duration,
2784 })
2785 segment_index += 1
2786 representation_ms_info['fragments'] = fragments
2787 elif 'segment_urls' in representation_ms_info:
2788 # Segment URLs with no SegmentTimeline
2789 # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2790 # https://github.com/ytdl-org/youtube-dl/pull/14844
2791 fragments = []
2792 segment_duration = float_or_none(
2793 representation_ms_info['segment_duration'],
2794 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2795 for segment_url in representation_ms_info['segment_urls']:
2796 fragment = {
2797 location_key(segment_url): segment_url,
2798 }
2799 if segment_duration:
2800 fragment['duration'] = segment_duration
2801 fragments.append(fragment)
2802 representation_ms_info['fragments'] = fragments
2803 # If a fragments key is available, then we correctly recognized fragmented media.
2804 # Otherwise we assume unfragmented media with direct access. Technically, this
2805 # assumption is not necessarily correct, since we may simply not support some
2806 # forms of fragmented media renditions yet, but for now we'll use this fallback.
2807 if 'fragments' in representation_ms_info:
2808 f.update({
2809 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2810 'url': mpd_url or base_url,
2811 'fragment_base_url': base_url,
2812 'fragments': [],
2813 'protocol': 'http_dash_segments',
2814 })
2815 if 'initialization_url' in representation_ms_info:
2816 initialization_url = representation_ms_info['initialization_url']
2817 if not f.get('url'):
2818 f['url'] = initialization_url
2819 f['fragments'].append({location_key(initialization_url): initialization_url})
2820 f['fragments'].extend(representation_ms_info['fragments'])
2821 else:
2822 # Assuming direct URL to unfragmented media.
2823 f['url'] = base_url
2824 if content_type in ('video', 'audio'):
2825 formats.append(f)
2826 elif content_type == 'text':
2827 subtitles.setdefault(lang or 'und', []).append(f)
2828 else:
2829 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2830 return formats, subtitles
2831
2832 def _extract_ism_formats(self, *args, **kwargs):
2833 fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2834 if subs:
2835 self.report_warning(bug_reports_message(
2836 "Ignoring subtitle tracks found in the ISM manifest; "
2837 "if any subtitle tracks are missing,"
2838 ))
2839 return fmts
2840
2841 def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2842 res = self._download_xml_handle(
2843 ism_url, video_id,
2844 note='Downloading ISM manifest' if note is None else note,
2845 errnote='Failed to download ISM manifest' if errnote is None else errnote,
2846 fatal=fatal, data=data, headers=headers, query=query)
2847 if res is False:
2848 return [], {}
2849 ism_doc, urlh = res
2850 if ism_doc is None:
2851 return [], {}
2852
2853 return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
2854
2855 def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
2856 """
2857 Parse formats from ISM manifest.
2858 References:
2859 1. [MS-SSTR]: Smooth Streaming Protocol,
2860 https://msdn.microsoft.com/en-us/library/ff469518.aspx
2861 """
2862 if ism_doc.get('IsLive') == 'TRUE':
2863 return [], {}
2864 if (not self.get_param('allow_unplayable_formats')
2865 and ism_doc.find('Protection') is not None):
2866 return [], {}
2867
2868 duration = int(ism_doc.attrib['Duration'])
2869 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
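# e.g. Duration="1234500000" with the default TimeScale of 10000000
# (100ns ticks) corresponds to 123.45 seconds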
2870
2871 formats = []
2872 subtitles = {}
2873 for stream in ism_doc.findall('StreamIndex'):
2874 stream_type = stream.get('Type')
2875 if stream_type not in ('video', 'audio', 'text'):
2876 continue
2877 url_pattern = stream.attrib['Url']
2878 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2879 stream_name = stream.get('Name')
2880 stream_language = stream.get('Language', 'und')
2881 for track in stream.findall('QualityLevel'):
2882 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
2883 # TODO: add support for WVC1 and WMAP
2884 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
2885 self.report_warning('%s is not a supported codec' % fourcc)
2886 continue
2887 tbr = int(track.attrib['Bitrate']) // 1000
2888 # [1] does not mention Width and Height attributes. However,
2889 # they're often present while MaxWidth and MaxHeight are
2890 # missing, so they should be used as fallbacks
2891 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2892 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2893 sampling_rate = int_or_none(track.get('SamplingRate'))
2894
2895 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2896 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2897
2898 fragments = []
2899 fragment_ctx = {
2900 'time': 0,
2901 }
2902 stream_fragments = stream.findall('c')
2903 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2904 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2905 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2906 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2907 if not fragment_ctx['duration']:
2908 try:
2909 next_fragment_time = int(stream_fragments[stream_fragment_index + 1].attrib['t'])
2910 except IndexError:
2911 next_fragment_time = duration
2912 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2913 for _ in range(fragment_repeat):
2914 fragments.append({
2915 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2916 'duration': fragment_ctx['duration'] / stream_timescale,
2917 })
2918 fragment_ctx['time'] += fragment_ctx['duration']
2919
2920 format_id = []
2921 if ism_id:
2922 format_id.append(ism_id)
2923 if stream_name:
2924 format_id.append(stream_name)
2925 format_id.append(compat_str(tbr))
2926
2927 if stream_type == 'text':
2928 subtitles.setdefault(stream_language, []).append({
2929 'ext': 'ismt',
2930 'protocol': 'ism',
2931 'url': ism_url,
2932 'manifest_url': ism_url,
2933 'fragments': fragments,
2934 '_download_params': {
2935 'stream_type': stream_type,
2936 'duration': duration,
2937 'timescale': stream_timescale,
2938 'fourcc': fourcc,
2939 'language': stream_language,
2940 'codec_private_data': track.get('CodecPrivateData'),
2941 }
2942 })
2943 elif stream_type in ('video', 'audio'):
2944 formats.append({
2945 'format_id': '-'.join(format_id),
2946 'url': ism_url,
2947 'manifest_url': ism_url,
2948 'ext': 'ismv' if stream_type == 'video' else 'isma',
2949 'width': width,
2950 'height': height,
2951 'tbr': tbr,
2952 'asr': sampling_rate,
2953 'vcodec': 'none' if stream_type == 'audio' else fourcc,
2954 'acodec': 'none' if stream_type == 'video' else fourcc,
2955 'protocol': 'ism',
2956 'fragments': fragments,
2957 '_download_params': {
2958 'stream_type': stream_type,
2959 'duration': duration,
2960 'timescale': stream_timescale,
2961 'width': width or 0,
2962 'height': height or 0,
2963 'fourcc': fourcc,
2964 'language': stream_language,
2965 'codec_private_data': track.get('CodecPrivateData'),
2966 'sampling_rate': sampling_rate,
2967 'channels': int_or_none(track.get('Channels', 2)),
2968 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2969 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2970 },
2971 })
2972 return formats, subtitles
2973
2974 def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
2975 def absolute_url(item_url):
2976 return urljoin(base_url, item_url)
2977
2978 def parse_content_type(content_type):
2979 if not content_type:
2980 return {}
2981 ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2982 if ctr:
2983 mimetype, codecs = ctr.groups()
2984 f = parse_codecs(codecs)
2985 f['ext'] = mimetype2ext(mimetype)
2986 return f
2987 return {}
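# As an illustration (hypothetical input), parse_content_type above, given
#   'video/mp4; codecs="avc1.4d401e, mp4a.40.2"'
# would yield roughly {'ext': 'mp4', 'vcodec': 'avc1.4d401e',
# 'acodec': 'mp4a.40.2'}, with the exact keys determined by parse_codecs.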
2988
2989 def _media_formats(src, cur_media_type, type_info={}):
2990 full_url = absolute_url(src)
2991 ext = type_info.get('ext') or determine_ext(full_url)
2992 if ext == 'm3u8':
2993 is_plain_url = False
2994 formats = self._extract_m3u8_formats(
2995 full_url, video_id, ext='mp4',
2996 entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2997 preference=preference, quality=quality, fatal=False)
2998 elif ext == 'mpd':
2999 is_plain_url = False
3000 formats = self._extract_mpd_formats(
3001 full_url, video_id, mpd_id=mpd_id, fatal=False)
3002 else:
3003 is_plain_url = True
3004 formats = [{
3005 'url': full_url,
3006 'vcodec': 'none' if cur_media_type == 'audio' else None,
3007 }]
3008 return is_plain_url, formats
3009
3010 entries = []
3011 # amp-video and amp-audio are very similar to their HTML5 counterparts
3012 # so we will include them right here (see
3013 # https://www.ampproject.org/docs/reference/components/amp-video)
3014 # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3015 _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3016 media_tags = [(media_tag, media_tag_name, media_type, '')
3017 for media_tag, media_tag_name, media_type
3018 in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3019 media_tags.extend(re.findall(
3020 # We only allow video|audio followed by whitespace or '>'.
3021 # Allowing more characters may end up in significant slow down (see
3022 # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
3023 # http://www.porntrex.com/maps/videositemap.xml).
3024 r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3025 for media_tag, _, media_type, media_content in media_tags:
3026 media_info = {
3027 'formats': [],
3028 'subtitles': {},
3029 }
3030 media_attributes = extract_attributes(media_tag)
3031 src = strip_or_none(media_attributes.get('src'))
3032 if src:
3033 _, formats = _media_formats(src, media_type)
3034 media_info['formats'].extend(formats)
3035 media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3036 if media_content:
3037 for source_tag in re.findall(r'<source[^>]+>', media_content):
3038 s_attr = extract_attributes(source_tag)
3039 # data-video-src and data-src are non-standard but seen
3040 # several times in the wild
3041 src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
3042 if not src:
3043 continue
3044 f = parse_content_type(s_attr.get('type'))
3045 is_plain_url, formats = _media_formats(src, media_type, f)
3046 if is_plain_url:
3047 # width, height, res, label and title attributes are
3048 # all non-standard but seen several times in the wild
3049 labels = [
3050 s_attr.get(lbl)
3051 for lbl in ('label', 'title')
3052 if str_or_none(s_attr.get(lbl))
3053 ]
3054 width = int_or_none(s_attr.get('width'))
3055 height = (int_or_none(s_attr.get('height'))
3056 or int_or_none(s_attr.get('res')))
3057 if not width or not height:
3058 for lbl in labels:
3059 resolution = parse_resolution(lbl)
3060 if not resolution:
3061 continue
3062 width = width or resolution.get('width')
3063 height = height or resolution.get('height')
3064 for lbl in labels:
3065 tbr = parse_bitrate(lbl)
3066 if tbr:
3067 break
3068 else:
3069 tbr = None
3070 f.update({
3071 'width': width,
3072 'height': height,
3073 'tbr': tbr,
3074 'format_id': s_attr.get('label') or s_attr.get('title'),
3075 })
3076 f.update(formats[0])
3077 media_info['formats'].append(f)
3078 else:
3079 media_info['formats'].extend(formats)
3080 for track_tag in re.findall(r'<track[^>]+>', media_content):
3081 track_attributes = extract_attributes(track_tag)
3082 kind = track_attributes.get('kind')
3083 if not kind or kind in ('subtitles', 'captions'):
3084 src = strip_or_none(track_attributes.get('src'))
3085 if not src:
3086 continue
3087 lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3088 media_info['subtitles'].setdefault(lang, []).append({
3089 'url': absolute_url(src),
3090 })
3091 for f in media_info['formats']:
3092 f.setdefault('http_headers', {})['Referer'] = base_url
3093 if media_info['formats'] or media_info['subtitles']:
3094 entries.append(media_info)
3095 return entries
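# A minimal usage sketch for _parse_html5_media_entries from an extractor,
# assuming `webpage` and `url` have already been fetched by the caller:
#
#   entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
#   if entries:
#       info = entries[0]
#       self._sort_formats(info['formats'])
#       info['id'], info['title'] = video_id, title
#       return info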
3096
3097 def _extract_akamai_formats(self, *args, **kwargs):
3098 fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3099 if subs:
3100 self.report_warning(bug_reports_message(
3101 "Ignoring subtitle tracks found in the manifests; "
3102 "if any subtitle tracks are missing,"
3103 ))
3104 return fmts
3105
3106 def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3107 signed = 'hdnea=' in manifest_url
3108 if not signed:
3109 # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3110 manifest_url = re.sub(
3111 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3112 '', manifest_url).strip('?')
3113
3114 formats = []
3115 subtitles = {}
3116
3117 hdcore_sign = 'hdcore=3.7.0'
3118 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3119 hds_host = hosts.get('hds')
3120 if hds_host:
3121 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3122 if 'hdcore=' not in f4m_url:
3123 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3124 f4m_formats = self._extract_f4m_formats(
3125 f4m_url, video_id, f4m_id='hds', fatal=False)
3126 for entry in f4m_formats:
3127 entry.update({'extra_param_to_segment_url': hdcore_sign})
3128 formats.extend(f4m_formats)
3129
3130 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3131 hls_host = hosts.get('hls')
3132 if hls_host:
3133 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3134 m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3135 m3u8_url, video_id, 'mp4', 'm3u8_native',
3136 m3u8_id='hls', fatal=False)
3137 formats.extend(m3u8_formats)
3138 subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3139
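# Derive plain-HTTP progressive formats from the HLS renditions. For an
# Akamai URL of the typical shape (hypothetical example)
#   https://example-vh.akamaihd.net/i/videos/clip_,300,600,900,.mp4.csmil/master.m3u8
# the second REPL_REGEX group yields the qualities ('300', '600', '900'),
# and each HLS variant is rewritten below to a direct file URL such as
#   https://<http_host>/videos/clip_300.mp4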
3140 http_host = hosts.get('http')
3141 if http_host and m3u8_formats and not signed:
3142 REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3143 qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3144 qualities_length = len(qualities)
3145 if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3146 i = 0
3147 for f in m3u8_formats:
3148 if f['vcodec'] != 'none':
3149 for protocol in ('http', 'https'):
3150 http_f = f.copy()
3151 del http_f['manifest_url']
3152 http_url = re.sub(
3153 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
3154 http_f.update({
3155 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3156 'url': http_url,
3157 'protocol': protocol,
3158 })
3159 formats.append(http_f)
3160 i += 1
3161
3162 return formats, subtitles
3163
3164 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3165 query = compat_urlparse.urlparse(url).query
3166 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3167 mobj = re.search(
3168 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3169 url_base = mobj.group('url')
3170 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3171 formats = []
3172
3173 def manifest_url(manifest):
3174 m_url = '%s/%s' % (http_base_url, manifest)
3175 if query:
3176 m_url += '?%s' % query
3177 return m_url
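# E.g. with a (hypothetical) url_base of '//example.com/vod/mp4:sample.mp4',
# the candidate manifests probed via manifest_url below would be
#   http://example.com/vod/mp4:sample.mp4/playlist.m3u8
#   http://example.com/vod/mp4:sample.mp4/manifest.f4m
#   http://example.com/vod/mp4:sample.mp4/manifest.mpd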
3178
3179 if 'm3u8' not in skip_protocols:
3180 formats.extend(self._extract_m3u8_formats(
3181 manifest_url('playlist.m3u8'), video_id, 'mp4',
3182 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3183 if 'f4m' not in skip_protocols:
3184 formats.extend(self._extract_f4m_formats(
3185 manifest_url('manifest.f4m'),
3186 video_id, f4m_id='hds', fatal=False))
3187 if 'dash' not in skip_protocols:
3188 formats.extend(self._extract_mpd_formats(
3189 manifest_url('manifest.mpd'),
3190 video_id, mpd_id='dash', fatal=False))
3191 if re.search(r'(?:/smil:|\.smil)', url_base):
3192 if 'smil' not in skip_protocols:
3193 rtmp_formats = self._extract_smil_formats(
3194 manifest_url('jwplayer.smil'),
3195 video_id, fatal=False)
3196 for rtmp_format in rtmp_formats:
3197 rtsp_format = rtmp_format.copy()
3198 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3199 del rtsp_format['play_path']
3200 del rtsp_format['ext']
3201 rtsp_format.update({
3202 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3203 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3204 'protocol': 'rtsp',
3205 })
3206 formats.extend([rtmp_format, rtsp_format])
3207 else:
3208 for protocol in ('rtmp', 'rtsp'):
3209 if protocol not in skip_protocols:
3210 formats.append({
3211 'url': '%s:%s' % (protocol, url_base),
3212 'format_id': protocol,
3213 'protocol': protocol,
3214 })
3215 return formats
3216
3217 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
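# Matches inline embeds of the form (illustrative sketch):
#   jwplayer("myplayer").setup({"playlist": [{"sources": [...]}]});
# and parses the setup options as JSON (after transform_source).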
3218 mobj = re.search(
3219 r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3220 webpage)
3221 if mobj:
3222 try:
3223 jwplayer_data = self._parse_json(mobj.group('options'),
3224 video_id=video_id,
3225 transform_source=transform_source)
3226 except ExtractorError:
3227 pass
3228 else:
3229 if isinstance(jwplayer_data, dict):
3230 return jwplayer_data
3231
3232 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3233 jwplayer_data = self._find_jwplayer_data(
3234 webpage, video_id, transform_source=js_to_json)
3235 return self._parse_jwplayer_data(
3236 jwplayer_data, video_id, *args, **kwargs)
3237
3238 def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3239 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3240 # JWPlayer backward compatibility: flattened playlists
3241 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3242 if 'playlist' not in jwplayer_data:
3243 jwplayer_data = {'playlist': [jwplayer_data]}
3244
3245 entries = []
3246
3247 # JWPlayer backward compatibility: single playlist item
3248 # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3249 if not isinstance(jwplayer_data['playlist'], list):
3250 jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3251
3252 for video_data in jwplayer_data['playlist']:
3253 # JWPlayer backward compatibility: flattened sources
3254 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3255 if 'sources' not in video_data:
3256 video_data['sources'] = [video_data]
3257
3258 this_video_id = video_id or video_data['mediaid']
3259
3260 formats = self._parse_jwplayer_formats(
3261 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3262 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3263
3264 subtitles = {}
3265 tracks = video_data.get('tracks')
3266 if tracks and isinstance(tracks, list):
3267 for track in tracks:
3268 if not isinstance(track, dict):
3269 continue
3270 track_kind = track.get('kind')
3271 if not track_kind or not isinstance(track_kind, compat_str):
3272 continue
3273 if track_kind.lower() not in ('captions', 'subtitles'):
3274 continue
3275 track_url = urljoin(base_url, track.get('file'))
3276 if not track_url:
3277 continue
3278 subtitles.setdefault(track.get('label') or 'en', []).append({
3279 'url': self._proto_relative_url(track_url)
3280 })
3281
3282 entry = {
3283 'id': this_video_id,
3284 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3285 'description': clean_html(video_data.get('description')),
3286 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3287 'timestamp': int_or_none(video_data.get('pubdate')),
3288 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3289 'subtitles': subtitles,
3290 }
3291 # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3292 if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3293 entry.update({
3294 '_type': 'url_transparent',
3295 'url': formats[0]['url'],
3296 })
3297 else:
3298 self._sort_formats(formats)
3299 entry['formats'] = formats
3300 entries.append(entry)
3301 if len(entries) == 1:
3302 return entries[0]
3303 else:
3304 return self.playlist_result(entries)
3305
3306 def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3307 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3308 urls = []
3309 formats = []
3310 for source in jwplayer_sources_data:
3311 if not isinstance(source, dict):
3312 continue
3313 source_url = urljoin(
3314 base_url, self._proto_relative_url(source.get('file')))
3315 if not source_url or source_url in urls:
3316 continue
3317 urls.append(source_url)
3318 source_type = source.get('type') or ''
3319 ext = mimetype2ext(source_type) or determine_ext(source_url)
3320 if source_type == 'hls' or ext == 'm3u8':
3321 formats.extend(self._extract_m3u8_formats(
3322 source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3323 m3u8_id=m3u8_id, fatal=False))
3324 elif source_type == 'dash' or ext == 'mpd':
3325 formats.extend(self._extract_mpd_formats(
3326 source_url, video_id, mpd_id=mpd_id, fatal=False))
3327 elif ext == 'smil':
3328 formats.extend(self._extract_smil_formats(
3329 source_url, video_id, fatal=False))
3330 # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3331 elif source_type.startswith('audio') or ext in (
3332 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3333 formats.append({
3334 'url': source_url,
3335 'vcodec': 'none',
3336 'ext': ext,
3337 })
3338 else:
3339 height = int_or_none(source.get('height'))
3340 if height is None:
3341 # Often no height is provided but there is a label in a
3342 # format like "1080p", "720p SD", or 1080.
3343 height = int_or_none(self._search_regex(
3344 r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3345 'height', default=None))
3346 a_format = {
3347 'url': source_url,
3348 'width': int_or_none(source.get('width')),
3349 'height': height,
3350 'tbr': int_or_none(source.get('bitrate')),
3351 'ext': ext,
3352 }
3353 if source_url.startswith('rtmp'):
3354 a_format['ext'] = 'flv'
3355 # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3356 # of jwplayer.flash.swf
3357 rtmp_url_parts = re.split(
3358 r'((?:mp4|mp3|flv):)', source_url, 1)
3359 if len(rtmp_url_parts) == 3:
3360 rtmp_url, prefix, play_path = rtmp_url_parts
3361 a_format.update({
3362 'url': rtmp_url,
3363 'play_path': prefix + play_path,
3364 })
3365 if rtmp_params:
3366 a_format.update(rtmp_params)
3367 formats.append(a_format)
3368 return formats
3369
3370 def _live_title(self, name):
3371 """ Generate the title for a live video """
3372 now = datetime.datetime.now()
3373 now_str = now.strftime('%Y-%m-%d %H:%M')
3374 return name + ' ' + now_str
3375
3376 def _int(self, v, name, fatal=False, **kwargs):
3377 res = int_or_none(v, **kwargs)
3380 if res is None:
3381 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3382 if fatal:
3383 raise ExtractorError(msg)
3384 else:
3385 self.report_warning(msg)
3386 return res
3387
3388 def _float(self, v, name, fatal=False, **kwargs):
3389 res = float_or_none(v, **kwargs)
3390 if res is None:
3391 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3392 if fatal:
3393 raise ExtractorError(msg)
3394 else:
3395 self.report_warning(msg)
3396 return res
3397
3398 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3399 path='/', secure=False, discard=False, rest={}, **kwargs):
3400 cookie = compat_cookiejar_Cookie(
3401 0, name, value, port, port is not None, domain, True,
3402 domain.startswith('.'), path, True, secure, expire_time,
3403 discard, None, None, rest)
3404 self._downloader.cookiejar.set_cookie(cookie)
3405
3406 def _get_cookies(self, url):
3407 """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3408 req = sanitized_Request(url)
3409 self._downloader.cookiejar.add_cookie_header(req)
3410 return compat_cookies_SimpleCookie(req.get_header('Cookie'))
3411
3412 def _apply_first_set_cookie_header(self, url_handle, cookie):
3413 """
3414 Apply the first Set-Cookie header instead of the last. Experimental.
3415
3416 Some sites (e.g. [1-3]) may serve two cookies under the same name
3417 in the Set-Cookie header and expect the first (old) one to be set
3418 rather than the second (new) one. However, per RFC 6265 the newer
3419 cookie is the one that should end up in the cookie store, and that
3420 is what actually happens. We work around this issue by resetting
3421 the cookie to the first one manually.
3422 1. https://new.vk.com/
3423 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3424 3. https://learning.oreilly.com/
3425 """
3426 for header, cookies in url_handle.headers.items():
3427 if header.lower() != 'set-cookie':
3428 continue
3429 if sys.version_info[0] >= 3:
3430 cookies = cookies.encode('iso-8859-1')
3431 cookies = cookies.decode('utf-8')
3432 cookie_value = re.search(
3433 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3434 if cookie_value:
3435 value, domain = cookie_value.groups()
3436 self._set_cookie(domain, cookie, value)
3437 break
3438
3439 def get_testcases(self, include_onlymatching=False):
3440 t = getattr(self, '_TEST', None)
3441 if t:
3442 assert not hasattr(self, '_TESTS'), \
3443 '%s has _TEST and _TESTS' % type(self).__name__
3444 tests = [t]
3445 else:
3446 tests = getattr(self, '_TESTS', [])
3447 for t in tests:
3448 if not include_onlymatching and t.get('only_matching', False):
3449 continue
3450 t['name'] = type(self).__name__[:-len('IE')]
3451 yield t
3452
3453 def is_suitable(self, age_limit):
3454 """ Test whether the extractor is generally suitable for the given
3455 age limit (i.e. pornographic sites are not, all others usually are) """
3456
3457 any_restricted = False
3458 for tc in self.get_testcases(include_onlymatching=False):
3459 if tc.get('playlist', []):
3460 tc = tc['playlist'][0]
3461 is_restricted = age_restricted(
3462 tc.get('info_dict', {}).get('age_limit'), age_limit)
3463 if not is_restricted:
3464 return True
3465 any_restricted = any_restricted or is_restricted
3466 return not any_restricted
3467
3468 def extract_subtitles(self, *args, **kwargs):
3469 if (self.get_param('writesubtitles', False)
3470 or self.get_param('listsubtitles')):
3471 return self._get_subtitles(*args, **kwargs)
3472 return {}
3473
3474 def _get_subtitles(self, *args, **kwargs):
3475 raise NotImplementedError('This method must be implemented by subclasses')
3476
3477 @staticmethod
3478 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3479 """ Merge subtitle items for one language. Items with duplicated URLs
3480 will be dropped. """
3481 list1_urls = set(item['url'] for item in subtitle_list1)
3482 ret = list(subtitle_list1)
3483 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
3484 return ret
3485
3486 @classmethod
3487 def _merge_subtitles(cls, *dicts, **kwargs):
3488 """ Merge subtitle dictionaries, language by language. """
3489
3490 target = (lambda target=None: target)(**kwargs)
3491 # The above lambda extracts the keyword argument 'target' from kwargs
3492 # while ensuring there are no stray ones. When Python 2 support
3493 # is dropped, remove it and change the function signature to:
3494 #
3495 # def _merge_subtitles(cls, *dicts, target=None):
3496
3497 if target is None:
3498 target = {}
3499 for d in dicts:
3500 for lang, subs in d.items():
3501 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3502 return target
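# Usage sketch with hypothetical inputs:
#   self._merge_subtitles({'en': [{'url': url_a}]}, {'en': [{'url': url_b}]})
#   -> {'en': [{'url': url_a}, {'url': url_b}]} (duplicate URLs are dropped)
# Pass target=some_dict to merge into an existing dictionary in place.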
3503
3504 def extract_automatic_captions(self, *args, **kwargs):
3505 if (self.get_param('writeautomaticsub', False)
3506 or self.get_param('listsubtitles')):
3507 return self._get_automatic_captions(*args, **kwargs)
3508 return {}
3509
3510 def _get_automatic_captions(self, *args, **kwargs):
3511 raise NotImplementedError('This method must be implemented by subclasses')
3512
3513 def mark_watched(self, *args, **kwargs):
3514 if (self.get_param('mark_watched', False)
3515 and (self._get_login_info()[0] is not None
3516 or self.get_param('cookiefile') is not None)):
3517 self._mark_watched(*args, **kwargs)
3518
3519 def _mark_watched(self, *args, **kwargs):
3520 raise NotImplementedError('This method must be implemented by subclasses')
3521
3522 def geo_verification_headers(self):
3523 headers = {}
3524 geo_verification_proxy = self.get_param('geo_verification_proxy')
3525 if geo_verification_proxy:
3526 headers['Ytdl-request-proxy'] = geo_verification_proxy
3527 return headers
3528
3529 def _generic_id(self, url):
3530 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3531
3532 def _generic_title(self, url):
3533 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
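# Both helpers reduce a URL to its unquoted basename without the extension,
# e.g. (hypothetical URL) 'https://example.com/media/some%20clip.mp4'
# becomes 'some clip'.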
3534
3535 @staticmethod
3536 def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3537 all_known = all(map(
3538 lambda x: x is not None,
3539 (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3540 return (
3541 'private' if is_private
3542 else 'premium_only' if needs_premium
3543 else 'subscriber_only' if needs_subscription
3544 else 'needs_auth' if needs_auth
3545 else 'unlisted' if is_unlisted
3546 else 'public' if all_known
3547 else None)
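# For instance (hypothetical call):
#   self._availability(is_private=False, needs_premium=False,
#                      needs_subscription=False, needs_auth=False,
#                      is_unlisted=False)
# returns 'public', while leaving any flag as None returns None instead,
# since 'public' cannot be asserted without knowing all five.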
3548
3549
3550 class SearchInfoExtractor(InfoExtractor):
3551 """
3552 Base class for paged search query extractors.
3553 They accept URLs in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
3554 Instances should define _SEARCH_KEY and _MAX_RESULTS.
3555 """
3556
3557 @classmethod
3558 def _make_valid_url(cls):
3559 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3560
3561 @classmethod
3562 def suitable(cls, url):
3563 return re.match(cls._make_valid_url(), url) is not None
3564
3565 def _real_extract(self, query):
3566 mobj = re.match(self._make_valid_url(), query)
3567 if mobj is None:
3568 raise ExtractorError('Invalid search query "%s"' % query)
3569
3570 prefix = mobj.group('prefix')
3571 query = mobj.group('query')
3572 if prefix == '':
3573 return self._get_n_results(query, 1)
3574 elif prefix == 'all':
3575 return self._get_n_results(query, self._MAX_RESULTS)
3576 else:
3577 n = int(prefix)
3578 if n <= 0:
3579 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
3580 elif n > self._MAX_RESULTS:
3581 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3582 n = self._MAX_RESULTS
3583 return self._get_n_results(query, n)
3584
3585 def _get_n_results(self, query, n):
3586 """Get a specified number of results for a query"""
3587 raise NotImplementedError('This method must be implemented by subclasses')
3588
3589 @property
3590 def SEARCH_KEY(self):
3591 return self._SEARCH_KEY