# coding: utf-8
from __future__ import unicode_literals

import base64
import datetime
import hashlib
import json
import math
import netrc
import os
import random
import re
import socket
import ssl
import sys
import time

from ..compat import (
    compat_cookiejar_Cookie,
    compat_cookies,
    compat_etree_Element,
    compat_etree_fromstring,
    compat_getpass,
    compat_http_client,
    compat_integer_types,
    compat_os_name,
    compat_str,
    compat_urllib_error,
    compat_urllib_parse_unquote,
    compat_urllib_parse_urlencode,
    compat_urllib_request,
    compat_urlparse,
    compat_xml_parse_error,
)
from ..downloader import FileDownloader
from ..downloader.f4m import (
    get_base_url,
    remove_encrypted_media,
)
from ..utils import (
    NO_DEFAULT,
    age_restricted,
    base_url,
    bug_reports_message,
    clean_html,
    compiled_regex_type,
    determine_ext,
    determine_protocol,
    dict_get,
    error_to_compat_str,
    ExtractorError,
    extract_attributes,
    fix_xml_ampersands,
    float_or_none,
    GeoRestrictedError,
    GeoUtils,
    int_or_none,
    js_to_json,
    JSON_LD_RE,
    mimetype2ext,
    orderedSet,
    parse_bitrate,
    parse_codecs,
    parse_duration,
    parse_iso8601,
    parse_m3u8_attributes,
    parse_resolution,
    RegexNotFoundError,
    sanitized_Request,
    sanitize_filename,
    str_or_none,
    str_to_int,
    strip_or_none,
    unescapeHTML,
    unified_strdate,
    unified_timestamp,
    update_Request,
    update_url_query,
    urljoin,
    url_basename,
    url_or_none,
    xpath_element,
    xpath_text,
    xpath_with_ns,
)


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary, which is then
    passed to the YoutubeDL instance. YoutubeDL then processes this
    information, possibly downloading the video to the file system, among
    other possible outcomes.

    The type field determines the type of the result.
    By far the most common value (and the default if _type is missing) is
    "video", which indicates a single video.

    For a video, the dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        The mandatory URL representing the media:
                                   for plain file media - HTTP URL of this file,
                                   for RTMP - RTMP URL,
                                   for HLS - URL of the M3U8 media playlist,
                                   for HDS - URL of the F4M manifest,
                                   for DASH
                                     - HTTP URL to plain file media (in case of
                                       unfragmented media)
                                     - URL of the MPD manifest or base URL
                                       representing the media if MPD manifest
                                       is parsed from a string (in case of
                                       fragmented media)
                                   for MSS - URL of the ISM manifest.
                    * manifest_url
                                 The URL of the manifest file in case of
                                 fragmented media:
                                   for HLS - URL of the M3U8 master playlist,
                                   for HDS - URL of the F4M manifest,
                                   for DASH - URL of the MPD manifest,
                                   for MSS - URL of the ISM manifest.
                    * ext        Will be calculated from URL if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * fps        Frame rate
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp", "rtmpe",
                                 "m3u8", "m3u8_native" or "http_dash_segments".
                    * fragment_base_url
                                 Base URL for fragments. Each fragment's path
                                 value (if present) will be relative to
                                 this URL.
                    * fragments  A list of fragments of a fragmented media.
                                 Each fragment entry must contain either a url
                                 or a path. If a url is present, it should be
                                 used by the client. Otherwise both path and
                                 fragment_base_url must be present. Here is
                                 the list of all potential fields:
                                 * "url" - fragment's URL
                                 * "path" - fragment's path relative to
                                            fragment_base_url
                                 * "duration" (optional, int or float)
                                 * "filesize" (optional, int)
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                                 < -1000 to hide the format (if there is
                                 another one which is strictly better)
                    * language   Language code, e.g. "de" or "en-US".
                    * language_preference
                                 Is this in the language mentioned in
                                 the URL?
                                 10 if it's what the URL is about,
                                 -1 for default (don't know),
                                 -10 otherwise, other values reserved for now.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * source_preference
                                 Order number for this video source
                                 (quality takes higher priority)
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * http_headers
                                 A dictionary of additional HTTP headers
                                 to add to the request.
                    * stretched_ratio
                                 If given and not 1, indicates that the
                                 video's pixels are not square.
                                 width : height ratio as float.
                    * no_resume  The server does not support resuming the
                                 (HTTP or RTMP) download. Boolean.
                    * downloader_options
                                 A dictionary of downloader options as
                                 described in FileDownloader

    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    alt_title:      A secondary title of the video.
    display_id:     An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "id" (optional, string) - Thumbnail format ID
                        * "url"
                        * "preference" (optional, int) - quality of the image
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
                        * "filesize" (optional, int)
    thumbnail:      Full URL to a video thumbnail image.
    description:    Full video description.
    uploader:       Full name of the video uploader.
    license:        License name the video is licensed under.
    creator:        The creator of the video.
    release_date:   The date (YYYYMMDD) when the video was released.
    timestamp:      UNIX timestamp of the moment the video became available.
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp.
    uploader_id:    Nickname or id of the video uploader.
    uploader_url:   Full URL to a personal webpage of the video uploader.
    channel:        Full name of the channel the video is uploaded on.
                    Note that channel fields may or may not repeat uploader
                    fields. This depends on a particular extractor.
    channel_id:     Id of the channel.
    channel_url:    Full URL to a channel webpage.
    location:       Physical location where the video was filmed.
    subtitles:      The available subtitles as a dictionary in the format
                    {tag: subformats}. "tag" is usually a language code, and
                    "subformats" is a list sorted from lower to higher
                    preference; each element is a dictionary with the "ext"
                    entry and one of:
                        * "data": The subtitles file contents
                        * "url": A URL pointing to the subtitles file
                    "ext" will be calculated from URL if missing
    automatic_captions: Like 'subtitles', used by the YoutubeIE for
                    automatically generated captions
    duration:       Length of the video in seconds, as an integer or float.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
    comment_count:  Number of comments on the video
    comments:       A list of comments, each with one or more of the following
                    properties (all but one of text or html optional):
                        * "author" - human-readable name of the comment author
                        * "author_id" - user ID of the comment author
                        * "id" - Comment ID
                        * "html" - Comment as HTML
                        * "text" - Plain text of the comment
                        * "timestamp" - UNIX timestamp of comment
                        * "parent" - ID of the comment this one is replying to.
                                     Set to "root" to indicate that this is a
                                     comment to the original video.
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage; if given to yt-dlp it
                    should allow getting the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]
    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.
    was_live:       True, False, or None (=unknown). Whether this video was
                    originally a live stream.
    start_time:     Time in seconds where the reproduction should start, as
                    specified in the URL.
    end_time:       Time in seconds where the reproduction should end, as
                    specified in the URL.
    chapters:       A list of dictionaries, with the following entries:
                        * "start_time" - The start time of the chapter in seconds
                        * "end_time" - The end time of the chapter in seconds
                        * "title" (optional, string)
    playable_in_embed: Whether this video is allowed to play in embedded
                    players on other sites. Can be True (=always allowed),
                    False (=never allowed), None (=unknown), or a string
                    specifying the criteria for embedability (e.g. 'whitelist').

    The following fields should only be used when the video belongs to some logical
    chapter or section:

    chapter:        Name or title of the chapter the video belongs to.
    chapter_number: Number of the chapter the video belongs to, as an integer.
    chapter_id:     Id of the chapter the video belongs to, as a unicode string.

    The following fields should only be used when the video is an episode of some
    series, programme or podcast:

    series:         Title of the series or programme the video episode belongs to.
    season:         Title of the season the video episode belongs to.
    season_number:  Number of the season the video episode belongs to, as an integer.
    season_id:      Id of the season the video episode belongs to, as a unicode string.
    episode:        Title of the video episode. Unlike the mandatory video title field,
                    this field should denote the exact title of the video episode
                    without any kind of decoration.
    episode_number: Number of the video episode within a season, as an integer.
    episode_id:     Id of the video episode, as a unicode string.

    The following fields should only be used when the media is a track or a part of
    a music album:

    track:          Title of the track.
    track_number:   Number of the track within an album or a disc, as an integer.
    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
                    as a unicode string.
    artist:         Artist(s) of the track.
    genre:          Genre(s) of the track.
    album:          Title of the album the track belongs to.
    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
    album_artist:   List of all artists that appeared on the album (e.g.
                    "Ash Borer / Fell Voices" or "Various Artists", useful for splits
                    and compilations).
    disc_number:    Number of the disc or other physical medium the track belongs to,
                    as an integer.
    release_year:   Year (YYYY) when the album was released.

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.
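
    For example, a minimal single-video result might look like this (all
    values hypothetical):

        {
            'id': '4234987',
            'title': 'Dancing naked mole rats',
            'url': 'https://example.com/media/4234987.mp4',
        }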


    _type "playlist" indicates multiple videos.
    There must be a key "entries", which is a list, an iterable, or a PagedList
    object, each element of which is a valid dictionary by this specification.

    Additionally, playlists can have "id", "title", and any other relevant
    attributes with the same semantics as videos (see above).


    _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
    It must have an entries key like a playlist and contain all the keys
    required for a video at the same time.


    _type "url" indicates that the video must be extracted from another
    location, possibly by a different extractor. Its only required key is:
    "url" - the next URL to extract.
    The key "ie_key" can be set to the class name (minus the trailing "IE",
    e.g. "Youtube") if the extractor class is known in advance.
    Additionally, the dictionary may have any properties of the resolved entity
    known in advance, for example "title" if the title of the referred video is
    known ahead of time.


    _type "url_transparent" entities have the same specification as "url", but
    indicate that the given additional information is more precise than the one
    associated with the resolved URL.
    This is useful when a site employs a video service that hosts the video and
    its technical metadata, but that video service does not embed a useful
    title, description etc.
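
    As a sketch (identifiers hypothetical), a "url_transparent" result that
    delegates extraction while overriding the title might be:

        {
            '_type': 'url_transparent',
            'url': 'https://videohost.example/embed/abc123',
            'ie_key': 'SomeVideoHost',
            'title': 'Title taken from the embedding page',
        }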


    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _GEO_BYPASS attribute may be set to False in order to disable
    geo restriction bypass mechanisms for a particular extractor.
    Though it won't disable explicit geo restriction bypass based on
    country code provided with geo_bypass_country.

    _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
    countries for this extractor. One of these countries will be used by
    geo restriction bypass mechanism right away in order to bypass
    geo restriction, of course, if the mechanism is not disabled.

    _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
    IP blocks in CIDR notation for this extractor. One of these IP blocks
    will be used by geo restriction bypass mechanism similarly
    to _GEO_COUNTRIES.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _x_forwarded_for_ip = None
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self._x_forwarded_for_ip = None
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def _match_id(cls, url):
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        m = cls._VALID_URL_RE.match(url)
        assert m
        return compat_str(m.group('id'))

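    # Note: _VALID_URL is expected to define a named "id" group that
    # _match_id extracts above; a hypothetical example:
    #   _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>[0-9]+)'
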
    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._initialize_geo_bypass({
            'countries': self._GEO_COUNTRIES,
            'ip_blocks': self._GEO_IP_BLOCKS,
        })
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def _initialize_geo_bypass(self, geo_bypass_context):
        """
        Initialize geo restriction bypass mechanism.

        This method is used to initialize the geo bypass mechanism based on
        faking the X-Forwarded-For HTTP header. A random country from the
        provided country list is selected and a random IP belonging to this
        country is generated. This IP will be passed as the X-Forwarded-For
        HTTP header in all subsequent HTTP requests.

        This method is called for the initial geo bypass setup during instance
        initialization with _GEO_COUNTRIES and _GEO_IP_BLOCKS.

        You may also call it manually from an extractor's code if geo bypass
        information is not available beforehand (e.g. obtained during
        extraction) or for some other reason. In this case you should pass
        this information in the geo bypass context passed as the first
        argument. It may contain the following fields:

        countries:  List of geo unrestricted countries (similar
                    to _GEO_COUNTRIES)
        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                    (similar to _GEO_IP_BLOCKS)

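        A minimal sketch of a manual call (country codes hypothetical):

            self._initialize_geo_bypass({
                'countries': ['DE', 'FR'],
            })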
469 """
470 if not self._x_forwarded_for_ip:
471
472 # Geo bypass mechanism is explicitly disabled by user
473 if not self._downloader.params.get('geo_bypass', True):
474 return
475
476 if not geo_bypass_context:
477 geo_bypass_context = {}
478
479 # Backward compatibility: previously _initialize_geo_bypass
480 # expected a list of countries, some 3rd party code may still use
481 # it this way
482 if isinstance(geo_bypass_context, (list, tuple)):
483 geo_bypass_context = {
484 'countries': geo_bypass_context,
485 }
486
487 # The whole point of geo bypass mechanism is to fake IP
488 # as X-Forwarded-For HTTP header based on some IP block or
489 # country code.
490
491 # Path 1: bypassing based on IP block in CIDR notation
492
493 # Explicit IP block specified by user, use it right away
494 # regardless of whether extractor is geo bypassable or not
495 ip_block = self._downloader.params.get('geo_bypass_ip_block', None)
496
497 # Otherwise use random IP block from geo bypass context but only
498 # if extractor is known as geo bypassable
499 if not ip_block:
500 ip_blocks = geo_bypass_context.get('ip_blocks')
501 if self._GEO_BYPASS and ip_blocks:
502 ip_block = random.choice(ip_blocks)
503
504 if ip_block:
505 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
506 if self._downloader.params.get('verbose', False):
507 self._downloader.to_screen(
508 '[debug] Using fake IP %s as X-Forwarded-For.'
509 % self._x_forwarded_for_ip)
510 return
511
512 # Path 2: bypassing based on country code
513
514 # Explicit country code specified by user, use it right away
515 # regardless of whether extractor is geo bypassable or not
516 country = self._downloader.params.get('geo_bypass_country', None)
517
518 # Otherwise use random country code from geo bypass context but
519 # only if extractor is known as geo bypassable
520 if not country:
521 countries = geo_bypass_context.get('countries')
522 if self._GEO_BYPASS and countries:
523 country = random.choice(countries)
524
525 if country:
526 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
527 if self._downloader.params.get('verbose', False):
528 self._downloader.to_screen(
529 '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
530 % (self._x_forwarded_for_ip, country.upper()))
531
    def extract(self, url):
        """Extracts URL information and returns it as a dict."""
        try:
            for _ in range(2):
                try:
                    self.initialize()
                    ie_result = self._real_extract(url)
                    if self._x_forwarded_for_ip:
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except ExtractorError:
            raise
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True)
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e)

    def __maybe_fake_ip_and_retry(self, countries):
        if (not self._downloader.params.get('geo_bypass_country', None)
                and self._GEO_BYPASS
                and self._downloader.params.get('geo_bypass', True)
                and not self._x_forwarded_for_ip
                and countries):
            country_code = random.choice(countries)
            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
            if self._x_forwarded_for_ip:
                self.report_warning(
                    'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
                    % (self._x_forwarded_for_ip, country_code.upper()))
                return True
        return False

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return compat_str(cls.__name__[:-2])

    @property
    def IE_NAME(self):
        return compat_str(type(self).__name__[:-2])

    @staticmethod
    def __can_accept_status_code(err, expected_status):
        assert isinstance(err, compat_urllib_error.HTTPError)
        if expected_status is None:
            return False
        if isinstance(expected_status, compat_integer_types):
            return err.code == expected_status
        elif isinstance(expected_status, (list, tuple)):
            return err.code in expected_status
        elif callable(expected_status):
            return expected_status(err.code) is True
        else:
            assert False

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
        """
        Return the response handle.

        See _download_webpage docstring for arguments specification.
        """
        if not self._downloader._first_webpage_request:
            sleep_interval = float_or_none(self._downloader.params.get('sleep_interval_requests')) or 0
            if sleep_interval > 0:
                self.to_screen('Sleeping %s seconds...' % sleep_interval)
                time.sleep(sleep_interval)
        else:
            self._downloader._first_webpage_request = False

        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen('%s' % (note,))
            else:
                self.to_screen('%s: %s' % (video_id, note))

        # Some sites check X-Forwarded-For HTTP header in order to figure out
        # the origin of the client behind proxy. This allows bypassing geo
        # restriction by faking this header's value to IP that belongs to some
        # geo unrestricted country. We will do so once we encounter any
        # geo restriction error.
        if self._x_forwarded_for_ip:
            if 'X-Forwarded-For' not in headers:
                headers['X-Forwarded-For'] = self._x_forwarded_for_ip

        if isinstance(url_or_request, compat_urllib_request.Request):
            url_or_request = update_Request(
                url_or_request, data=data, headers=headers, query=query)
        else:
            if query:
                url_or_request = update_url_query(url_or_request, query)
            if data is not None or headers:
                url_or_request = sanitized_Request(url_or_request, data, headers)
        exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
        if hasattr(ssl, 'CertificateError'):
            exceptions.append(ssl.CertificateError)
        try:
            return self._downloader.urlopen(url_or_request)
        except tuple(exceptions) as err:
            if isinstance(err, compat_urllib_error.HTTPError):
                if self.__can_accept_status_code(err, expected_status):
                    # Retain reference to error to prevent file object from
                    # being closed before it can be read. Works around the
                    # effects of <https://bugs.python.org/issue15002>
                    # introduced in Python 3.4.1.
                    err.fp._error = err
                    return err.fp

            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self._downloader.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
        """
        Return a tuple (page content as string, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(
            url_or_request, video_id, note, errnote, fatal, data=data,
            headers=headers, query=query, expected_status=expected_status)
        if urlh is False:
            assert not fatal
            return False
        content = self._webpage_read_content(
            urlh, url_or_request, video_id, note, errnote, fatal,
            encoding=encoding)
        return (content, urlh)

    @staticmethod
    def _guess_encoding_from_content(content_type, webpage_bytes):
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'

        return encoding

    def __check_blocked(self, content):
        first_block = content[:512]
        if ('<title>Access to this site is blocked</title>' in content
                and 'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
                and 'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)

    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            basen = '%s_%s' % (video_id, urlh.geturl())
            if len(basen) > 240:
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content

    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=5, encoding=None, data=None,
            headers={}, query={}, expected_status=None):
        """
        Return the data of the page as a string.

        Arguments:
        url_or_request -- plain text URL as a string or
            a compat_urllib_request.Request object
        video_id -- Video/playlist/item identifier (string)

        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractorError to be raised,
            otherwise a warning will be reported and extraction continued
        tries -- number of tries
        timeout -- sleep interval between tries
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows accepting failed HTTP requests (non 2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                  accept
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx)
            which are always accepted.
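
        For example (URL hypothetical), to also accept a 404 response:

            webpage = self._download_webpage(
                'https://example.com/video/123', video_id,
                expected_status=404)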
804 """
805
806 success = False
807 try_count = 0
808 while success is False:
809 try:
810 res = self._download_webpage_handle(
811 url_or_request, video_id, note, errnote, fatal,
812 encoding=encoding, data=data, headers=headers, query=query,
813 expected_status=expected_status)
814 success = True
815 except compat_http_client.IncompleteRead as e:
816 try_count += 1
817 if try_count >= tries:
818 raise e
819 self._sleep(timeout, video_id)
820 if res is False:
821 return res
822 else:
823 content, _ = res
824 return content
825
    def _download_xml_handle(
            self, url_or_request, video_id, note='Downloading XML',
            errnote='Unable to download XML', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (xml as a compat_etree_Element, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        xml_string, urlh = res
        return self._parse_xml(
            xml_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_xml(
            self, url_or_request, video_id,
            note='Downloading XML', errnote='Unable to download XML',
            transform_source=None, fatal=True, encoding=None,
            data=None, headers={}, query={}, expected_status=None):
        """
        Return the xml as a compat_etree_Element.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_xml_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            xml_string = transform_source(xml_string)
        try:
            return compat_etree_fromstring(xml_string.encode('utf-8'))
        except compat_xml_parse_error as ve:
            errmsg = '%s: Failed to parse XML ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

    def _download_json_handle(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (JSON object, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        json_string, urlh = res
        return self._parse_json(
            json_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_json(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return the JSON object as a dict.

        See _download_webpage docstring for arguments specification.
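
        A typical call might look like this (URL and query hypothetical):

            data = self._download_json(
                'https://example.com/api/video/%s' % video_id, video_id,
                query={'format': 'json'})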
906 """
907 res = self._download_json_handle(
908 url_or_request, video_id, note=note, errnote=errnote,
909 transform_source=transform_source, fatal=fatal, encoding=encoding,
910 data=data, headers=headers, query=query,
911 expected_status=expected_status)
912 return res if res is False else res[0]
913
    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            json_string = transform_source(json_string)
        try:
            return json.loads(json_string)
        except ValueError as ve:
            errmsg = '%s: Failed to parse JSON ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

    def report_warning(self, msg, video_id=None):
        idstr = '' if video_id is None else '%s: ' % video_id
        self._downloader.report_warning(
            '[%s] %s%s' % (self.IE_NAME, idstr, msg))

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen('%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen('%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')

    @staticmethod
    def raise_login_required(msg='This video is only available for registered users'):
        raise ExtractorError(
            '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
            expected=True)

    @staticmethod
    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
        raise GeoRestrictedError(msg, countries=countries)

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None, video_title=None):
        """Returns a URL that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        if video_title is not None:
            video_info['title'] = video_title
        return video_info

    def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
        urls = orderedSet(
            self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
            for m in matches)
        return self.playlist_result(
            urls, playlist_id=playlist_id, playlist_title=playlist_title)

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        video_info.update(kwargs)
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        if playlist_description is not None:
            video_info['description'] = playlist_description
        return video_info

    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns, returning the first matching group.
        In case of failure return a default value, or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
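
        For example (pattern and page content hypothetical):

            title = self._search_regex(
                r'<h1 class="title">([^<]+)</h1>', webpage, 'title',
                default=None)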
1002 """
1003 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
1004 mobj = re.search(pattern, string, flags)
1005 else:
1006 for p in pattern:
1007 mobj = re.search(p, string, flags)
1008 if mobj:
1009 break
1010
1011 if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
1012 _name = '\033[0;34m%s\033[0m' % name
1013 else:
1014 _name = name
1015
1016 if mobj:
1017 if group is None:
1018 # return the first matching group
1019 return next(g for g in mobj.groups() if g is not None)
1020 else:
1021 return mobj.group(group)
1022 elif default is not NO_DEFAULT:
1023 return default
1024 elif fatal:
1025 raise RegexNotFoundError('Unable to extract %s' % _name)
1026 else:
1027 self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
1028 return None
1029
1030 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1031 """
1032 Like _search_regex, but strips HTML tags and unescapes entities.
1033 """
1034 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1035 if res:
1036 return clean_html(res).strip()
1037 else:
1038 return res
1039
    def _get_netrc_login_info(self, netrc_machine=None):
        username = None
        password = None
        netrc_machine = netrc_machine or self._NETRC_MACHINE

        if self._downloader.params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(netrc_machine)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError(
                        'No authenticators for %s' % netrc_machine)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(
                    'parsing .netrc: %s' % error_to_compat_str(err))

        return username, password

    def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
        """
        Get the login info as (username, password).
        First look for the manually specified credentials using username_option
        and password_option as keys in the params dictionary. If no such
        credentials are available, look in the netrc file using the
        netrc_machine or _NETRC_MACHINE value.
        If there's no info available, return (None, None).
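
        A matching ~/.netrc entry would look like this (machine name
        hypothetical):

            machine examplesite login myusername password mypassword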
1068 """
1069 if self._downloader is None:
1070 return (None, None)
1071
1072 downloader_params = self._downloader.params
1073
1074 # Attempt to use provided username and password or .netrc data
1075 if downloader_params.get(username_option) is not None:
1076 username = downloader_params[username_option]
1077 password = downloader_params[password_option]
1078 else:
1079 username, password = self._get_netrc_login_info(netrc_machine)
1080
1081 return username, password
1082
1083 def _get_tfa_info(self, note='two-factor verification code'):
1084 """
1085 Get the two-factor authentication info
1086 TODO - asking the user will be required for sms/phone verify
1087 currently just uses the command line option
1088 If there's no info available, return None
1089 """
1090 if self._downloader is None:
1091 return None
1092 downloader_params = self._downloader.params
1093
1094 if downloader_params.get('twofactor') is not None:
1095 return downloader_params['twofactor']
1096
1097 return compat_getpass('Type %s and press [Return]: ' % note)
1098
    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
        property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
                       % {'prop': re.escape(prop)})
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    @staticmethod
    def _meta_regex(prop):
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        if not isinstance(prop, (list, tuple)):
            prop = [prop]
        if name is None:
            name = 'OpenGraph %s' % prop[0]
        og_regexes = []
        for p in prop:
            og_regexes.extend(self._og_regexes(p))
        escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _og_search_url(self, html, **kargs):
        return self._og_search_property('url', html, **kargs)

    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        if not isinstance(name, (list, tuple)):
            name = [name]
        if display_name is None:
            display_name = name[0]
        return self._html_search_regex(
            [self._meta_regex(n) for n in name],
            html, display_name, fatal=fatal, group='content', **kwargs)

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower())

    def _family_friendly_search(self, html):
        # See http://schema.org/VideoObject
        family_friendly = self._html_search_meta(
            'isFamilyFriendly', html, default=None)

        if not family_friendly:
            return None

        RATING_TABLE = {
            '1': 0,
            'true': 0,
            '0': 18,
            'false': 18,
        }
        return RATING_TABLE.get(family_friendly.lower())

    def _twitter_search_player(self, html):
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')

    def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
        json_ld_list = list(re.finditer(JSON_LD_RE, html))
        default = kwargs.get('default', NO_DEFAULT)
        # JSON-LD may be malformed and thus `fatal` should be respected.
        # At the same time `default` may be passed that assumes `fatal=False`
        # for _search_regex. Let's simulate the same behavior here as well.
        fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
        json_ld = []
        for mobj in json_ld_list:
            json_ld_item = self._parse_json(
                mobj.group('json_ld'), video_id, fatal=fatal)
            if not json_ld_item:
                continue
            if isinstance(json_ld_item, dict):
                json_ld.append(json_ld_item)
            elif isinstance(json_ld_item, (list, tuple)):
                json_ld.extend(json_ld_item)
        if json_ld:
            json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
        if json_ld:
            return json_ld
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract JSON-LD')
        else:
            self._downloader.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
            return {}

    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            json_ld = [json_ld]

        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }
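
        # As a sketch, the mapping above targets JSON-LD shaped like
        # (values hypothetical):
        #   {"@type": "InteractionCounter",
        #    "interactionType": {"@type": "WatchAction"},
        #    "userInteractionCount": "1,234"}
        # which extract_interaction_statistic() below turns into
        # info['view_count'] = 1234.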

        def extract_interaction_type(e):
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            interaction_statistic = e.get('interactionStatistic')
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not isinstance(is_e, dict):
                    continue
                if is_e.get('@type') != 'InteractionCounter':
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For the interaction count some sites provide a string instead
                # of an integer (as per the spec) with non-digit characters
                # (e.g. ","), so extract the count with the more relaxed
                # str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_video_object(e):
            assert e['@type'] == 'VideoObject'
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                'uploader': str_or_none(e.get('author')),
                'filesize': float_or_none(e.get('contentSize')),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })
            extract_interaction_statistic(e)

        for e in json_ld:
            if '@context' in e:
                item_type = e.get('@type')
                if expected_type is not None and expected_type != item_type:
                    continue
                if item_type in ('TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type == 'Movie':
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif item_type in ('Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody')),
                    })
                elif item_type == 'VideoObject':
                    extract_video_object(e)
                    if expected_type is None:
                        continue
                    else:
                        break
                video = e.get('video')
                if isinstance(video, dict) and video.get('@type') == 'VideoObject':
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break
        return dict((k, v) for k, v in info.items() if v is not None)

    @staticmethod
    def _hidden_inputs(html):
        html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
        hidden_inputs = {}
        for input in re.findall(r'(?i)(<input[^>]+>)', html):
            attrs = extract_attributes(input)
            if not input:
                continue
            if attrs.get('type') not in ('hidden', 'submit'):
                continue
            name = attrs.get('name') or attrs.get('id')
            value = attrs.get('value')
            if name and value is not None:
                hidden_inputs[name] = value
        return hidden_inputs

    def _form_hidden_inputs(self, form_id, html):
        form = self._search_regex(
            r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
            html, '%s form' % form_id, group='form')
        return self._hidden_inputs(form)

    class FormatSort:
        regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<seperator>[~:])(?P<limit>.*?))?)? *$'
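
        # The regex above parses a single sort token; hypothetical examples:
        #   'res'       -> field='res'
        #   '+br'       -> field='br', reverse='+'
        #   'res:1080'  -> field='res', seperator=':', limit='1080'
        #   'fps~30'    -> field='fps', seperator='~' (closest match), limit='30'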

        default = ('hidden', 'hasvid', 'ie_pref', 'lang', 'quality',
                   'res', 'fps', 'codec:vp9.2', 'size', 'br', 'asr',
                   'proto', 'ext', 'hasaud', 'source', 'format_id')  # These must not be aliases

        settings = {
            'vcodec': {'type': 'ordered', 'regex': True,
                       'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
            'acodec': {'type': 'ordered', 'regex': True,
                       'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']},
            'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                      'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', 'm3u8', '.*dash', '', 'mms|rtsp', 'none', 'f4']},
            'vext': {'type': 'ordered', 'field': 'video_ext',
                     'order': ('mp4', 'webm', 'flv', '', 'none'),
                     'order_free': ('webm', 'mp4', 'flv', '', 'none')},
            'aext': {'type': 'ordered', 'field': 'audio_ext',
                     'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
                     'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
            'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
            'ie_pref': {'priority': True, 'type': 'extractor'},
            'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'lang': {'priority': True, 'convert': 'ignore', 'type': 'extractor', 'field': 'language_preference'},
            'quality': {'convert': 'float_none', 'type': 'extractor'},
            'filesize': {'convert': 'bytes'},
            'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
            'id': {'convert': 'string', 'field': 'format_id'},
            'height': {'convert': 'float_none'},
            'width': {'convert': 'float_none'},
            'fps': {'convert': 'float_none'},
            'tbr': {'convert': 'float_none'},
            'vbr': {'convert': 'float_none'},
            'abr': {'convert': 'float_none'},
            'asr': {'convert': 'float_none'},
            'source': {'convert': 'ignore', 'type': 'extractor', 'field': 'source_preference'},

            'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
            'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
            'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
            'ext': {'type': 'combined', 'field': ('vext', 'aext')},
            'res': {'type': 'multiple', 'field': ('height', 'width'), 'function': min},

            # Most of these exist only for compatibility reasons
            'dimension': {'type': 'alias', 'field': 'res'},
            'resolution': {'type': 'alias', 'field': 'res'},
            'extension': {'type': 'alias', 'field': 'ext'},
            'bitrate': {'type': 'alias', 'field': 'br'},
            'total_bitrate': {'type': 'alias', 'field': 'tbr'},
            'video_bitrate': {'type': 'alias', 'field': 'vbr'},
            'audio_bitrate': {'type': 'alias', 'field': 'abr'},
            'framerate': {'type': 'alias', 'field': 'fps'},
            'language_preference': {'type': 'alias', 'field': 'lang'},  # not named as 'language' because such a field exists
            'protocol': {'type': 'alias', 'field': 'proto'},
            'source_preference': {'type': 'alias', 'field': 'source'},
            'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
            'filesize_estimate': {'type': 'alias', 'field': 'size'},
            'samplerate': {'type': 'alias', 'field': 'asr'},
            'video_ext': {'type': 'alias', 'field': 'vext'},
            'audio_ext': {'type': 'alias', 'field': 'aext'},
            'video_codec': {'type': 'alias', 'field': 'vcodec'},
            'audio_codec': {'type': 'alias', 'field': 'acodec'},
            'video': {'type': 'alias', 'field': 'hasvid'},
            'has_video': {'type': 'alias', 'field': 'hasvid'},
            'audio': {'type': 'alias', 'field': 'hasaud'},
            'has_audio': {'type': 'alias', 'field': 'hasaud'},
            'extractor': {'type': 'alias', 'field': 'ie_pref'},
            'preference': {'type': 'alias', 'field': 'ie_pref'},
            'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
            'format_id': {'type': 'alias', 'field': 'id'},
        }
1452
1453 _order = []
1454
1455 def _get_field_setting(self, field, key):
1456 if field not in self.settings:
1457 self.settings[field] = {}
1458 propObj = self.settings[field]
1459 if key not in propObj:
1460 type = propObj.get('type')
1461 if key == 'field':
1462 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1463 elif key == 'convert':
1464 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1465 else:
1466 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,), 'function': max}.get(key, None)
1467 propObj[key] = default
1468 return propObj[key]
1469
1470 def _resolve_field_value(self, field, value, convertNone=False):
1471 if value is None:
1472 if not convertNone:
1473 return None
1474 else:
1475 value = value.lower()
1476 conversion = self._get_field_setting(field, 'convert')
1477 if conversion == 'ignore':
1478 return None
1479 if conversion == 'string':
1480 return value
1481 elif conversion == 'float_none':
1482 return float_or_none(value)
1483 elif conversion == 'bytes':
1484 return FileDownloader.parse_bytes(value)
1485 elif conversion == 'order':
1486 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1487 use_regex = self._get_field_setting(field, 'regex')
1488 list_length = len(order_list)
1489 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1490 if use_regex and value is not None:
1491 for i, regex in enumerate(order_list):
1492 if regex and re.match(regex, value):
1493 return list_length - i
1494 return list_length - empty_pos # not in list
1495 else: # not regex or value is None
1496 return list_length - (order_list.index(value) if value in order_list else empty_pos)
1497 else:
1498 if value.isnumeric():
1499 return float(value)
1500 else:
1501 self.settings[field]['convert'] = 'string'
1502 return value
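# NOTE (editor's illustration, assumed values): for the 'proto' field,
# whose order list starts with ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', ...],
# a value of 'https' matches the regex at index 0 and resolves to
# list_length - 0 (the best rank), while an unrecognized protocol falls
# back to the rank of the '' placeholder.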
1503
1504 def evaluate_params(self, params, sort_extractor):
1505 self._use_free_order = params.get('prefer_free_formats', False)
1506 self._sort_user = params.get('format_sort', [])
1507 self._sort_extractor = sort_extractor
1508
1509 def add_item(field, reverse, closest, limit_text):
1510 field = field.lower()
1511 if field in self._order:
1512 return
1513 self._order.append(field)
1514 limit = self._resolve_field_value(field, limit_text)
1515 data = {
1516 'reverse': reverse,
1517 'closest': False if limit is None else closest,
1518 'limit_text': limit_text,
1519 'limit': limit}
1520 if field in self.settings:
1521 self.settings[field].update(data)
1522 else:
1523 self.settings[field] = data
1524
1525 sort_list = (
1526 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1527 + (tuple() if params.get('format_sort_force', False)
1528 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1529 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
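# NOTE (editor): i.e. 'forced' fields always come first; 'priority'
# fields come next unless format_sort_force is set; then the user sort,
# the extractor sort and finally the remaining defaults.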
1530
1531 for item in sort_list:
1532 match = re.match(self.regex, item)
1533 if match is None:
1534 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1535 field = match.group('field')
1536 if field is None:
1537 continue
1538 if self._get_field_setting(field, 'type') == 'alias':
1539 field = self._get_field_setting(field, 'field')
1540 reverse = match.group('reverse') is not None
1541 closest = match.group('seperator') == '~'
1542 limit_text = match.group('limit')
1543
1544 has_limit = limit_text is not None
1545 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1546 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1547
1548 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1549 limits = limit_text.split(":") if has_multiple_limits else (limit_text,) if has_limit else tuple()
1550 limit_count = len(limits)
1551 for (i, f) in enumerate(fields):
1552 add_item(f, reverse, closest,
1553 limits[i] if i < limit_count
1554 else limits[0] if has_limit and not has_multiple_limits
1555 else None)
1556
1557 def print_verbose_info(self, to_screen):
1558 to_screen('[debug] Sort order given by user: %s' % ','.join(self._sort_user))
1559 if self._sort_extractor:
1560 to_screen('[debug] Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1561 to_screen('[debug] Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1562 '+' if self._get_field_setting(field, 'reverse') else '', field,
1563 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1564 self._get_field_setting(field, 'limit_text'),
1565 self._get_field_setting(field, 'limit'))
1566 if self._get_field_setting(field, 'limit_text') is not None else '')
1567 for field in self._order if self._get_field_setting(field, 'visible')]))
1568
1569 def _calculate_field_preference_from_value(self, format, field, type, value):
1570 reverse = self._get_field_setting(field, 'reverse')
1571 closest = self._get_field_setting(field, 'closest')
1572 limit = self._get_field_setting(field, 'limit')
1573
1574 if type == 'extractor':
1575 maximum = self._get_field_setting(field, 'max')
1576 if value is None or (maximum is not None and value >= maximum):
1577 value = -1
1578 elif type == 'boolean':
1579 in_list = self._get_field_setting(field, 'in_list')
1580 not_in_list = self._get_field_setting(field, 'not_in_list')
1581 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1582 elif type == 'ordered':
1583 value = self._resolve_field_value(field, value, True)
1584
1585 # try to convert to number
1586 val_num = float_or_none(value)
1587 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1588 if is_num:
1589 value = val_num
1590
1591 return ((-10, 0) if value is None
1592 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
1593 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1594 else (0, value, 0) if not reverse and (limit is None or value <= limit)
1595 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1596 else (-1, value, 0))
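# NOTE (editor's illustration, assumed values): these tuples are compared
# lexicographically, so with limit=720, closest=True and reverse=False a
# height of 700 yields (0, -20, 20) and a height of 800 yields
# (0, -80, -80); 700 ranks higher because it is nearer to the limit.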
1597
1598 def _calculate_field_preference(self, format, field):
1599 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
1600 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1601 if type == 'multiple':
1602 type = 'field' # Only 'field' is allowed in multiple for now
1603 actual_fields = self._get_field_setting(field, 'field')
1604
1605 def wrapped_function(values):
1606 values = tuple(filter(lambda x: x is not None, values))
1607 return (self._get_field_setting(field, 'function')(*values) if len(values) > 1
1608 else values[0] if values
1609 else None)
1610
1611 value = wrapped_function((get_value(f) for f in actual_fields))
1612 else:
1613 value = get_value(field)
1614 return self._calculate_field_preference_from_value(format, field, type, value)
1615
1616 def calculate_preference(self, format):
1617 # Determine missing protocol
1618 if not format.get('protocol'):
1619 format['protocol'] = determine_protocol(format)
1620
1621 # Determine missing ext
1622 if not format.get('ext') and 'url' in format:
1623 format['ext'] = determine_ext(format['url'])
1624 if format.get('vcodec') == 'none':
1625 format['audio_ext'] = format['ext']
1626 format['video_ext'] = 'none'
1627 else:
1628 format['video_ext'] = format['ext']
1629 format['audio_ext'] = 'none'
1630 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
1631 # format['preference'] = -1000
1632
1633 # Determine missing bitrates
1634 if format.get('tbr') is None:
1635 if format.get('vbr') is not None and format.get('abr') is not None:
1636 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1637 else:
1638 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1639 format['vbr'] = format.get('tbr') - format.get('abr', 0)
1640 if format.get('acodec') != 'none' and format.get('abr') is None:
1641 format['abr'] = format.get('tbr') - format.get('vbr', 0)
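# NOTE (editor's illustration, assumed values): e.g. a format with
# tbr=1128 and abr=128 but no vbr is assigned vbr = 1128 - 128 = 1000.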
1642
1643 return tuple(self._calculate_field_preference(format, field) for field in self._order)
1644
1645 def _sort_formats(self, formats, field_preference=[]):
1646 if not formats:
1647 raise ExtractorError('No video formats found')
1648 format_sort = self.FormatSort() # params and to_screen are taken from the downloader
1649 format_sort.evaluate_params(self._downloader.params, field_preference)
1650 if self._downloader.params.get('verbose', False):
1651 format_sort.print_verbose_info(self._downloader.to_screen)
1652 formats.sort(key=lambda f: format_sort.calculate_preference(f))
1653
1654 def _check_formats(self, formats, video_id):
1655 if formats:
1656 formats[:] = filter(
1657 lambda f: self._is_valid_url(
1658 f['url'], video_id,
1659 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1660 formats)
1661
1662 @staticmethod
1663 def _remove_duplicate_formats(formats):
1664 format_urls = set()
1665 unique_formats = []
1666 for f in formats:
1667 if f['url'] not in format_urls:
1668 format_urls.add(f['url'])
1669 unique_formats.append(f)
1670 formats[:] = unique_formats
1671
1672 def _is_valid_url(self, url, video_id, item='video', headers={}):
1673 url = self._proto_relative_url(url, scheme='http:')
1674 # For now assume non HTTP(S) URLs always valid
1675 if not (url.startswith('http://') or url.startswith('https://')):
1676 return True
1677 try:
1678 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1679 return True
1680 except ExtractorError as e:
1681 self.to_screen(
1682 '%s: %s URL is invalid, skipping: %s'
1683 % (video_id, item, error_to_compat_str(e.cause)))
1684 return False
1685
1686 def http_scheme(self):
1687 """ Either "http:" or "https:", depending on the user's preferences """
1688 return (
1689 'http:'
1690 if self._downloader.params.get('prefer_insecure', False)
1691 else 'https:')
1692
1693 def _proto_relative_url(self, url, scheme=None):
1694 if url is None:
1695 return url
1696 if url.startswith('//'):
1697 if scheme is None:
1698 scheme = self.http_scheme()
1699 return scheme + url
1700 else:
1701 return url
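# NOTE (editor's illustration, hypothetical URL): '//cdn.example.com/v.mp4'
# becomes 'https://cdn.example.com/v.mp4' unless prefer_insecure is set.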
1702
1703 def _sleep(self, timeout, video_id, msg_template=None):
1704 if msg_template is None:
1705 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1706 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1707 self.to_screen(msg)
1708 time.sleep(timeout)
1709
1710 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1711 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1712 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1713 manifest = self._download_xml(
1714 manifest_url, video_id, 'Downloading f4m manifest',
1715 'Unable to download f4m manifest',
1716 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1717 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1718 transform_source=transform_source,
1719 fatal=fatal, data=data, headers=headers, query=query)
1720
1721 if manifest is False:
1722 return []
1723
1724 return self._parse_f4m_formats(
1725 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1726 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1727
1728 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1729 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1730 fatal=True, m3u8_id=None):
1731 if not isinstance(manifest, compat_etree_Element) and not fatal:
1732 return []
1733
1734 # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1735 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1736 if akamai_pv is not None and ';' in akamai_pv.text:
1737 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1738 if playerVerificationChallenge.strip() != '':
1739 return []
1740
1741 formats = []
1742 manifest_version = '1.0'
1743 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1744 if not media_nodes:
1745 manifest_version = '2.0'
1746 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1747 # Remove unsupported DRM-protected media renditions from the final
1748 # formats (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1749 media_nodes = remove_encrypted_media(media_nodes)
1750 if not media_nodes:
1751 return formats
1752
1753 manifest_base_url = get_base_url(manifest)
1754
1755 bootstrap_info = xpath_element(
1756 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1757 'bootstrap info', default=None)
1758
1759 vcodec = None
1760 mime_type = xpath_text(
1761 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1762 'mime type', default=None)
1763 if mime_type and mime_type.startswith('audio/'):
1764 vcodec = 'none'
1765
1766 for i, media_el in enumerate(media_nodes):
1767 tbr = int_or_none(media_el.attrib.get('bitrate'))
1768 width = int_or_none(media_el.attrib.get('width'))
1769 height = int_or_none(media_el.attrib.get('height'))
1770 format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1771 # If <bootstrapInfo> is present, the specified f4m is a
1772 # stream-level manifest, and only set-level manifests may refer to
1773 # external resources. See section 11.4 and section 4 of F4M spec
1774 if bootstrap_info is None:
1775 media_url = None
1776 # @href is introduced in 2.0, see section 11.6 of F4M spec
1777 if manifest_version == '2.0':
1778 media_url = media_el.attrib.get('href')
1779 if media_url is None:
1780 media_url = media_el.attrib.get('url')
1781 if not media_url:
1782 continue
1783 manifest_url = (
1784 media_url if media_url.startswith('http://') or media_url.startswith('https://')
1785 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1786 # If media_url is itself a f4m manifest do the recursive extraction
1787 # since bitrates in parent manifest (this one) and media_url manifest
1788 # may differ leading to inability to resolve the format by requested
1789 # bitrate in f4m downloader
1790 ext = determine_ext(manifest_url)
1791 if ext == 'f4m':
1792 f4m_formats = self._extract_f4m_formats(
1793 manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1794 transform_source=transform_source, fatal=fatal)
1795 # Sometimes a stream-level manifest contains a single media entry that
1796 # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1797 # At the same time the parent's media entry in the set-level manifest may
1798 # contain it. We will copy it from the parent in such cases.
1799 if len(f4m_formats) == 1:
1800 f = f4m_formats[0]
1801 f.update({
1802 'tbr': f.get('tbr') or tbr,
1803 'width': f.get('width') or width,
1804 'height': f.get('height') or height,
1805 'format_id': f.get('format_id') if not tbr else format_id,
1806 'vcodec': vcodec,
1807 })
1808 formats.extend(f4m_formats)
1809 continue
1810 elif ext == 'm3u8':
1811 formats.extend(self._extract_m3u8_formats(
1812 manifest_url, video_id, 'mp4', preference=preference,
1813 quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1814 continue
1815 formats.append({
1816 'format_id': format_id,
1817 'url': manifest_url,
1818 'manifest_url': manifest_url,
1819 'ext': 'flv' if bootstrap_info is not None else None,
1820 'protocol': 'f4m',
1821 'tbr': tbr,
1822 'width': width,
1823 'height': height,
1824 'vcodec': vcodec,
1825 'preference': preference,
1826 'quality': quality,
1827 })
1828 return formats
1829
1830 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1831 return {
1832 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1833 'url': m3u8_url,
1834 'ext': ext,
1835 'protocol': 'm3u8',
1836 'preference': preference - 100 if preference else -100,
1837 'quality': quality,
1838 'resolution': 'multiple',
1839 'format_note': 'Quality selection URL',
1840 }
1841
1842 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1843 entry_protocol='m3u8', preference=None, quality=None,
1844 m3u8_id=None, live=False, note=None, errnote=None,
1845 fatal=True, data=None, headers={}, query={}):
1846 res = self._download_webpage_handle(
1847 m3u8_url, video_id,
1848 note=note or 'Downloading m3u8 information',
1849 errnote=errnote or 'Failed to download m3u8 information',
1850 fatal=fatal, data=data, headers=headers, query=query)
1851
1852 if res is False:
1853 return []
1854
1855 m3u8_doc, urlh = res
1856 m3u8_url = urlh.geturl()
1857
1858 return self._parse_m3u8_formats(
1859 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1860 preference=preference, quality=quality, m3u8_id=m3u8_id,
1861 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
1862 headers=headers, query=query, video_id=video_id)
1863
1864 def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
1865 entry_protocol='m3u8', preference=None, quality=None,
1866 m3u8_id=None, live=False, note=None, errnote=None,
1867 fatal=True, data=None, headers={}, query={}, video_id=None):
1868 if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
1869 return []
1870
1871 if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc): # Apple FairPlay
1872 return []
1873
1874 formats = []
1875
1876 format_url = lambda u: (
1877 u
1878 if re.match(r'^https?://', u)
1879 else compat_urlparse.urljoin(m3u8_url, u))
1880
1881 split_discontinuity = self._downloader.params.get('hls_split_discontinuity', False)
1882
1883 # References:
1884 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
1885 # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
1886 # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
1887
1888 # We should try extracting formats only from master playlists [1, 4.3.4],
1889 # i.e. playlists that describe available qualities. On the other hand
1890 # media playlists [1, 4.3.3] should be returned as is since they contain
1891 # just the media without quality renditions.
1892 # Fortunately, a master playlist can be easily distinguished from a media
1893 # playlist based on the availability of particular tags. As per [1, 4.3.3, 4.3.4]
1894 # master playlist tags MUST NOT appear in a media playlist and vice versa.
1895 # As per [1, 4.3.3.1] the #EXT-X-TARGETDURATION tag is REQUIRED for every
1896 # media playlist and MUST NOT appear in a master playlist, thus we can
1897 # clearly detect a media playlist with this criterion.
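# NOTE (editor's illustration, hypothetical playlists): a master playlist
# contains entries such as
#   #EXT-X-STREAM-INF:BANDWIDTH=800000,RESOLUTION=1280x720
#   https://example.com/720p.m3u8
# whereas a media playlist carries the REQUIRED target duration and the
# actual segments:
#   #EXT-X-TARGETDURATION:10
#   #EXTINF:9.9,
#   https://example.com/segment0.ts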
1898
1899 def _extract_m3u8_playlist_formats(format_url=None, m3u8_doc=None):
1900 if not m3u8_doc:
1901 if not format_url:
1902 return []
1903 res = self._download_webpage_handle(
1904 format_url, video_id,
1905 note=False,
1906 errnote=errnote or 'Failed to download m3u8 playlist information',
1907 fatal=fatal, data=data, headers=headers, query=query)
1908
1909 if res is False:
1910 return []
1911
1912 m3u8_doc, urlh = res
1913 format_url = urlh.geturl()
1914
1915 playlist_formats = []
1916 i = (
1917 0
1918 if split_discontinuity
1919 else None)
1920 format_info = {
1921 'index': i,
1922 'key_data': None,
1923 'files': [],
1924 }
1925 for line in m3u8_doc.splitlines():
1926 if not line.startswith('#'):
1927 format_info['files'].append(line)
1928 elif split_discontinuity and line.startswith('#EXT-X-DISCONTINUITY'):
1929 i += 1
1930 playlist_formats.append(format_info)
1931 format_info = {
1932 'index': i,
1933 'url': format_url,
1934 'files': [],
1935 }
1936 playlist_formats.append(format_info)
1937 return playlist_formats
1938
1939 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
1940
1941 playlist_formats = _extract_m3u8_playlist_formats(m3u8_doc=m3u8_doc)
1942
1943 for format in playlist_formats:
1944 format_id = []
1945 if m3u8_id:
1946 format_id.append(m3u8_id)
1947 format_index = format.get('index')
1948 if format_index:
1949 format_id.append(str(format_index))
1950 f = {
1951 'format_id': '-'.join(format_id),
1952 'format_index': format_index,
1953 'url': m3u8_url,
1954 'ext': ext,
1955 'protocol': entry_protocol,
1956 'preference': preference,
1957 'quality': quality,
1958 }
1959 formats.append(f)
1960
1961 return formats
1962
1963 groups = {}
1964 last_stream_inf = {}
1965
1966 def extract_media(x_media_line):
1967 media = parse_m3u8_attributes(x_media_line)
1968 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
1969 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
1970 if not (media_type and group_id and name):
1971 return
1972 groups.setdefault(group_id, []).append(media)
1973 if media_type not in ('VIDEO', 'AUDIO'):
1974 return
1975 media_url = media.get('URI')
1976 if media_url:
1977 manifest_url = format_url(media_url)
1978 format_id = []
1979 playlist_formats = _extract_m3u8_playlist_formats(manifest_url)
1980
1981 for format in playlist_formats:
1982 format_index = format.get('index')
1983 for v in (m3u8_id, group_id, name):
1984 if v:
1985 format_id.append(v)
1986 if format_index:
1987 format_id.append(str(format_index))
1988 f = {
1989 'format_id': '-'.join(format_id),
1990 'format_index': format_index,
1991 'url': manifest_url,
1992 'manifest_url': m3u8_url,
1993 'language': media.get('LANGUAGE'),
1994 'ext': ext,
1995 'protocol': entry_protocol,
1996 'preference': preference,
1997 'quality': quality,
1998 }
1999 if media_type == 'AUDIO':
2000 f['vcodec'] = 'none'
2001 formats.append(f)
2002
2003 def build_stream_name():
2004 # Although the specification does not mention the NAME attribute for
2005 # the EXT-X-STREAM-INF tag, it may still sometimes be present (see [1]
2006 # or the vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2007 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2008 stream_name = last_stream_inf.get('NAME')
2009 if stream_name:
2010 return stream_name
2011 # If there is no NAME in EXT-X-STREAM-INF, it will be obtained
2012 # from the corresponding rendition group
2013 stream_group_id = last_stream_inf.get('VIDEO')
2014 if not stream_group_id:
2015 return
2016 stream_group = groups.get(stream_group_id)
2017 if not stream_group:
2018 return stream_group_id
2019 rendition = stream_group[0]
2020 return rendition.get('NAME') or stream_group_id
2021
2022 # Parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have a
2023 # chance to detect video-only formats when EXT-X-STREAM-INF tags
2024 # precede EXT-X-MEDIA tags in the HLS manifest, as in [3].
2025 for line in m3u8_doc.splitlines():
2026 if line.startswith('#EXT-X-MEDIA:'):
2027 extract_media(line)
2028
2029 for line in m3u8_doc.splitlines():
2030 if line.startswith('#EXT-X-STREAM-INF:'):
2031 last_stream_inf = parse_m3u8_attributes(line)
2032 elif line.startswith('#') or not line.strip():
2033 continue
2034 else:
2035 tbr = float_or_none(
2036 last_stream_inf.get('AVERAGE-BANDWIDTH')
2037 or last_stream_inf.get('BANDWIDTH'), scale=1000)
2038 manifest_url = format_url(line.strip())
2039
2040 playlist_formats = _extract_m3u8_playlist_formats(manifest_url)
2041
2042 for format in playlist_formats:
2043 format_id = []
2044 if m3u8_id:
2045 format_id.append(m3u8_id)
2046 format_index = format.get('index')
2047 stream_name = build_stream_name()
2048 # Bandwidth of live streams may differ over time thus making
2049 # format_id unpredictable. So it's better to keep provided
2050 # format_id intact.
2051 if not live:
2052 format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
2053 if format_index:
2054 format_id.append(str(format_index))
2055 f = {
2056 'format_id': '-'.join(format_id),
2057 'format_index': format_index,
2058 'url': manifest_url,
2059 'manifest_url': m3u8_url,
2060 'tbr': tbr,
2061 'ext': ext,
2062 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2063 'protocol': entry_protocol,
2064 'preference': preference,
2065 'quality': quality,
2066 }
2067 resolution = last_stream_inf.get('RESOLUTION')
2068 if resolution:
2069 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2070 if mobj:
2071 f['width'] = int(mobj.group('width'))
2072 f['height'] = int(mobj.group('height'))
2073 # Unified Streaming Platform
2074 mobj = re.search(
2075 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2076 if mobj:
2077 abr, vbr = mobj.groups()
2078 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2079 f.update({
2080 'vbr': vbr,
2081 'abr': abr,
2082 })
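# NOTE (editor's illustration, hypothetical URL): a USP URL such as
# '.../video.ism/video.m3u8?audio=128000-video=800000' yields
# abr=128 and vbr=800 after scaling by 1000.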
2083 codecs = parse_codecs(last_stream_inf.get('CODECS'))
2084 f.update(codecs)
2085 audio_group_id = last_stream_inf.get('AUDIO')
2086 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2087 # references a rendition group MUST have a CODECS attribute.
2088 # However, this is not always respected, for example, [2]
2089 # contains EXT-X-STREAM-INF tag which references AUDIO
2090 # rendition group but does not have CODECS and despite
2091 # referencing an audio group it represents a complete
2092 # (with audio and video) format. So, for such cases we will
2093 # ignore references to rendition groups and treat them
2094 # as complete formats.
2095 if audio_group_id and codecs and f.get('vcodec') != 'none':
2096 audio_group = groups.get(audio_group_id)
2097 if audio_group and audio_group[0].get('URI'):
2098 # TODO: update acodec for audio only formats with
2099 # the same GROUP-ID
2100 f['acodec'] = 'none'
2101 formats.append(f)
2102
2103 # for DailyMotion
2104 progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2105 if progressive_uri:
2106 http_f = f.copy()
2107 del http_f['manifest_url']
2108 http_f.update({
2109 'format_id': f['format_id'].replace('hls-', 'http-'),
2110 'protocol': 'http',
2111 'url': progressive_uri,
2112 })
2113 formats.append(http_f)
2114
2115 last_stream_inf = {}
2116 return formats
2117
2118 @staticmethod
2119 def _xpath_ns(path, namespace=None):
2120 if not namespace:
2121 return path
2122 out = []
2123 for c in path.split('/'):
2124 if not c or c == '.':
2125 out.append(c)
2126 else:
2127 out.append('{%s}%s' % (namespace, c))
2128 return '/'.join(out)
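# NOTE (editor's illustration, assumed namespace):
# _xpath_ns('./head/meta', 'http://www.w3.org/ns/SMIL') returns
# './{http://www.w3.org/ns/SMIL}head/{http://www.w3.org/ns/SMIL}meta'.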
2129
2130 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2131 smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2132
2133 if smil is False:
2134 assert not fatal
2135 return []
2136
2137 namespace = self._parse_smil_namespace(smil)
2138
2139 return self._parse_smil_formats(
2140 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2141
2142 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2143 smil = self._download_smil(smil_url, video_id, fatal=fatal)
2144 if smil is False:
2145 return {}
2146 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2147
2148 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2149 return self._download_xml(
2150 smil_url, video_id, 'Downloading SMIL file',
2151 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2152
2153 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2154 namespace = self._parse_smil_namespace(smil)
2155
2156 formats = self._parse_smil_formats(
2157 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2158 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2159
2160 video_id = os.path.splitext(url_basename(smil_url))[0]
2161 title = None
2162 description = None
2163 upload_date = None
2164 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2165 name = meta.attrib.get('name')
2166 content = meta.attrib.get('content')
2167 if not name or not content:
2168 continue
2169 if not title and name == 'title':
2170 title = content
2171 elif not description and name in ('description', 'abstract'):
2172 description = content
2173 elif not upload_date and name == 'date':
2174 upload_date = unified_strdate(content)
2175
2176 thumbnails = [{
2177 'id': image.get('type'),
2178 'url': image.get('src'),
2179 'width': int_or_none(image.get('width')),
2180 'height': int_or_none(image.get('height')),
2181 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2182
2183 return {
2184 'id': video_id,
2185 'title': title or video_id,
2186 'description': description,
2187 'upload_date': upload_date,
2188 'thumbnails': thumbnails,
2189 'formats': formats,
2190 'subtitles': subtitles,
2191 }
2192
2193 def _parse_smil_namespace(self, smil):
2194 return self._search_regex(
2195 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2196
2197 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2198 base = smil_url
2199 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2200 b = meta.get('base') or meta.get('httpBase')
2201 if b:
2202 base = b
2203 break
2204
2205 formats = []
2206 rtmp_count = 0
2207 http_count = 0
2208 m3u8_count = 0
2209
2210 srcs = []
2211 media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2212 for medium in media:
2213 src = medium.get('src')
2214 if not src or src in srcs:
2215 continue
2216 srcs.append(src)
2217
2218 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2219 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2220 width = int_or_none(medium.get('width'))
2221 height = int_or_none(medium.get('height'))
2222 proto = medium.get('proto')
2223 ext = medium.get('ext')
2224 src_ext = determine_ext(src)
2225 streamer = medium.get('streamer') or base
2226
2227 if proto == 'rtmp' or streamer.startswith('rtmp'):
2228 rtmp_count += 1
2229 formats.append({
2230 'url': streamer,
2231 'play_path': src,
2232 'ext': 'flv',
2233 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2234 'tbr': bitrate,
2235 'filesize': filesize,
2236 'width': width,
2237 'height': height,
2238 })
2239 if transform_rtmp_url:
2240 streamer, src = transform_rtmp_url(streamer, src)
2241 formats[-1].update({
2242 'url': streamer,
2243 'play_path': src,
2244 })
2245 continue
2246
2247 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2248 src_url = src_url.strip()
2249
2250 if proto == 'm3u8' or src_ext == 'm3u8':
2251 m3u8_formats = self._extract_m3u8_formats(
2252 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2253 if len(m3u8_formats) == 1:
2254 m3u8_count += 1
2255 m3u8_formats[0].update({
2256 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2257 'tbr': bitrate,
2258 'width': width,
2259 'height': height,
2260 })
2261 formats.extend(m3u8_formats)
2262 elif src_ext == 'f4m':
2263 f4m_url = src_url
2264 if not f4m_params:
2265 f4m_params = {
2266 'hdcore': '3.2.0',
2267 'plugin': 'flowplayer-3.2.0.1',
2268 }
2269 f4m_url += '&' if '?' in f4m_url else '?'
2270 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2271 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2272 elif src_ext == 'mpd':
2273 formats.extend(self._extract_mpd_formats(
2274 src_url, video_id, mpd_id='dash', fatal=False))
2275 elif re.search(r'\.ism/[Mm]anifest', src_url):
2276 formats.extend(self._extract_ism_formats(
2277 src_url, video_id, ism_id='mss', fatal=False))
2278 elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2279 http_count += 1
2280 formats.append({
2281 'url': src_url,
2282 'ext': ext or src_ext or 'flv',
2283 'format_id': 'http-%d' % (bitrate or http_count),
2284 'tbr': bitrate,
2285 'filesize': filesize,
2286 'width': width,
2287 'height': height,
2288 })
2289
2290 return formats
2291
2292 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2293 urls = []
2294 subtitles = {}
2295 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2296 src = textstream.get('src')
2297 if not src or src in urls:
2298 continue
2299 urls.append(src)
2300 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2301 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2302 subtitles.setdefault(lang, []).append({
2303 'url': src,
2304 'ext': ext,
2305 })
2306 return subtitles
2307
2308 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2309 xspf = self._download_xml(
2310 xspf_url, playlist_id, 'Downloading xspf playlist',
2311 'Unable to download xspf manifest', fatal=fatal)
2312 if xspf is False:
2313 return []
2314 return self._parse_xspf(
2315 xspf, playlist_id, xspf_url=xspf_url,
2316 xspf_base_url=base_url(xspf_url))
2317
2318 def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2319 NS_MAP = {
2320 'xspf': 'http://xspf.org/ns/0/',
2321 's1': 'http://static.streamone.nl/player/ns/0',
2322 }
2323
2324 entries = []
2325 for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2326 title = xpath_text(
2327 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2328 description = xpath_text(
2329 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2330 thumbnail = xpath_text(
2331 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2332 duration = float_or_none(
2333 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2334
2335 formats = []
2336 for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2337 format_url = urljoin(xspf_base_url, location.text)
2338 if not format_url:
2339 continue
2340 formats.append({
2341 'url': format_url,
2342 'manifest_url': xspf_url,
2343 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2344 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2345 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2346 })
2347 self._sort_formats(formats)
2348
2349 entries.append({
2350 'id': playlist_id,
2351 'title': title,
2352 'description': description,
2353 'thumbnail': thumbnail,
2354 'duration': duration,
2355 'formats': formats,
2356 })
2357 return entries
2358
2359 def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2360 res = self._download_xml_handle(
2361 mpd_url, video_id,
2362 note=note or 'Downloading MPD manifest',
2363 errnote=errnote or 'Failed to download MPD manifest',
2364 fatal=fatal, data=data, headers=headers, query=query)
2365 if res is False:
2366 return []
2367 mpd_doc, urlh = res
2368 if mpd_doc is None:
2369 return []
2370 mpd_base_url = base_url(urlh.geturl())
2371
2372 return self._parse_mpd_formats(
2373 mpd_doc, mpd_id, mpd_base_url, mpd_url)
2374
2375 def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2376 """
2377 Parse formats from MPD manifest.
2378 References:
2379 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2380 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2381 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2382 """
2383 if not self._downloader.params.get('dynamic_mpd'):
2384 if mpd_doc.get('type') == 'dynamic':
2385 return []
2386
2387 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2388
2389 def _add_ns(path):
2390 return self._xpath_ns(path, namespace)
2391
2392 def is_drm_protected(element):
2393 return element.find(_add_ns('ContentProtection')) is not None
2394
2395 def extract_multisegment_info(element, ms_parent_info):
2396 ms_info = ms_parent_info.copy()
2397
2398 # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2399 # common attributes and elements. We will only extract what is
2400 # relevant for us.
2401 def extract_common(source):
2402 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2403 if segment_timeline is not None:
2404 s_e = segment_timeline.findall(_add_ns('S'))
2405 if s_e:
2406 ms_info['total_number'] = 0
2407 ms_info['s'] = []
2408 for s in s_e:
2409 r = int(s.get('r', 0))
2410 ms_info['total_number'] += 1 + r
2411 ms_info['s'].append({
2412 't': int(s.get('t', 0)),
2413 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2414 'd': int(s.attrib['d']),
2415 'r': r,
2416 })
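# NOTE (editor's illustration, assumed values): an element such as
# <S t="0" d="90000" r="2"/> with timescale 90000 describes three
# one-second segments and adds 3 to total_number.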
2417 start_number = source.get('startNumber')
2418 if start_number:
2419 ms_info['start_number'] = int(start_number)
2420 timescale = source.get('timescale')
2421 if timescale:
2422 ms_info['timescale'] = int(timescale)
2423 segment_duration = source.get('duration')
2424 if segment_duration:
2425 ms_info['segment_duration'] = float(segment_duration)
2426
2427 def extract_Initialization(source):
2428 initialization = source.find(_add_ns('Initialization'))
2429 if initialization is not None:
2430 ms_info['initialization_url'] = initialization.attrib['sourceURL']
2431
2432 segment_list = element.find(_add_ns('SegmentList'))
2433 if segment_list is not None:
2434 extract_common(segment_list)
2435 extract_Initialization(segment_list)
2436 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2437 if segment_urls_e:
2438 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2439 else:
2440 segment_template = element.find(_add_ns('SegmentTemplate'))
2441 if segment_template is not None:
2442 extract_common(segment_template)
2443 media = segment_template.get('media')
2444 if media:
2445 ms_info['media'] = media
2446 initialization = segment_template.get('initialization')
2447 if initialization:
2448 ms_info['initialization'] = initialization
2449 else:
2450 extract_Initialization(segment_template)
2451 return ms_info
2452
2453 skip_unplayable = not self._downloader.params.get('allow_unplayable_formats')
2454
2455 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2456 formats = []
2457 for period in mpd_doc.findall(_add_ns('Period')):
2458 period_duration = parse_duration(period.get('duration')) or mpd_duration
2459 period_ms_info = extract_multisegment_info(period, {
2460 'start_number': 1,
2461 'timescale': 1,
2462 })
2463 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2464 if skip_unplayable and is_drm_protected(adaptation_set):
2465 continue
2466 adaptation_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2467 for representation in adaptation_set.findall(_add_ns('Representation')):
2468 if skip_unplayable and is_drm_protected(representation):
2469 continue
2470 representation_attrib = adaptation_set.attrib.copy()
2471 representation_attrib.update(representation.attrib)
2472 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2473 mime_type = representation_attrib['mimeType']
2474 content_type = mime_type.split('/')[0]
2475 if content_type == 'text':
2476 # TODO implement WebVTT downloading
2477 pass
2478 elif content_type in ('video', 'audio'):
2479 base_url = ''
2480 for element in (representation, adaptation_set, period, mpd_doc):
2481 base_url_e = element.find(_add_ns('BaseURL'))
2482 if base_url_e is not None:
2483 base_url = base_url_e.text + base_url
2484 if re.match(r'^https?://', base_url):
2485 break
2486 if mpd_base_url and not re.match(r'^https?://', base_url):
2487 if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
2488 mpd_base_url += '/'
2489 base_url = mpd_base_url + base_url
2490 representation_id = representation_attrib.get('id')
2491 lang = representation_attrib.get('lang')
2492 url_el = representation.find(_add_ns('BaseURL'))
2493 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2494 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2495 f = {
2496 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
2497 'manifest_url': mpd_url,
2498 'ext': mimetype2ext(mime_type),
2499 'width': int_or_none(representation_attrib.get('width')),
2500 'height': int_or_none(representation_attrib.get('height')),
2501 'tbr': float_or_none(bandwidth, 1000),
2502 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2503 'fps': int_or_none(representation_attrib.get('frameRate')),
2504 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2505 'format_note': 'DASH %s' % content_type,
2506 'filesize': filesize,
2507 'container': mimetype2ext(mime_type) + '_dash',
2508 }
2509 f.update(parse_codecs(representation_attrib.get('codecs')))
2510 representation_ms_info = extract_multisegment_info(representation, adaptation_set_ms_info)
2511
2512 def prepare_template(template_name, identifiers):
2513 tmpl = representation_ms_info[template_name]
2514 # First off, % characters outside $...$ templates
2515 # must be escaped by doubling for proper processing
2516 # by the % string-formatting operator used below (see
2517 # https://github.com/ytdl-org/youtube-dl/issues/16867).
2518 t = ''
2519 in_template = False
2520 for c in tmpl:
2521 t += c
2522 if c == '$':
2523 in_template = not in_template
2524 elif c == '%' and not in_template:
2525 t += c
2526 # Next, $...$ templates are translated to their
2527 # %(...) counterparts to be used with the % operator
2528 t = t.replace('$RepresentationID$', representation_id)
2529 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2530 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2531 t = t.replace('$$', '$')  # assign the result; str.replace does not modify in place
2532 return t
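# NOTE (editor's illustration, assumed values): a template such as
# 'seg-$RepresentationID$-$Number%05d$.m4s' becomes
# 'seg-video1-%(Number)05d.m4s' for representation_id 'video1',
# ready for the % operator used below.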
2533
2534 # @initialization is a regular template like @media one
2535 # so it should be handled just the same way (see
2536 # https://github.com/ytdl-org/youtube-dl/issues/11605)
2537 if 'initialization' in representation_ms_info:
2538 initialization_template = prepare_template(
2539 'initialization',
2540 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2541 # $Time$ shall not be included for @initialization thus
2542 # only $Bandwidth$ remains
2543 ('Bandwidth', ))
2544 representation_ms_info['initialization_url'] = initialization_template % {
2545 'Bandwidth': bandwidth,
2546 }
2547
2548 def location_key(location):
2549 return 'url' if re.match(r'^https?://', location) else 'path'
2550
2551 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2552
2553 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2554 media_location_key = location_key(media_template)
2555
2556 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2557 # can't be used at the same time
2558 if '%(Number' in media_template and 's' not in representation_ms_info:
2559 segment_duration = None
2560 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2561 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2562 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2563 representation_ms_info['fragments'] = [{
2564 media_location_key: media_template % {
2565 'Number': segment_number,
2566 'Bandwidth': bandwidth,
2567 },
2568 'duration': segment_duration,
2569 } for segment_number in range(
2570 representation_ms_info['start_number'],
2571 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
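# NOTE (editor's illustration, assumed values): a 60 s period with a
# 4 s segment_duration yields total_number = ceil(60 / 4) = 15
# fragments, numbered from start_number upwards.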
2572 else:
2573 # $Number*$ or $Time$ in media template with S list available
2574 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2575 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2576 representation_ms_info['fragments'] = []
2577 segment_time = 0
2578 segment_d = None
2579 segment_number = representation_ms_info['start_number']
2580
2581 def add_segment_url():
2582 segment_url = media_template % {
2583 'Time': segment_time,
2584 'Bandwidth': bandwidth,
2585 'Number': segment_number,
2586 }
2587 representation_ms_info['fragments'].append({
2588 media_location_key: segment_url,
2589 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2590 })
2591
2592 for num, s in enumerate(representation_ms_info['s']):
2593 segment_time = s.get('t') or segment_time
2594 segment_d = s['d']
2595 add_segment_url()
2596 segment_number += 1
2597 for r in range(s.get('r', 0)):
2598 segment_time += segment_d
2599 add_segment_url()
2600 segment_number += 1
2601 segment_time += segment_d
2602 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2603 # No media template
2604 # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2605 # or any YouTube dashsegments video
2606 fragments = []
2607 segment_index = 0
2608 timescale = representation_ms_info['timescale']
2609 for s in representation_ms_info['s']:
2610 duration = float_or_none(s['d'], timescale)
2611 for r in range(s.get('r', 0) + 1):
2612 segment_uri = representation_ms_info['segment_urls'][segment_index]
2613 fragments.append({
2614 location_key(segment_uri): segment_uri,
2615 'duration': duration,
2616 })
2617 segment_index += 1
2618 representation_ms_info['fragments'] = fragments
2619 elif 'segment_urls' in representation_ms_info:
2620 # Segment URLs with no SegmentTimeline
2621 # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2622 # https://github.com/ytdl-org/youtube-dl/pull/14844
2623 fragments = []
2624 segment_duration = float_or_none(
2625 representation_ms_info['segment_duration'],
2626 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2627 for segment_url in representation_ms_info['segment_urls']:
2628 fragment = {
2629 location_key(segment_url): segment_url,
2630 }
2631 if segment_duration:
2632 fragment['duration'] = segment_duration
2633 fragments.append(fragment)
2634 representation_ms_info['fragments'] = fragments
2635 # If there is a fragments key available then we correctly recognized fragmented media.
2636 # Otherwise we will assume unfragmented media with direct access. Technically, such
2637 # an assumption is not necessarily correct since we may simply have no support for
2638 # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2639 if 'fragments' in representation_ms_info:
2640 f.update({
2641 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2642 'url': mpd_url or base_url,
2643 'fragment_base_url': base_url,
2644 'fragments': [],
2645 'protocol': 'http_dash_segments',
2646 })
2647 if 'initialization_url' in representation_ms_info:
2648 initialization_url = representation_ms_info['initialization_url']
2649 if not f.get('url'):
2650 f['url'] = initialization_url
2651 f['fragments'].append({location_key(initialization_url): initialization_url})
2652 f['fragments'].extend(representation_ms_info['fragments'])
2653 else:
2654 # Assuming direct URL to unfragmented media.
2655 f['url'] = base_url
2656 formats.append(f)
2657 else:
2658 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2659 return formats
2660
2661 def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2662 res = self._download_xml_handle(
2663 ism_url, video_id,
2664 note=note or 'Downloading ISM manifest',
2665 errnote=errnote or 'Failed to download ISM manifest',
2666 fatal=fatal, data=data, headers=headers, query=query)
2667 if res is False:
2668 return []
2669 ism_doc, urlh = res
2670 if ism_doc is None:
2671 return []
2672
2673 return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
2674
2675 def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
2676 """
2677 Parse formats from ISM manifest.
2678 References:
2679 1. [MS-SSTR]: Smooth Streaming Protocol,
2680 https://msdn.microsoft.com/en-us/library/ff469518.aspx
2681 """
2682 if ism_doc.get('IsLive') == 'TRUE':
2683 return []
2684 if (not self._downloader.params.get('allow_unplayable_formats')
2685 and ism_doc.find('Protection') is not None):
2686 return []
2687
2688 duration = int(ism_doc.attrib['Duration'])
2689 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2690
2691 formats = []
2692 for stream in ism_doc.findall('StreamIndex'):
2693 stream_type = stream.get('Type')
2694 if stream_type not in ('video', 'audio'):
2695 continue
2696 url_pattern = stream.attrib['Url']
2697 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2698 stream_name = stream.get('Name')
2699 for track in stream.findall('QualityLevel'):
2700 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
2701 # TODO: add support for WVC1 and WMAP
2702 if fourcc not in ('H264', 'AVC1', 'AACL'):
2703 self.report_warning('%s is not a supported codec' % fourcc)
2704 continue
2705 tbr = int(track.attrib['Bitrate']) // 1000
2706 # [1] does not mention Width and Height attributes. However,
2707 # they're often present while MaxWidth and MaxHeight are
2708 # missing, so should be used as fallbacks
2709 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2710 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2711 sampling_rate = int_or_none(track.get('SamplingRate'))
2712
2713 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2714 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2715
2716 fragments = []
2717 fragment_ctx = {
2718 'time': 0,
2719 }
2720 stream_fragments = stream.findall('c')
2721 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2722 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2723 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2724 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2725 if not fragment_ctx['duration']:
2726 try:
2727 next_fragment_time = int(stream_fragments[stream_fragment_index + 1].attrib['t'])  # look up the next <c> element in the list
2728 except IndexError:
2729 next_fragment_time = duration
2730 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
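# NOTE (editor's illustration, assumed values): with time=0, a next
# fragment at t=60000000 and r=2, each repeated fragment is assigned
# a duration of 30000000 timescale ticks.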
2731 for _ in range(fragment_repeat):
2732 fragments.append({
2733 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2734 'duration': fragment_ctx['duration'] / stream_timescale,
2735 })
2736 fragment_ctx['time'] += fragment_ctx['duration']
2737
2738 format_id = []
2739 if ism_id:
2740 format_id.append(ism_id)
2741 if stream_name:
2742 format_id.append(stream_name)
2743 format_id.append(compat_str(tbr))
2744
2745 formats.append({
2746 'format_id': '-'.join(format_id),
2747 'url': ism_url,
2748 'manifest_url': ism_url,
2749 'ext': 'ismv' if stream_type == 'video' else 'isma',
2750 'width': width,
2751 'height': height,
2752 'tbr': tbr,
2753 'asr': sampling_rate,
2754 'vcodec': 'none' if stream_type == 'audio' else fourcc,
2755 'acodec': 'none' if stream_type == 'video' else fourcc,
2756 'protocol': 'ism',
2757 'fragments': fragments,
2758 '_download_params': {
2759 'duration': duration,
2760 'timescale': stream_timescale,
2761 'width': width or 0,
2762 'height': height or 0,
2763 'fourcc': fourcc,
2764 'codec_private_data': track.get('CodecPrivateData'),
2765 'sampling_rate': sampling_rate,
2766 'channels': int_or_none(track.get('Channels', 2)),
2767 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2768 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2769 },
2770 })
2771 return formats
2772
2773 def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
2774 def absolute_url(item_url):
2775 return urljoin(base_url, item_url)
2776
2777 def parse_content_type(content_type):
2778 if not content_type:
2779 return {}
2780 ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2781 if ctr:
2782 mimetype, codecs = ctr.groups()
2783 f = parse_codecs(codecs)
2784 f['ext'] = mimetype2ext(mimetype)
2785 return f
2786 return {}
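# NOTE (editor's illustration, assumed value): a content type of
# 'video/mp4; codecs="avc1.42E01E, mp4a.40.2"' yields
# {'ext': 'mp4', 'vcodec': 'avc1.42E01E', 'acodec': 'mp4a.40.2'}.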
2787
2788 def _media_formats(src, cur_media_type, type_info={}):
2789 full_url = absolute_url(src)
2790 ext = type_info.get('ext') or determine_ext(full_url)
2791 if ext == 'm3u8':
2792 is_plain_url = False
2793 formats = self._extract_m3u8_formats(
2794 full_url, video_id, ext='mp4',
2795 entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2796 preference=preference, quality=quality, fatal=False)
2797 elif ext == 'mpd':
2798 is_plain_url = False
2799 formats = self._extract_mpd_formats(
2800 full_url, video_id, mpd_id=mpd_id, fatal=False)
2801 else:
2802 is_plain_url = True
2803 formats = [{
2804 'url': full_url,
2805 'vcodec': 'none' if cur_media_type == 'audio' else None,
2806 }]
2807 return is_plain_url, formats
2808
2809 entries = []
2810 # amp-video and amp-audio are very similar to their HTML5 counterparts
2811 # so we will include them right here (see
2812 # https://www.ampproject.org/docs/reference/components/amp-video)
2813 # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
2814 _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
2815 media_tags = [(media_tag, media_tag_name, media_type, '')
2816 for media_tag, media_tag_name, media_type
2817 in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
2818 media_tags.extend(re.findall(
2819 # We only allow video|audio followed by a whitespace or '>'.
2820 # Allowing more characters may end up in significant slow down (see
2821 # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
2822 # http://www.porntrex.com/maps/videositemap.xml).
2823 r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
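# NOTE (editor's illustration, hypothetical markup): the first regex
# matches self-closing tags such as '<amp-video src="v.mp4"/>', while
# the second matches paired tags such as '<video ...>...</video>'.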
2824 for media_tag, _, media_type, media_content in media_tags:
2825 media_info = {
2826 'formats': [],
2827 'subtitles': {},
2828 }
2829 media_attributes = extract_attributes(media_tag)
2830 src = strip_or_none(media_attributes.get('src'))
2831 if src:
2832 _, formats = _media_formats(src, media_type)
2833 media_info['formats'].extend(formats)
2834 media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
2835 if media_content:
2836 for source_tag in re.findall(r'<source[^>]+>', media_content):
2837 s_attr = extract_attributes(source_tag)
2838 # data-video-src and data-src are non-standard but seen
2839 # several times in the wild
2840 src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
2841 if not src:
2842 continue
2843 f = parse_content_type(s_attr.get('type'))
2844 is_plain_url, formats = _media_formats(src, media_type, f)
2845 if is_plain_url:
2846 # width, height, res, label and title attributes are
2847 # all non-standard but seen several times in the wild
2848 labels = [
2849 s_attr.get(lbl)
2850 for lbl in ('label', 'title')
2851 if str_or_none(s_attr.get(lbl))
2852 ]
2853 width = int_or_none(s_attr.get('width'))
2854 height = (int_or_none(s_attr.get('height'))
2855 or int_or_none(s_attr.get('res')))
2856 if not width or not height:
2857 for lbl in labels:
2858 resolution = parse_resolution(lbl)
2859 if not resolution:
2860 continue
2861 width = width or resolution.get('width')
2862 height = height or resolution.get('height')
2863 for lbl in labels:
2864 tbr = parse_bitrate(lbl)
2865 if tbr:
2866 break
2867 else:
2868 tbr = None
2869 f.update({
2870 'width': width,
2871 'height': height,
2872 'tbr': tbr,
2873 'format_id': s_attr.get('label') or s_attr.get('title'),
2874 })
2875 f.update(formats[0])
2876 media_info['formats'].append(f)
2877 else:
2878 media_info['formats'].extend(formats)
2879 for track_tag in re.findall(r'<track[^>]+>', media_content):
2880 track_attributes = extract_attributes(track_tag)
2881 kind = track_attributes.get('kind')
2882 if not kind or kind in ('subtitles', 'captions'):
2883 src = strip_or_none(track_attributes.get('src'))
2884 if not src:
2885 continue
2886 lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
2887 media_info['subtitles'].setdefault(lang, []).append({
2888 'url': absolute_url(src),
2889 })
2890 for f in media_info['formats']:
2891 f.setdefault('http_headers', {})['Referer'] = base_url
2892 if media_info['formats'] or media_info['subtitles']:
2893 entries.append(media_info)
2894 return entries
2895
2896 def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2897 signed = 'hdnea=' in manifest_url
2898 if not signed:
2899 # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
2900 manifest_url = re.sub(
2901 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
2902 '', manifest_url).strip('?')
2903
2904 formats = []
2905
2906 hdcore_sign = 'hdcore=3.7.0'
2907 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2908 hds_host = hosts.get('hds')
2909 if hds_host:
2910 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2911 if 'hdcore=' not in f4m_url:
2912 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2913 f4m_formats = self._extract_f4m_formats(
2914 f4m_url, video_id, f4m_id='hds', fatal=False)
2915 for entry in f4m_formats:
2916 entry.update({'extra_param_to_segment_url': hdcore_sign})
2917 formats.extend(f4m_formats)
2918
2919 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2920 hls_host = hosts.get('hls')
2921 if hls_host:
2922 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2923 m3u8_formats = self._extract_m3u8_formats(
2924 m3u8_url, video_id, 'mp4', 'm3u8_native',
2925 m3u8_id='hls', fatal=False)
2926 formats.extend(m3u8_formats)
2927
2928 http_host = hosts.get('http')
2929 if http_host and m3u8_formats and not signed:
2930 REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
2931 qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
2932 qualities_length = len(qualities)
2933 if len(m3u8_formats) in (qualities_length, qualities_length + 1):
2934 i = 0
2935 for f in m3u8_formats:
2936 if f['vcodec'] != 'none':
2937 for protocol in ('http', 'https'):
2938 http_f = f.copy()
2939 del http_f['manifest_url']
2940 http_url = re.sub(
2941 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
2942 http_f.update({
2943 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
2944 'url': http_url,
2945 'protocol': protocol,
2946 })
2947 formats.append(http_f)
2948 i += 1
2949
2950 return formats
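# Illustrative sketch, not part of the library: given an Akamai CSMIL
# manifest URL, a subclass might call the helper above like this (the
# URL and host mapping are made up):
#
#     formats = self._extract_akamai_formats(
#         'http://example-i.akamaihd.net/i/foo/bar/,500,1000,.mp4.csmil/master.m3u8',
#         video_id, hosts={'http': 'example-vh.akamaihd.net'})
#     self._sort_formats(formats)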
2951
2952 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2953 query = compat_urlparse.urlparse(url).query
2954 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2955 mobj = re.search(
2956 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
2957 url_base = mobj.group('url')
2958 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
2959 formats = []
2960
2961 def manifest_url(manifest):
2962 m_url = '%s/%s' % (http_base_url, manifest)
2963 if query:
2964 m_url += '?%s' % query
2965 return m_url
2966
2967 if 'm3u8' not in skip_protocols:
2968 formats.extend(self._extract_m3u8_formats(
2969 manifest_url('playlist.m3u8'), video_id, 'mp4',
2970 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2971 if 'f4m' not in skip_protocols:
2972 formats.extend(self._extract_f4m_formats(
2973 manifest_url('manifest.f4m'),
2974 video_id, f4m_id='hds', fatal=False))
2975 if 'dash' not in skip_protocols:
2976 formats.extend(self._extract_mpd_formats(
2977 manifest_url('manifest.mpd'),
2978 video_id, mpd_id='dash', fatal=False))
2979 if re.search(r'(?:/smil:|\.smil)', url_base):
2980 if 'smil' not in skip_protocols:
2981 rtmp_formats = self._extract_smil_formats(
2982 manifest_url('jwplayer.smil'),
2983 video_id, fatal=False)
2984 for rtmp_format in rtmp_formats:
2985 rtsp_format = rtmp_format.copy()
2986 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2987 del rtsp_format['play_path']
2988 del rtsp_format['ext']
2989 rtsp_format.update({
2990 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2991 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2992 'protocol': 'rtsp',
2993 })
2994 formats.extend([rtmp_format, rtsp_format])
2995 else:
2996 for protocol in ('rtmp', 'rtsp'):
2997 if protocol not in skip_protocols:
2998 formats.append({
2999 'url': '%s:%s' % (protocol, url_base),
3000 'format_id': protocol,
3001 'protocol': protocol,
3002 })
3003 return formats
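# Illustrative sketch, not part of the library: the helper above strips
# any known manifest name from the URL and probes the sibling manifests
# itself, so a caller may pass whichever variant the page exposes (the
# URL is made up):
#
#     formats = self._extract_wowza_formats(
#         'https://wowza.example.com/live/stream/playlist.m3u8',
#         video_id, skip_protocols=['dash', 'rtmp', 'rtsp'])
#     self._sort_formats(formats)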
3004
3005 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3006 mobj = re.search(
3007 r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3008 webpage)
3009 if mobj:
3010 try:
3011 jwplayer_data = self._parse_json(mobj.group('options'),
3012 video_id=video_id,
3013 transform_source=transform_source)
3014 except ExtractorError:
3015 pass
3016 else:
3017 if isinstance(jwplayer_data, dict):
3018 return jwplayer_data
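# Illustrative sketch, not part of the library: the regex above targets
# inline setup calls of this shape (the markup is made up):
#
#     <script>
#         jwplayer("player-1").setup({"playlist": [{"sources": [...]}]});
#     </script>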
3019
3020 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3021 jwplayer_data = self._find_jwplayer_data(
3022 webpage, video_id, transform_source=js_to_json)
3023 return self._parse_jwplayer_data(
3024 jwplayer_data, video_id, *args, **kwargs)
3025
3026 def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3027 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3028 # JWPlayer backward compatibility: flattened playlists
3029 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3030 if 'playlist' not in jwplayer_data:
3031 jwplayer_data = {'playlist': [jwplayer_data]}
3032
3033 entries = []
3034
3035 # JWPlayer backward compatibility: single playlist item
3036 # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3037 if not isinstance(jwplayer_data['playlist'], list):
3038 jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3039
3040 for video_data in jwplayer_data['playlist']:
3041 # JWPlayer backward compatibility: flattened sources
3042 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3043 if 'sources' not in video_data:
3044 video_data['sources'] = [video_data]
3045
3046 this_video_id = video_id or video_data['mediaid']
3047
3048 formats = self._parse_jwplayer_formats(
3049 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3050 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3051
3052 subtitles = {}
3053 tracks = video_data.get('tracks')
3054 if tracks and isinstance(tracks, list):
3055 for track in tracks:
3056 if not isinstance(track, dict):
3057 continue
3058 track_kind = track.get('kind')
3059 if not track_kind or not isinstance(track_kind, compat_str):
3060 continue
3061 if track_kind.lower() not in ('captions', 'subtitles'):
3062 continue
3063 track_url = urljoin(base_url, track.get('file'))
3064 if not track_url:
3065 continue
3066 subtitles.setdefault(track.get('label') or 'en', []).append({
3067 'url': self._proto_relative_url(track_url)
3068 })
3069
3070 entry = {
3071 'id': this_video_id,
3072 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3073 'description': clean_html(video_data.get('description')),
3074 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3075 'timestamp': int_or_none(video_data.get('pubdate')),
3076 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3077 'subtitles': subtitles,
3078 }
3079 # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3080 if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3081 entry.update({
3082 '_type': 'url_transparent',
3083 'url': formats[0]['url'],
3084 })
3085 else:
3086 self._sort_formats(formats)
3087 entry['formats'] = formats
3088 entries.append(entry)
3089 if len(entries) == 1:
3090 return entries[0]
3091 else:
3092 return self.playlist_result(entries)
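# Illustrative sketch, not part of the library: the usual call chain in
# a site extractor finds the player config and parses it in one step
# (the variable names are made up):
#
#     info = self._extract_jwplayer_data(
#         webpage, video_id, require_title=False, base_url=url)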
3093
3094 def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3095 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3096 urls = []
3097 formats = []
3098 for source in jwplayer_sources_data:
3099 if not isinstance(source, dict):
3100 continue
3101 source_url = urljoin(
3102 base_url, self._proto_relative_url(source.get('file')))
3103 if not source_url or source_url in urls:
3104 continue
3105 urls.append(source_url)
3106 source_type = source.get('type') or ''
3107 ext = mimetype2ext(source_type) or determine_ext(source_url)
3108 if source_type == 'hls' or ext == 'm3u8':
3109 formats.extend(self._extract_m3u8_formats(
3110 source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3111 m3u8_id=m3u8_id, fatal=False))
3112 elif source_type == 'dash' or ext == 'mpd':
3113 formats.extend(self._extract_mpd_formats(
3114 source_url, video_id, mpd_id=mpd_id, fatal=False))
3115 elif ext == 'smil':
3116 formats.extend(self._extract_smil_formats(
3117 source_url, video_id, fatal=False))
3118 # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3119 elif source_type.startswith('audio') or ext in (
3120 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3121 formats.append({
3122 'url': source_url,
3123 'vcodec': 'none',
3124 'ext': ext,
3125 })
3126 else:
3127 height = int_or_none(source.get('height'))
3128 if height is None:
3129 # Often no height is provided but there is a label in a
3130 # format like "1080p", "720p SD", or 1080.
3131 height = int_or_none(self._search_regex(
3132 r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3133 'height', default=None))
3134 a_format = {
3135 'url': source_url,
3136 'width': int_or_none(source.get('width')),
3137 'height': height,
3138 'tbr': int_or_none(source.get('bitrate')),
3139 'ext': ext,
3140 }
3141 if source_url.startswith('rtmp'):
3142 a_format['ext'] = 'flv'
3143 # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3144 # of jwplayer.flash.swf
3145 rtmp_url_parts = re.split(
3146 r'((?:mp4|mp3|flv):)', source_url, 1)
3147 if len(rtmp_url_parts) == 3:
3148 rtmp_url, prefix, play_path = rtmp_url_parts
3149 a_format.update({
3150 'url': rtmp_url,
3151 'play_path': prefix + play_path,
3152 })
3153 if rtmp_params:
3154 a_format.update(rtmp_params)
3155 formats.append(a_format)
3156 return formats
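# Illustrative sketch, not part of the library: a typical JWPlayer
# sources array handled by the method above (the values are made up):
#
#     [{"file": "//cdn.example.com/master.m3u8", "type": "hls"},
#      {"file": "//cdn.example.com/720.mp4", "label": "720p",
#       "width": 1280, "height": 720}]
#
# The first source goes through _extract_m3u8_formats; the second is
# taken as a plain progressive format with the given dimensions.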
3157
3158 def _live_title(self, name):
3159 """ Generate the title for a live video """
3160 now = datetime.datetime.now()
3161 now_str = now.strftime('%Y-%m-%d %H:%M')
3162 return name + ' ' + now_str
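# For example, _live_title('Channel stream') would return something
# like 'Channel stream 2021-03-01 12:00', depending on the local clock.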
3163
3164 def _int(self, v, name, fatal=False, **kwargs):
3165 res = int_or_none(v, **kwargs)
3168 if res is None:
3169 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3170 if fatal:
3171 raise ExtractorError(msg)
3172 else:
3173 self._downloader.report_warning(msg)
3174 return res
3175
3176 def _float(self, v, name, fatal=False, **kwargs):
3177 res = float_or_none(v, **kwargs)
3178 if res is None:
3179 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3180 if fatal:
3181 raise ExtractorError(msg)
3182 else:
3183 self._downloader.report_warning(msg)
3184 return res
3185
3186 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3187 path='/', secure=False, discard=False, rest={}, **kwargs):
3188 cookie = compat_cookiejar_Cookie(
3189 0, name, value, port, port is not None, domain, True,
3190 domain.startswith('.'), path, True, secure, expire_time,
3191 discard, None, None, rest)
3192 self._downloader.cookiejar.set_cookie(cookie)
3193
3194 def _get_cookies(self, url):
3195 """ Return a compat_cookies.SimpleCookie with the cookies for the url """
3196 req = sanitized_Request(url)
3197 self._downloader.cookiejar.add_cookie_header(req)
3198 return compat_cookies.SimpleCookie(req.get_header('Cookie'))
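# Illustrative sketch, not part of the library: reading a cookie value
# set by an earlier request (the cookie name is made up):
#
#     morsel = self._get_cookies('https://example.com').get('session_id')
#     if morsel:
#         token = morsel.value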
3199
3200 def _apply_first_set_cookie_header(self, url_handle, cookie):
3201 """
3202 Apply first Set-Cookie header instead of the last. Experimental.
3203
3204 Some sites (e.g. [1-3]) may serve two cookies under the same name
3205 in the Set-Cookie header and expect the first (old) one to be set
3206 rather than the second (new) one. However, per RFC 6265 the newer
3207 cookie is the one that actually ends up in the cookie store.
3208 We work around this issue by manually resetting the cookie to
3209 the first one.
3210 1. https://new.vk.com/
3211 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3212 3. https://learning.oreilly.com/
3213 """
3214 for header, cookies in url_handle.headers.items():
3215 if header.lower() != 'set-cookie':
3216 continue
3217 if sys.version_info[0] >= 3:
3218 cookies = cookies.encode('iso-8859-1')
3219 cookies = cookies.decode('utf-8')
3220 cookie_value = re.search(
3221 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3222 if cookie_value:
3223 value, domain = cookie_value.groups()
3224 self._set_cookie(domain, cookie, value)
3225 break
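# Illustrative sketch, not part of the library: a subclass would call
# this right after the request whose response carries the duplicate
# cookies (the cookie name is made up):
#
#     urlh = self._request_webpage(url, video_id)
#     self._apply_first_set_cookie_header(urlh, 'sessionid')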
3226
3227 def get_testcases(self, include_onlymatching=False):
3228 t = getattr(self, '_TEST', None)
3229 if t:
3230 assert not hasattr(self, '_TESTS'), \
3231 '%s has _TEST and _TESTS' % type(self).__name__
3232 tests = [t]
3233 else:
3234 tests = getattr(self, '_TESTS', [])
3235 for t in tests:
3236 if not include_onlymatching and t.get('only_matching', False):
3237 continue
3238 t['name'] = type(self).__name__[:-len('IE')]
3239 yield t
3240
3241 def is_suitable(self, age_limit):
3242 """ Test whether the extractor is generally suitable for the given
3243 age limit (i.e. pornographic sites are not, all others usually are) """
3244
3245 any_restricted = False
3246 for tc in self.get_testcases(include_onlymatching=False):
3247 if tc.get('playlist', []):
3248 tc = tc['playlist'][0]
3249 is_restricted = age_restricted(
3250 tc.get('info_dict', {}).get('age_limit'), age_limit)
3251 if not is_restricted:
3252 return True
3253 any_restricted = any_restricted or is_restricted
3254 return not any_restricted
3255
3256 def extract_subtitles(self, *args, **kwargs):
3257 if (self._downloader.params.get('writesubtitles', False)
3258 or self._downloader.params.get('listsubtitles')):
3259 return self._get_subtitles(*args, **kwargs)
3260 return {}
3261
3262 def _get_subtitles(self, *args, **kwargs):
3263 raise NotImplementedError('This method must be implemented by subclasses')
3264
3265 @staticmethod
3266 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3267 """ Merge subtitle items for one language. Items with duplicated URLs
3268 will be dropped. """
3269 list1_urls = {item['url'] for item in subtitle_list1}
3270 ret = list(subtitle_list1)
3271 ret.extend(item for item in subtitle_list2 if item['url'] not in list1_urls)
3272 return ret
3273
3274 @classmethod
3275 def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
3276 """ Merge two subtitle dictionaries, language by language. """
3277 ret = dict(subtitle_dict1)
3278 for lang in subtitle_dict2:
3279 ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
3280 return ret
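# Illustrative sketch, not part of the library: merging subtitle dicts
# gathered from two manifests (the data is made up):
#
#     subs = self._merge_subtitles(
#         {'en': [{'url': 'https://example.com/en.vtt'}]},
#         {'en': [{'url': 'https://example.com/en.srt'}],
#          'de': [{'url': 'https://example.com/de.vtt'}]})
#
# Both 'en' items are kept (distinct URLs) and 'de' is added.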
3281
3282 def extract_automatic_captions(self, *args, **kwargs):
3283 if (self._downloader.params.get('writeautomaticsub', False)
3284 or self._downloader.params.get('listsubtitles')):
3285 return self._get_automatic_captions(*args, **kwargs)
3286 return {}
3287
3288 def _get_automatic_captions(self, *args, **kwargs):
3289 raise NotImplementedError('This method must be implemented by subclasses')
3290
3291 def mark_watched(self, *args, **kwargs):
3292 if (self._downloader.params.get('mark_watched', False)
3293 and (self._get_login_info()[0] is not None
3294 or self._downloader.params.get('cookiefile') is not None)):
3295 self._mark_watched(*args, **kwargs)
3296
3297 def _mark_watched(self, *args, **kwargs):
3298 raise NotImplementedError('This method must be implemented by subclasses')
3299
3300 def geo_verification_headers(self):
3301 headers = {}
3302 geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
3303 if geo_verification_proxy:
3304 headers['Ytdl-request-proxy'] = geo_verification_proxy
3305 return headers
3306
3307 def _generic_id(self, url):
3308 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3309
3310 def _generic_title(self, url):
3311 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
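# For example, _generic_id('https://example.com/clips/My%20Video.mp4')
# and _generic_title of the same URL both return 'My Video'.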
3312
3313
3314 class SearchInfoExtractor(InfoExtractor):
3315 """
3316 Base class for paged search query extractors.
3317 They accept URLs in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
3318 Subclasses should define _SEARCH_KEY and _MAX_RESULTS.
3319 """
3320
3321 @classmethod
3322 def _make_valid_url(cls):
3323 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3324
3325 @classmethod
3326 def suitable(cls, url):
3327 return re.match(cls._make_valid_url(), url) is not None
3328
3329 def _real_extract(self, query):
3330 mobj = re.match(self._make_valid_url(), query)
3331 if mobj is None:
3332 raise ExtractorError('Invalid search query "%s"' % query)
3333
3334 prefix = mobj.group('prefix')
3335 query = mobj.group('query')
3336 if prefix == '':
3337 return self._get_n_results(query, 1)
3338 elif prefix == 'all':
3339 return self._get_n_results(query, self._MAX_RESULTS)
3340 else:
3341 n = int(prefix)
3342 if n <= 0:
3343 raise ExtractorError('Invalid download number %s for query "%s"' % (n, query))
3344 elif n > self._MAX_RESULTS:
3345 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3346 n = self._MAX_RESULTS
3347 return self._get_n_results(query, n)
3348
3349 def _get_n_results(self, query, n):
3350 """Get a specified number of results for a query"""
3351 raise NotImplementedError('This method must be implemented by subclasses')
3352
3353 @property
3354 def SEARCH_KEY(self):
3355 return self._SEARCH_KEY
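# Illustrative sketch, not part of the library: a minimal subclass of
# SearchInfoExtractor (the names and the API endpoint are made up):
#
#     class ExampleSearchIE(SearchInfoExtractor):
#         _SEARCH_KEY = 'examplesearch'
#         _MAX_RESULTS = 50
#
#         def _get_n_results(self, query, n):
#             data = self._download_json(
#                 'https://example.com/api/search?q=' + query, query)
#             entries = [self.url_result(e['url']) for e in data['items']]
#             return self.playlist_result(entries[:n], query)
#
# With this in place, 'examplesearch5:kittens' fetches the first five
# results and 'examplesearchall:kittens' fetches up to _MAX_RESULTS.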