yt_dlp/extractor/common.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import base64
   5 import datetime
   6 import hashlib
   7 import json
   8 import netrc
   9 import os
  10 import random
  11 import re
  12 import socket
  13 import ssl
  14 import sys
  15 import time
  16 import math
  17
  18 from ..compat import (
  19     compat_cookiejar_Cookie,
  20     compat_cookies,
  21     compat_etree_Element,
  22     compat_etree_fromstring,
  23     compat_getpass,
  24     compat_integer_types,
  25     compat_http_client,
  26     compat_os_name,
  27     compat_str,
  28     compat_urllib_error,
  29     compat_urllib_parse_unquote,
  30     compat_urllib_parse_urlencode,
  31     compat_urllib_request,
  32     compat_urlparse,
  33     compat_xml_parse_error,
  34 )
  35 from ..downloader import FileDownloader
  36 from ..downloader.f4m import (
  37     get_base_url,
  38     remove_encrypted_media,
  39 )
  40 from ..utils import (
  41     NO_DEFAULT,
  42     age_restricted,
  43     base_url,
  44     bug_reports_message,
  45     clean_html,
  46     compiled_regex_type,
  47     determine_ext,
  48     determine_protocol,
  49     dict_get,
  50     error_to_compat_str,
  51     ExtractorError,
  52     extract_attributes,
  53     fix_xml_ampersands,
  54     float_or_none,
  55     GeoRestrictedError,
  56     GeoUtils,
  57     int_or_none,
  58     js_to_json,
  59     JSON_LD_RE,
  60     mimetype2ext,
  61     orderedSet,
  62     parse_bitrate,
  63     parse_codecs,
  64     parse_duration,
  65     parse_iso8601,
  66     parse_m3u8_attributes,
  67     parse_resolution,
  68     RegexNotFoundError,
  69     sanitized_Request,
  70     sanitize_filename,
  71     str_or_none,
  72     str_to_int,
  73     strip_or_none,
  74     unescapeHTML,
  75     unified_strdate,
  76     unified_timestamp,
  77     update_Request,
  78     update_url_query,
  79     urljoin,
  80     url_basename,
  81     url_or_none,
  82     xpath_element,
  83     xpath_text,
  84     xpath_with_ns,
  85 )
  86
  87
  88 class InfoExtractor(object):
  89     """Information Extractor class.
  90
  91     Information extractors are the classes that, given a URL, extract
  92     information about the video (or videos) the URL refers to. This
  93     information includes the real video URL, the video title, author and
  94     others. The information is stored in a dictionary which is then
  95     passed to the YoutubeDL. The YoutubeDL processes this
  96     information possibly downloading the video to the file system, among
  97     other possible outcomes.
  98
  99     The type field determines the type of the result.
 100     By far the most common value (and the default if _type is missing) is
 101     "video", which indicates a single video.
 102
 103     For a video, the dictionaries must include the following fields:
 104
 105     id:             Video identifier.
 106     title:          Video title, unescaped.
 107
 108     Additionally, it must contain either a formats entry or a url one:
 109
 110     formats:        A list of dictionaries for each format available, ordered
 111                     from worst to best quality.
 112
 113                     Potential fields:
 114                     * url        The mandatory URL representing the media:
 115                                    for plain file media - HTTP URL of this file,
 116                                    for RTMP - RTMP URL,
 117                                    for HLS - URL of the M3U8 media playlist,
 118                                    for HDS - URL of the F4M manifest,
 119                                    for DASH
 120                                      - HTTP URL to plain file media (in case of
 121                                        unfragmented media)
 122                                      - URL of the MPD manifest or base URL
 123                                        representing the media if MPD manifest
 124                                        is parsed from a string (in case of
 125                                        fragmented media)
 126                                    for MSS - URL of the ISM manifest.
 127                     * manifest_url
 128                                  The URL of the manifest file in case of
 129                                  fragmented media:
 130                                    for HLS - URL of the M3U8 master playlist,
 131                                    for HDS - URL of the F4M manifest,
 132                                    for DASH - URL of the MPD manifest,
 133                                    for MSS - URL of the ISM manifest.
 134                     * ext        Will be calculated from URL if missing
 135                     * format     A human-readable description of the format
 136                                  ("mp4 container with h264/opus").
 137                                  Calculated from the format_id, width, height.
 138                                  and format_note fields if missing.
 139                     * format_id  A short description of the format
 140                                  ("mp4_h264_opus" or "19").
 141                                 Technically optional, but strongly recommended.
 142                     * format_note Additional info about the format
 143                                  ("3D" or "DASH video")
 144                     * width      Width of the video, if known
 145                     * height     Height of the video, if known
 146                     * resolution Textual description of width and height
 147                     * tbr        Average bitrate of audio and video in KBit/s
 148                     * abr        Average audio bitrate in KBit/s
 149                     * acodec     Name of the audio codec in use
 150                     * asr        Audio sampling rate in Hertz
 151                     * vbr        Average video bitrate in KBit/s
 152                     * fps        Frame rate
 153                     * vcodec     Name of the video codec in use
 154                     * container  Name of the container format
 155                     * filesize   The number of bytes, if known in advance
 156                     * filesize_approx  An estimate for the number of bytes
 157                     * player_url SWF Player URL (used for rtmpdump).
 158                     * protocol   The protocol that will be used for the actual
 159                                  download, lower-case.
 160                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 161                                  "m3u8", "m3u8_native" or "http_dash_segments".
 162                     * fragment_base_url
 163                                  Base URL for fragments. Each fragment's path
 164                                  value (if present) will be relative to
 165                                  this URL.
 166                     * fragments  A list of fragments of a fragmented media.
 167                                  Each fragment entry must contain either an url
 168                                  or a path. If an url is present it should be
 169                                  considered by a client. Otherwise both path and
 170                                  fragment_base_url must be present. Here is
 171                                  the list of all potential fields:
 172                                  * "url" - fragment's URL
 173                                  * "path" - fragment's path relative to
 174                                             fragment_base_url
 175                                  * "duration" (optional, int or float)
 176                                  * "filesize" (optional, int)
 177                     * preference Order number of this format. If this field is
 178                                  present and not None, the formats get sorted
 179                                  by this field, regardless of all other values.
 180                                  -1 for default (order by other properties),
 181                                  -2 or smaller for less than default.
 182                                  < -1000 to hide the format (if there is
 183                                     another one which is strictly better)
 184                     * language   Language code, e.g. "de" or "en-US".
 185                     * language_preference  Is this in the language mentioned in
 186                                  the URL?
 187                                  10 if it's what the URL is about,
 188                                  -1 for default (don't know),
 189                                  -10 otherwise, other values reserved for now.
 190                     * quality    Order number of the video quality of this
 191                                  format, irrespective of the file format.
 192                                  -1 for default (order by other properties),
 193                                  -2 or smaller for less than default.
 194                     * source_preference  Order number for this video source
 195                                   (quality takes higher priority)
 196                                  -1 for default (order by other properties),
 197                                  -2 or smaller for less than default.
 198                     * http_headers  A dictionary of additional HTTP headers
 199                                  to add to the request.
 200                     * stretched_ratio  If given and not 1, indicates that the
 201                                  video's pixels are not square.
 202                                  width : height ratio as float.
 203                     * no_resume  The server does not support resuming the
 204                                  (HTTP or RTMP) download. Boolean.
 205                     * downloader_options  A dictionary of downloader options as
 206                                  described in FileDownloader
 207
 208     url:            Final video URL.
 209     ext:            Video filename extension.
 210     format:         The video format, defaults to ext (used for --get-format)
 211     player_url:     SWF Player URL (used for rtmpdump).
 212
 213     The following fields are optional:
 214
 215     alt_title:      A secondary title of the video.
 216     display_id      An alternative identifier for the video, not necessarily
 217                     unique, but available before title. Typically, id is
 218                     something like "4234987", title "Dancing naked mole rats",
 219                     and display_id "dancing-naked-mole-rats"
 220     thumbnails:     A list of dictionaries, with the following entries:
 221                         * "id" (optional, string) - Thumbnail format ID
 222                         * "url"
 223                         * "preference" (optional, int) - quality of the image
 224                         * "width" (optional, int)
 225                         * "height" (optional, int)
 226                         * "resolution" (optional, string "{width}x{height}",
 227                                         deprecated)
 228                         * "filesize" (optional, int)
 229     thumbnail:      Full URL to a video thumbnail image.
 230     description:    Full video description.
 231     uploader:       Full name of the video uploader.
 232     license:        License name the video is licensed under.
 233     creator:        The creator of the video.
 234     release_date:   The date (YYYYMMDD) when the video was released.
 235     timestamp:      UNIX timestamp of the moment the video became available.
 236     upload_date:    Video upload date (YYYYMMDD).
 237                     If not explicitly set, calculated from timestamp.
 238     uploader_id:    Nickname or id of the video uploader.
 239     uploader_url:   Full URL to a personal webpage of the video uploader.
 240     channel:        Full name of the channel the video is uploaded on.
 241                     Note that channel fields may or may not repeat uploader
 242                     fields. This depends on a particular extractor.
 243     channel_id:     Id of the channel.
 244     channel_url:    Full URL to a channel webpage.
 245     location:       Physical location where the video was filmed.
 246     subtitles:      The available subtitles as a dictionary in the format
 247                     {tag: subformats}. "tag" is usually a language code, and
 248                     "subformats" is a list sorted from lower to higher
 249                     preference, each element is a dictionary with the "ext"
 250                     entry and one of:
 251                         * "data": The subtitles file contents
 252                         * "url": A URL pointing to the subtitles file
 253                     "ext" will be calculated from URL if missing
 254     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 255                     automatically generated captions
 256     duration:       Length of the video in seconds, as an integer or float.
 257     view_count:     How many users have watched the video on the platform.
 258     like_count:     Number of positive ratings of the video
 259     dislike_count:  Number of negative ratings of the video
 260     repost_count:   Number of reposts of the video
 261     average_rating: Average rating give by users, the scale used depends on the webpage
 262     comment_count:  Number of comments on the video
 263     comments:       A list of comments, each with one or more of the following
 264                     properties (all but one of text or html optional):
 265                         * "author" - human-readable name of the comment author
 266                         * "author_id" - user ID of the comment author
 267                         * "id" - Comment ID
 268                         * "html" - Comment as HTML
 269                         * "text" - Plain text of the comment
 270                         * "timestamp" - UNIX timestamp of comment
 271                         * "parent" - ID of the comment this one is replying to.
 272                                      Set to "root" to indicate that this is a
 273                                      comment to the original video.
 274     age_limit:      Age restriction for the video, as an integer (years)
 275     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 276                     should allow to get the same result again. (It will be set
 277                     by YoutubeDL if it's missing)
 278     categories:     A list of categories that the video falls in, for example
 279                     ["Sports", "Berlin"]
 280     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 281     is_live:        True, False, or None (=unknown). Whether this video is a
 282                     live stream that goes on instead of a fixed-length video.
 283     was_live:       True, False, or None (=unknown). Whether this video was
 284                     originally a live stream.
 285     start_time:     Time in seconds where the reproduction should start, as
 286                     specified in the URL.
 287     end_time:       Time in seconds where the reproduction should end, as
 288                     specified in the URL.
 289     chapters:       A list of dictionaries, with the following entries:
 290                         * "start_time" - The start time of the chapter in seconds
 291                         * "end_time" - The end time of the chapter in seconds
 292                         * "title" (optional, string)
 293     playable_in_embed: Whether this video is allowed to play in embedded
 294                     players on other sites. Can be True (=always allowed),
 295                     False (=never allowed), None (=unknown), or a string
 296                     specifying the criteria for embedability (Eg: 'whitelist').
 297
 298     The following fields should only be used when the video belongs to some logical
 299     chapter or section:
 300
 301     chapter:        Name or title of the chapter the video belongs to.
 302     chapter_number: Number of the chapter the video belongs to, as an integer.
 303     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 304
 305     The following fields should only be used when the video is an episode of some
 306     series, programme or podcast:
 307
 308     series:         Title of the series or programme the video episode belongs to.
 309     season:         Title of the season the video episode belongs to.
 310     season_number:  Number of the season the video episode belongs to, as an integer.
 311     season_id:      Id of the season the video episode belongs to, as a unicode string.
 312     episode:        Title of the video episode. Unlike mandatory video title field,
 313                     this field should denote the exact title of the video episode
 314                     without any kind of decoration.
 315     episode_number: Number of the video episode within a season, as an integer.
 316     episode_id:     Id of the video episode, as a unicode string.
 317
 318     The following fields should only be used when the media is a track or a part of
 319     a music album:
 320
 321     track:          Title of the track.
 322     track_number:   Number of the track within an album or a disc, as an integer.
 323     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 324                     as a unicode string.
 325     artist:         Artist(s) of the track.
 326     genre:          Genre(s) of the track.
 327     album:          Title of the album the track belongs to.
 328     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 329     album_artist:   List of all artists appeared on the album (e.g.
 330                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 331                     and compilations).
 332     disc_number:    Number of the disc or other physical medium the track belongs to,
 333                     as an integer.
 334     release_year:   Year (YYYY) when the album was released.
 335
 336     Unless mentioned otherwise, the fields should be Unicode strings.
 337
 338     Unless mentioned otherwise, None is equivalent to absence of information.
 339
 340
 341     _type "playlist" indicates multiple videos.
 342     There must be a key "entries", which is a list, an iterable, or a PagedList
 343     object, each element of which is a valid dictionary by this specification.
 344
 345     Additionally, playlists can have "id", "title", and any other relevent
 346     attributes with the same semantics as videos (see above).
 347
 348
 349     _type "multi_video" indicates that there are multiple videos that
 350     form a single show, for examples multiple acts of an opera or TV episode.
 351     It must have an entries key like a playlist and contain all the keys
 352     required for a video at the same time.
 353
 354
 355     _type "url" indicates that the video must be extracted from another
 356     location, possibly by a different extractor. Its only required key is:
 357     "url" - the next URL to extract.
 358     The key "ie_key" can be set to the class name (minus the trailing "IE",
 359     e.g. "Youtube") if the extractor class is known in advance.
 360     Additionally, the dictionary may have any properties of the resolved entity
 361     known in advance, for example "title" if the title of the referred video is
 362     known ahead of time.
 363
 364
 365     _type "url_transparent" entities have the same specification as "url", but
 366     indicate that the given additional information is more precise than the one
 367     associated with the resolved URL.
 368     This is useful when a site employs a video service that hosts the video and
 369     its technical metadata, but that video service does not embed a useful
 370     title, description etc.
 371
 372
 373     Subclasses of this one should re-define the _real_initialize() and
 374     _real_extract() methods and define a _VALID_URL regexp.
 375     Probably, they should also be added to the list of extractors.
 376
 377     _GEO_BYPASS attribute may be set to False in order to disable
 378     geo restriction bypass mechanisms for a particular extractor.
 379     Though it won't disable explicit geo restriction bypass based on
 380     country code provided with geo_bypass_country.
 381
 382     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 383     countries for this extractor. One of these countries will be used by
 384     geo restriction bypass mechanism right away in order to bypass
 385     geo restriction, of course, if the mechanism is not disabled.
 386
 387     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 388     IP blocks in CIDR notation for this extractor. One of these IP blocks
 389     will be used by geo restriction bypass mechanism similarly
 390     to _GEO_COUNTRIES.
 391
 392     Finally, the _WORKING attribute should be set to False for broken IEs
 393     in order to warn the users and skip the tests.
 394     """
 395
 396     _ready = False
 397     _downloader = None
 398     _x_forwarded_for_ip = None
 399     _GEO_BYPASS = True
 400     _GEO_COUNTRIES = None
 401     _GEO_IP_BLOCKS = None
 402     _WORKING = True
 403
 404     def __init__(self, downloader=None):
 405         """Constructor. Receives an optional downloader."""
 406         self._ready = False
 407         self._x_forwarded_for_ip = None
 408         self.set_downloader(downloader)
 409
 410     @classmethod
 411     def suitable(cls, url):
 412         """Receives a URL and returns True if suitable for this IE."""
 413
 414         # This does not use has/getattr intentionally - we want to know whether
 415         # we have cached the regexp for *this* class, whereas getattr would also
 416         # match the superclass
 417         if '_VALID_URL_RE' not in cls.__dict__:
 418             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 419         return cls._VALID_URL_RE.match(url) is not None
 420
 421     @classmethod
 422     def _match_id(cls, url):
 423         if '_VALID_URL_RE' not in cls.__dict__:
 424             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 425         m = cls._VALID_URL_RE.match(url)
 426         assert m
 427         return compat_str(m.group('id'))
 428
 429     @classmethod
 430     def working(cls):
 431         """Getter method for _WORKING."""
 432         return cls._WORKING
 433
 434     def initialize(self):
 435         """Initializes an instance (authentication, etc)."""
 436         self._initialize_geo_bypass({
 437             'countries': self._GEO_COUNTRIES,
 438             'ip_blocks': self._GEO_IP_BLOCKS,
 439         })
 440         if not self._ready:
 441             self._real_initialize()
 442             self._ready = True
 443
 444     def _initialize_geo_bypass(self, geo_bypass_context):
 445         """
 446         Initialize geo restriction bypass mechanism.
 447
 448         This method is used to initialize geo bypass mechanism based on faking
 449         X-Forwarded-For HTTP header. A random country from provided country list
 450         is selected and a random IP belonging to this country is generated. This
 451         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 452         HTTP requests.
 453
 454         This method will be used for initial geo bypass mechanism initialization
 455         during the instance initialization with _GEO_COUNTRIES and
 456         _GEO_IP_BLOCKS.
 457
 458         You may also manually call it from extractor's code if geo bypass
 459         information is not available beforehand (e.g. obtained during
 460         extraction) or due to some other reason. In this case you should pass
 461         this information in geo bypass context passed as first argument. It may
 462         contain following fields:
 463
 464         countries:  List of geo unrestricted countries (similar
 465                     to _GEO_COUNTRIES)
 466         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 467                     (similar to _GEO_IP_BLOCKS)
 468
 469         """
 470         if not self._x_forwarded_for_ip:
 471
 472             # Geo bypass mechanism is explicitly disabled by user
 473             if not self._downloader.params.get('geo_bypass', True):
 474                 return
 475
 476             if not geo_bypass_context:
 477                 geo_bypass_context = {}
 478
 479             # Backward compatibility: previously _initialize_geo_bypass
 480             # expected a list of countries, some 3rd party code may still use
 481             # it this way
 482             if isinstance(geo_bypass_context, (list, tuple)):
 483                 geo_bypass_context = {
 484                     'countries': geo_bypass_context,
 485                 }
 486
 487             # The whole point of geo bypass mechanism is to fake IP
 488             # as X-Forwarded-For HTTP header based on some IP block or
 489             # country code.
 490
 491             # Path 1: bypassing based on IP block in CIDR notation
 492
 493             # Explicit IP block specified by user, use it right away
 494             # regardless of whether extractor is geo bypassable or not
 495             ip_block = self._downloader.params.get('geo_bypass_ip_block', None)
 496
 497             # Otherwise use random IP block from geo bypass context but only
 498             # if extractor is known as geo bypassable
 499             if not ip_block:
 500                 ip_blocks = geo_bypass_context.get('ip_blocks')
 501                 if self._GEO_BYPASS and ip_blocks:
 502                     ip_block = random.choice(ip_blocks)
 503
 504             if ip_block:
 505                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 506                 if self._downloader.params.get('verbose', False):
 507                     self._downloader.to_screen(
 508                         '[debug] Using fake IP %s as X-Forwarded-For.'
 509                         % self._x_forwarded_for_ip)
 510                 return
 511
 512             # Path 2: bypassing based on country code
 513
 514             # Explicit country code specified by user, use it right away
 515             # regardless of whether extractor is geo bypassable or not
 516             country = self._downloader.params.get('geo_bypass_country', None)
 517
 518             # Otherwise use random country code from geo bypass context but
 519             # only if extractor is known as geo bypassable
 520             if not country:
 521                 countries = geo_bypass_context.get('countries')
 522                 if self._GEO_BYPASS and countries:
 523                     country = random.choice(countries)
 524
 525             if country:
 526                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 527                 if self._downloader.params.get('verbose', False):
 528                     self._downloader.to_screen(
 529                         '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
 530                         % (self._x_forwarded_for_ip, country.upper()))
 531
 532     def extract(self, url):
 533         """Extracts URL information and returns it in list of dicts."""
 534         try:
 535             for _ in range(2):
 536                 try:
 537                     self.initialize()
 538                     ie_result = self._real_extract(url)
 539                     if self._x_forwarded_for_ip:
 540                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 541                     return ie_result
 542                 except GeoRestrictedError as e:
 543                     if self.__maybe_fake_ip_and_retry(e.countries):
 544                         continue
 545                     raise
 546         except ExtractorError:
 547             raise
 548         except compat_http_client.IncompleteRead as e:
 549             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 550         except (KeyError, StopIteration) as e:
 551             raise ExtractorError('An extractor error has occurred.', cause=e)
 552
 553     def __maybe_fake_ip_and_retry(self, countries):
 554         if (not self._downloader.params.get('geo_bypass_country', None)
 555                 and self._GEO_BYPASS
 556                 and self._downloader.params.get('geo_bypass', True)
 557                 and not self._x_forwarded_for_ip
 558                 and countries):
 559             country_code = random.choice(countries)
 560             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 561             if self._x_forwarded_for_ip:
 562                 self.report_warning(
 563                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 564                     % (self._x_forwarded_for_ip, country_code.upper()))
 565                 return True
 566         return False
 567
 568     def set_downloader(self, downloader):
 569         """Sets the downloader for this IE."""
 570         self._downloader = downloader
 571
 572     def _real_initialize(self):
 573         """Real initialization process. Redefine in subclasses."""
 574         pass
 575
 576     def _real_extract(self, url):
 577         """Real extraction process. Redefine in subclasses."""
 578         pass
 579
 580     @classmethod
 581     def ie_key(cls):
 582         """A string for getting the InfoExtractor with get_info_extractor"""
 583         return compat_str(cls.__name__[:-2])
 584
 585     @property
 586     def IE_NAME(self):
 587         return compat_str(type(self).__name__[:-2])
 588
 589     @staticmethod
 590     def __can_accept_status_code(err, expected_status):
 591         assert isinstance(err, compat_urllib_error.HTTPError)
 592         if expected_status is None:
 593             return False
 594         if isinstance(expected_status, compat_integer_types):
 595             return err.code == expected_status
 596         elif isinstance(expected_status, (list, tuple)):
 597             return err.code in expected_status
 598         elif callable(expected_status):
 599             return expected_status(err.code) is True
 600         else:
 601             assert False
 602
 603     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
 604         """
 605         Return the response handle.
 606
 607         See _download_webpage docstring for arguments specification.
 608         """
 609         if note is None:
 610             self.report_download_webpage(video_id)
 611         elif note is not False:
 612             if video_id is None:
 613                 self.to_screen('%s' % (note,))
 614             else:
 615                 self.to_screen('%s: %s' % (video_id, note))
 616
 617         # Some sites check X-Forwarded-For HTTP header in order to figure out
 618         # the origin of the client behind proxy. This allows bypassing geo
 619         # restriction by faking this header's value to IP that belongs to some
 620         # geo unrestricted country. We will do so once we encounter any
 621         # geo restriction error.
 622         if self._x_forwarded_for_ip:
 623             if 'X-Forwarded-For' not in headers:
 624                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 625
 626         if isinstance(url_or_request, compat_urllib_request.Request):
 627             url_or_request = update_Request(
 628                 url_or_request, data=data, headers=headers, query=query)
 629         else:
 630             if query:
 631                 url_or_request = update_url_query(url_or_request, query)
 632             if data is not None or headers:
 633                 url_or_request = sanitized_Request(url_or_request, data, headers)
 634         exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
 635         if hasattr(ssl, 'CertificateError'):
 636             exceptions.append(ssl.CertificateError)
 637         try:
 638             return self._downloader.urlopen(url_or_request)
 639         except tuple(exceptions) as err:
 640             if isinstance(err, compat_urllib_error.HTTPError):
 641                 if self.__can_accept_status_code(err, expected_status):
 642                     # Retain reference to error to prevent file object from
 643                     # being closed before it can be read. Works around the
 644                     # effects of <https://bugs.python.org/issue15002>
 645                     # introduced in Python 3.4.1.
 646                     err.fp._error = err
 647                     return err.fp
 648
 649             if errnote is False:
 650                 return False
 651             if errnote is None:
 652                 errnote = 'Unable to download webpage'
 653
 654             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 655             if fatal:
 656                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 657             else:
 658                 self._downloader.report_warning(errmsg)
 659                 return False
 660
 661     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
 662         """
 663         Return a tuple (page content as string, URL handle).
 664
 665         See _download_webpage docstring for arguments specification.
 666         """
 667         # Strip hashes from the URL (#1038)
 668         if isinstance(url_or_request, (compat_str, str)):
 669             url_or_request = url_or_request.partition('#')[0]
 670
 671         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 672         if urlh is False:
 673             assert not fatal
 674             return False
 675         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 676         return (content, urlh)
 677
 678     @staticmethod
 679     def _guess_encoding_from_content(content_type, webpage_bytes):
 680         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 681         if m:
 682             encoding = m.group(1)
 683         else:
 684             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 685                           webpage_bytes[:1024])
 686             if m:
 687                 encoding = m.group(1).decode('ascii')
 688             elif webpage_bytes.startswith(b'\xff\xfe'):
 689                 encoding = 'utf-16'
 690             else:
 691                 encoding = 'utf-8'
 692
 693         return encoding
 694
 695     def __check_blocked(self, content):
 696         first_block = content[:512]
 697         if ('<title>Access to this site is blocked</title>' in content
 698                 and 'Websense' in first_block):
 699             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 700             blocked_iframe = self._html_search_regex(
 701                 r'<iframe src="([^"]+)"', content,
 702                 'Websense information URL', default=None)
 703             if blocked_iframe:
 704                 msg += ' Visit %s for more details' % blocked_iframe
 705             raise ExtractorError(msg, expected=True)
 706         if '<title>The URL you requested has been blocked</title>' in first_block:
 707             msg = (
 708                 'Access to this webpage has been blocked by Indian censorship. '
 709                 'Use a VPN or proxy server (with --proxy) to route around it.')
 710             block_msg = self._html_search_regex(
 711                 r'</h1><p>(.*?)</p>',
 712                 content, 'block message', default=None)
 713             if block_msg:
 714                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 715             raise ExtractorError(msg, expected=True)
 716         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 717                 and 'blocklist.rkn.gov.ru' in content):
 718             raise ExtractorError(
 719                 'Access to this webpage has been blocked by decision of the Russian government. '
 720                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 721                 expected=True)
 722
 723     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 724         content_type = urlh.headers.get('Content-Type', '')
 725         webpage_bytes = urlh.read()
 726         if prefix is not None:
 727             webpage_bytes = prefix + webpage_bytes
 728         if not encoding:
 729             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 730         if self._downloader.params.get('dump_intermediate_pages', False):
 731             self.to_screen('Dumping request to ' + urlh.geturl())
 732             dump = base64.b64encode(webpage_bytes).decode('ascii')
 733             self._downloader.to_screen(dump)
 734         if self._downloader.params.get('write_pages', False):
 735             basen = '%s_%s' % (video_id, urlh.geturl())
 736             if len(basen) > 240:
 737                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 738                 basen = basen[:240 - len(h)] + h
 739             raw_filename = basen + '.dump'
 740             filename = sanitize_filename(raw_filename, restricted=True)
 741             self.to_screen('Saving request to ' + filename)
 742             # Working around MAX_PATH limitation on Windows (see
 743             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 744             if compat_os_name == 'nt':
 745                 absfilepath = os.path.abspath(filename)
 746                 if len(absfilepath) > 259:
 747                     filename = '\\\\?\\' + absfilepath
 748             with open(filename, 'wb') as outf:
 749                 outf.write(webpage_bytes)
 750
 751         try:
 752             content = webpage_bytes.decode(encoding, 'replace')
 753         except LookupError:
 754             content = webpage_bytes.decode('utf-8', 'replace')
 755
 756         self.__check_blocked(content)
 757
 758         return content
 759
 760     def _download_webpage(
 761             self, url_or_request, video_id, note=None, errnote=None,
 762             fatal=True, tries=1, timeout=5, encoding=None, data=None,
 763             headers={}, query={}, expected_status=None):
 764         """
 765         Return the data of the page as a string.
 766
 767         Arguments:
 768         url_or_request -- plain text URL as a string or
 769             a compat_urllib_request.Requestobject
 770         video_id -- Video/playlist/item identifier (string)
 771
 772         Keyword arguments:
 773         note -- note printed before downloading (string)
 774         errnote -- note printed in case of an error (string)
 775         fatal -- flag denoting whether error should be considered fatal,
 776             i.e. whether it should cause ExtractionError to be raised,
 777             otherwise a warning will be reported and extraction continued
 778         tries -- number of tries
 779         timeout -- sleep interval between tries
 780         encoding -- encoding for a page content decoding, guessed automatically
 781             when not explicitly specified
 782         data -- POST data (bytes)
 783         headers -- HTTP headers (dict)
 784         query -- URL query (dict)
 785         expected_status -- allows to accept failed HTTP requests (non 2xx
 786             status code) by explicitly specifying a set of accepted status
 787             codes. Can be any of the following entities:
 788                 - an integer type specifying an exact failed status code to
 789                   accept
 790                 - a list or a tuple of integer types specifying a list of
 791                   failed status codes to accept
 792                 - a callable accepting an actual failed status code and
 793                   returning True if it should be accepted
 794             Note that this argument does not affect success status codes (2xx)
 795             which are always accepted.
 796         """
 797
 798         success = False
 799         try_count = 0
 800         while success is False:
 801             try:
 802                 res = self._download_webpage_handle(
 803                     url_or_request, video_id, note, errnote, fatal,
 804                     encoding=encoding, data=data, headers=headers, query=query,
 805                     expected_status=expected_status)
 806                 success = True
 807             except compat_http_client.IncompleteRead as e:
 808                 try_count += 1
 809                 if try_count >= tries:
 810                     raise e
 811                 self._sleep(timeout, video_id)
 812         if res is False:
 813             return res
 814         else:
 815             content, _ = res
 816             return content
 817
 818     def _download_xml_handle(
 819             self, url_or_request, video_id, note='Downloading XML',
 820             errnote='Unable to download XML', transform_source=None,
 821             fatal=True, encoding=None, data=None, headers={}, query={},
 822             expected_status=None):
 823         """
 824         Return a tuple (xml as an compat_etree_Element, URL handle).
 825
 826         See _download_webpage docstring for arguments specification.
 827         """
 828         res = self._download_webpage_handle(
 829             url_or_request, video_id, note, errnote, fatal=fatal,
 830             encoding=encoding, data=data, headers=headers, query=query,
 831             expected_status=expected_status)
 832         if res is False:
 833             return res
 834         xml_string, urlh = res
 835         return self._parse_xml(
 836             xml_string, video_id, transform_source=transform_source,
 837             fatal=fatal), urlh
 838
 839     def _download_xml(
 840             self, url_or_request, video_id,
 841             note='Downloading XML', errnote='Unable to download XML',
 842             transform_source=None, fatal=True, encoding=None,
 843             data=None, headers={}, query={}, expected_status=None):
 844         """
 845         Return the xml as an compat_etree_Element.
 846
 847         See _download_webpage docstring for arguments specification.
 848         """
 849         res = self._download_xml_handle(
 850             url_or_request, video_id, note=note, errnote=errnote,
 851             transform_source=transform_source, fatal=fatal, encoding=encoding,
 852             data=data, headers=headers, query=query,
 853             expected_status=expected_status)
 854         return res if res is False else res[0]
 855
 856     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 857         if transform_source:
 858             xml_string = transform_source(xml_string)
 859         try:
 860             return compat_etree_fromstring(xml_string.encode('utf-8'))
 861         except compat_xml_parse_error as ve:
 862             errmsg = '%s: Failed to parse XML ' % video_id
 863             if fatal:
 864                 raise ExtractorError(errmsg, cause=ve)
 865             else:
 866                 self.report_warning(errmsg + str(ve))
 867
 868     def _download_json_handle(
 869             self, url_or_request, video_id, note='Downloading JSON metadata',
 870             errnote='Unable to download JSON metadata', transform_source=None,
 871             fatal=True, encoding=None, data=None, headers={}, query={},
 872             expected_status=None):
 873         """
 874         Return a tuple (JSON object, URL handle).
 875
 876         See _download_webpage docstring for arguments specification.
 877         """
 878         res = self._download_webpage_handle(
 879             url_or_request, video_id, note, errnote, fatal=fatal,
 880             encoding=encoding, data=data, headers=headers, query=query,
 881             expected_status=expected_status)
 882         if res is False:
 883             return res
 884         json_string, urlh = res
 885         return self._parse_json(
 886             json_string, video_id, transform_source=transform_source,
 887             fatal=fatal), urlh
 888
 889     def _download_json(
 890             self, url_or_request, video_id, note='Downloading JSON metadata',
 891             errnote='Unable to download JSON metadata', transform_source=None,
 892             fatal=True, encoding=None, data=None, headers={}, query={},
 893             expected_status=None):
 894         """
 895         Return the JSON object as a dict.
 896
 897         See _download_webpage docstring for arguments specification.
 898         """
 899         res = self._download_json_handle(
 900             url_or_request, video_id, note=note, errnote=errnote,
 901             transform_source=transform_source, fatal=fatal, encoding=encoding,
 902             data=data, headers=headers, query=query,
 903             expected_status=expected_status)
 904         return res if res is False else res[0]
 905
 906     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 907         if transform_source:
 908             json_string = transform_source(json_string)
 909         try:
 910             return json.loads(json_string)
 911         except ValueError as ve:
 912             errmsg = '%s: Failed to parse JSON ' % video_id
 913             if fatal:
 914                 raise ExtractorError(errmsg, cause=ve)
 915             else:
 916                 self.report_warning(errmsg + str(ve))
 917
 918     def report_warning(self, msg, video_id=None):
 919         idstr = '' if video_id is None else '%s: ' % video_id
 920         self._downloader.report_warning(
 921             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 922
 923     def to_screen(self, msg):
 924         """Print msg to screen, prefixing it with '[ie_name]'"""
 925         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 926
 927     def report_extraction(self, id_or_name):
 928         """Report information extraction."""
 929         self.to_screen('%s: Extracting information' % id_or_name)
 930
 931     def report_download_webpage(self, video_id):
 932         """Report webpage download."""
 933         self.to_screen('%s: Downloading webpage' % video_id)
 934
 935     def report_age_confirmation(self):
 936         """Report attempt to confirm age."""
 937         self.to_screen('Confirming age')
 938
 939     def report_login(self):
 940         """Report attempt to log in."""
 941         self.to_screen('Logging in')
 942
 943     @staticmethod
 944     def raise_login_required(msg='This video is only available for registered users'):
 945         raise ExtractorError(
 946             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 947             expected=True)
 948
 949     @staticmethod
 950     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
 951         raise GeoRestrictedError(msg, countries=countries)
 952
 953     # Methods for following #608
 954     @staticmethod
 955     def url_result(url, ie=None, video_id=None, video_title=None):
 956         """Returns a URL that points to a page that should be processed"""
 957         # TODO: ie should be the class used for getting the info
 958         video_info = {'_type': 'url',
 959                       'url': url,
 960                       'ie_key': ie}
 961         if video_id is not None:
 962             video_info['id'] = video_id
 963         if video_title is not None:
 964             video_info['title'] = video_title
 965         return video_info
 966
 967     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
 968         urls = orderedSet(
 969             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
 970             for m in matches)
 971         return self.playlist_result(
 972             urls, playlist_id=playlist_id, playlist_title=playlist_title)
 973
 974     @staticmethod
 975     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
 976         """Returns a playlist"""
 977         video_info = {'_type': 'playlist',
 978                       'entries': entries}
 979         video_info.update(kwargs)
 980         if playlist_id:
 981             video_info['id'] = playlist_id
 982         if playlist_title:
 983             video_info['title'] = playlist_title
 984         if playlist_description is not None:
 985             video_info['description'] = playlist_description
 986         return video_info
 987
 988     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 989         """
 990         Perform a regex search on the given string, using a single or a list of
 991         patterns returning the first matching group.
 992         In case of failure return a default value or raise a WARNING or a
 993         RegexNotFoundError, depending on fatal, specifying the field name.
 994         """
 995         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 996             mobj = re.search(pattern, string, flags)
 997         else:
 998             for p in pattern:
 999                 mobj = re.search(p, string, flags)
1000                 if mobj:
1001                     break
1002
1003         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
1004             _name = '\033[0;34m%s\033[0m' % name
1005         else:
1006             _name = name
1007
1008         if mobj:
1009             if group is None:
1010                 # return the first matching group
1011                 return next(g for g in mobj.groups() if g is not None)
1012             else:
1013                 return mobj.group(group)
1014         elif default is not NO_DEFAULT:
1015             return default
1016         elif fatal:
1017             raise RegexNotFoundError('Unable to extract %s' % _name)
1018         else:
1019             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
1020             return None
1021
1022     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1023         """
1024         Like _search_regex, but strips HTML tags and unescapes entities.
1025         """
1026         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1027         if res:
1028             return clean_html(res).strip()
1029         else:
1030             return res
1031
1032     def _get_netrc_login_info(self, netrc_machine=None):
1033         username = None
1034         password = None
1035         netrc_machine = netrc_machine or self._NETRC_MACHINE
1036
1037         if self._downloader.params.get('usenetrc', False):
1038             try:
1039                 info = netrc.netrc().authenticators(netrc_machine)
1040                 if info is not None:
1041                     username = info[0]
1042                     password = info[2]
1043                 else:
1044                     raise netrc.NetrcParseError(
1045                         'No authenticators for %s' % netrc_machine)
1046             except (IOError, netrc.NetrcParseError) as err:
1047                 self._downloader.report_warning(
1048                     'parsing .netrc: %s' % error_to_compat_str(err))
1049
1050         return username, password
1051
1052     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1053         """
1054         Get the login info as (username, password)
1055         First look for the manually specified credentials using username_option
1056         and password_option as keys in params dictionary. If no such credentials
1057         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1058         value.
1059         If there's no info available, return (None, None)
1060         """
1061         if self._downloader is None:
1062             return (None, None)
1063
1064         downloader_params = self._downloader.params
1065
1066         # Attempt to use provided username and password or .netrc data
1067         if downloader_params.get(username_option) is not None:
1068             username = downloader_params[username_option]
1069             password = downloader_params[password_option]
1070         else:
1071             username, password = self._get_netrc_login_info(netrc_machine)
1072
1073         return username, password
1074
1075     def _get_tfa_info(self, note='two-factor verification code'):
1076         """
1077         Get the two-factor authentication info
1078         TODO - asking the user will be required for sms/phone verify
1079         currently just uses the command line option
1080         If there's no info available, return None
1081         """
1082         if self._downloader is None:
1083             return None
1084         downloader_params = self._downloader.params
1085
1086         if downloader_params.get('twofactor') is not None:
1087             return downloader_params['twofactor']
1088
1089         return compat_getpass('Type %s and press [Return]: ' % note)
1090
1091     # Helper functions for extracting OpenGraph info
1092     @staticmethod
1093     def _og_regexes(prop):
1094         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1095         property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
1096                        % {'prop': re.escape(prop)})
1097         template = r'<meta[^>]+?%s[^>]+?%s'
1098         return [
1099             template % (property_re, content_re),
1100             template % (content_re, property_re),
1101         ]
1102
1103     @staticmethod
1104     def _meta_regex(prop):
1105         return r'''(?isx)<meta
1106                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1107                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1108
1109     def _og_search_property(self, prop, html, name=None, **kargs):
1110         if not isinstance(prop, (list, tuple)):
1111             prop = [prop]
1112         if name is None:
1113             name = 'OpenGraph %s' % prop[0]
1114         og_regexes = []
1115         for p in prop:
1116             og_regexes.extend(self._og_regexes(p))
1117         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1118         if escaped is None:
1119             return None
1120         return unescapeHTML(escaped)
1121
1122     def _og_search_thumbnail(self, html, **kargs):
1123         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1124
1125     def _og_search_description(self, html, **kargs):
1126         return self._og_search_property('description', html, fatal=False, **kargs)
1127
1128     def _og_search_title(self, html, **kargs):
1129         return self._og_search_property('title', html, **kargs)
1130
1131     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1132         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1133         if secure:
1134             regexes = self._og_regexes('video:secure_url') + regexes
1135         return self._html_search_regex(regexes, html, name, **kargs)
1136
1137     def _og_search_url(self, html, **kargs):
1138         return self._og_search_property('url', html, **kargs)
1139
1140     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1141         if not isinstance(name, (list, tuple)):
1142             name = [name]
1143         if display_name is None:
1144             display_name = name[0]
1145         return self._html_search_regex(
1146             [self._meta_regex(n) for n in name],
1147             html, display_name, fatal=fatal, group='content', **kwargs)
1148
1149     def _dc_search_uploader(self, html):
1150         return self._html_search_meta('dc.creator', html, 'uploader')
1151
1152     def _rta_search(self, html):
1153         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1154         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1155                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1156                      html):
1157             return 18
1158         return 0
1159
1160     def _media_rating_search(self, html):
1161         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1162         rating = self._html_search_meta('rating', html)
1163
1164         if not rating:
1165             return None
1166
1167         RATING_TABLE = {
1168             'safe for kids': 0,
1169             'general': 8,
1170             '14 years': 14,
1171             'mature': 17,
1172             'restricted': 19,
1173         }
1174         return RATING_TABLE.get(rating.lower())
1175
1176     def _family_friendly_search(self, html):
1177         # See http://schema.org/VideoObject
1178         family_friendly = self._html_search_meta(
1179             'isFamilyFriendly', html, default=None)
1180
1181         if not family_friendly:
1182             return None
1183
1184         RATING_TABLE = {
1185             '1': 0,
1186             'true': 0,
1187             '0': 18,
1188             'false': 18,
1189         }
1190         return RATING_TABLE.get(family_friendly.lower())
1191
1192     def _twitter_search_player(self, html):
1193         return self._html_search_meta('twitter:player', html,
1194                                       'twitter card player')
1195
1196     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1197         json_ld_list = list(re.finditer(JSON_LD_RE, html))
1198         default = kwargs.get('default', NO_DEFAULT)
1199         # JSON-LD may be malformed and thus `fatal` should be respected.
1200         # At the same time `default` may be passed that assumes `fatal=False`
1201         # for _search_regex. Let's simulate the same behavior here as well.
1202         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
1203         json_ld = []
1204         for mobj in json_ld_list:
1205             json_ld_item = self._parse_json(
1206                 mobj.group('json_ld'), video_id, fatal=fatal)
1207             if not json_ld_item:
1208                 continue
1209             if isinstance(json_ld_item, dict):
1210                 json_ld.append(json_ld_item)
1211             elif isinstance(json_ld_item, (list, tuple)):
1212                 json_ld.extend(json_ld_item)
1213         if json_ld:
1214             json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1215         if json_ld:
1216             return json_ld
1217         if default is not NO_DEFAULT:
1218             return default
1219         elif fatal:
1220             raise RegexNotFoundError('Unable to extract JSON-LD')
1221         else:
1222             self._downloader.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1223             return {}
1224
1225     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1226         if isinstance(json_ld, compat_str):
1227             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1228         if not json_ld:
1229             return {}
1230         info = {}
1231         if not isinstance(json_ld, (list, tuple, dict)):
1232             return info
1233         if isinstance(json_ld, dict):
1234             json_ld = [json_ld]
1235
1236         INTERACTION_TYPE_MAP = {
1237             'CommentAction': 'comment',
1238             'AgreeAction': 'like',
1239             'DisagreeAction': 'dislike',
1240             'LikeAction': 'like',
1241             'DislikeAction': 'dislike',
1242             'ListenAction': 'view',
1243             'WatchAction': 'view',
1244             'ViewAction': 'view',
1245         }
1246
1247         def extract_interaction_type(e):
1248             interaction_type = e.get('interactionType')
1249             if isinstance(interaction_type, dict):
1250                 interaction_type = interaction_type.get('@type')
1251             return str_or_none(interaction_type)
1252
1253         def extract_interaction_statistic(e):
1254             interaction_statistic = e.get('interactionStatistic')
1255             if isinstance(interaction_statistic, dict):
1256                 interaction_statistic = [interaction_statistic]
1257             if not isinstance(interaction_statistic, list):
1258                 return
1259             for is_e in interaction_statistic:
1260                 if not isinstance(is_e, dict):
1261                     continue
1262                 if is_e.get('@type') != 'InteractionCounter':
1263                     continue
1264                 interaction_type = extract_interaction_type(is_e)
1265                 if not interaction_type:
1266                     continue
1267                 # For interaction count some sites provide string instead of
1268                 # an integer (as per spec) with non digit characters (e.g. ",")
1269                 # so extracting count with more relaxed str_to_int
1270                 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1271                 if interaction_count is None:
1272                     continue
1273                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1274                 if not count_kind:
1275                     continue
1276                 count_key = '%s_count' % count_kind
1277                 if info.get(count_key) is not None:
1278                     continue
1279                 info[count_key] = interaction_count
1280
1281         def extract_video_object(e):
1282             assert e['@type'] == 'VideoObject'
1283             info.update({
1284                 'url': url_or_none(e.get('contentUrl')),
1285                 'title': unescapeHTML(e.get('name')),
1286                 'description': unescapeHTML(e.get('description')),
1287                 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
1288                 'duration': parse_duration(e.get('duration')),
1289                 'timestamp': unified_timestamp(e.get('uploadDate')),
1290                 'uploader': str_or_none(e.get('author')),
1291                 'filesize': float_or_none(e.get('contentSize')),
1292                 'tbr': int_or_none(e.get('bitrate')),
1293                 'width': int_or_none(e.get('width')),
1294                 'height': int_or_none(e.get('height')),
1295                 'view_count': int_or_none(e.get('interactionCount')),
1296             })
1297             extract_interaction_statistic(e)
1298
1299         for e in json_ld:
1300             if '@context' in e:
1301                 item_type = e.get('@type')
1302                 if expected_type is not None and expected_type != item_type:
1303                     continue
1304                 if item_type in ('TVEpisode', 'Episode'):
1305                     episode_name = unescapeHTML(e.get('name'))
1306                     info.update({
1307                         'episode': episode_name,
1308                         'episode_number': int_or_none(e.get('episodeNumber')),
1309                         'description': unescapeHTML(e.get('description')),
1310                     })
1311                     if not info.get('title') and episode_name:
1312                         info['title'] = episode_name
1313                     part_of_season = e.get('partOfSeason')
1314                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1315                         info.update({
1316                             'season': unescapeHTML(part_of_season.get('name')),
1317                             'season_number': int_or_none(part_of_season.get('seasonNumber')),
1318                         })
1319                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1320                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1321                         info['series'] = unescapeHTML(part_of_series.get('name'))
1322                 elif item_type == 'Movie':
1323                     info.update({
1324                         'title': unescapeHTML(e.get('name')),
1325                         'description': unescapeHTML(e.get('description')),
1326                         'duration': parse_duration(e.get('duration')),
1327                         'timestamp': unified_timestamp(e.get('dateCreated')),
1328                     })
1329                 elif item_type in ('Article', 'NewsArticle'):
1330                     info.update({
1331                         'timestamp': parse_iso8601(e.get('datePublished')),
1332                         'title': unescapeHTML(e.get('headline')),
1333                         'description': unescapeHTML(e.get('articleBody')),
1334                     })
1335                 elif item_type == 'VideoObject':
1336                     extract_video_object(e)
1337                     if expected_type is None:
1338                         continue
1339                     else:
1340                         break
1341                 video = e.get('video')
1342                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1343                     extract_video_object(video)
1344                 if expected_type is None:
1345                     continue
1346                 else:
1347                     break
1348         return dict((k, v) for k, v in info.items() if v is not None)
1349
1350     @staticmethod
1351     def _hidden_inputs(html):
1352         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1353         hidden_inputs = {}
1354         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1355             attrs = extract_attributes(input)
1356             if not input:
1357                 continue
1358             if attrs.get('type') not in ('hidden', 'submit'):
1359                 continue
1360             name = attrs.get('name') or attrs.get('id')
1361             value = attrs.get('value')
1362             if name and value is not None:
1363                 hidden_inputs[name] = value
1364         return hidden_inputs
1365
1366     def _form_hidden_inputs(self, form_id, html):
1367         form = self._search_regex(
1368             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1369             html, '%s form' % form_id, group='form')
1370         return self._hidden_inputs(form)
1371
1372     class FormatSort:
1373         regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<seperator>[~:])(?P<limit>.*?))?)? *$'
1374
1375         default = ('hidden', 'hasvid', 'ie_pref', 'lang', 'quality',
1376                    'res', 'fps', 'codec:vp9.2', 'size', 'br', 'asr',
1377                    'proto', 'ext', 'has_audio', 'source', 'format_id')  # These must not be aliases
1378
1379         settings = {
1380             'vcodec': {'type': 'ordered', 'regex': True,
1381                        'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1382             'acodec': {'type': 'ordered', 'regex': True,
1383                        'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']},
1384             'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1385                       'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', 'm3u8', '.*dash', '', 'mms|rtsp', 'none', 'f4']},
1386             'vext': {'type': 'ordered', 'field': 'video_ext',
1387                      'order': ('mp4', 'webm', 'flv', '', 'none'),
1388                      'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1389             'aext': {'type': 'ordered', 'field': 'audio_ext',
1390                      'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1391                      'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1392             'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1393             'ie_pref': {'priority': True, 'type': 'extractor'},
1394             'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1395             'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1396             'lang': {'priority': True, 'convert': 'ignore', 'type': 'extractor', 'field': 'language_preference'},
1397             'quality': {'convert': 'float_none', 'type': 'extractor'},
1398             'filesize': {'convert': 'bytes'},
1399             'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1400             'id': {'convert': 'string', 'field': 'format_id'},
1401             'height': {'convert': 'float_none'},
1402             'width': {'convert': 'float_none'},
1403             'fps': {'convert': 'float_none'},
1404             'tbr': {'convert': 'float_none'},
1405             'vbr': {'convert': 'float_none'},
1406             'abr': {'convert': 'float_none'},
1407             'asr': {'convert': 'float_none'},
1408             'source': {'convert': 'ignore', 'type': 'extractor', 'field': 'source_preference'},
1409
1410             'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1411             'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1412             'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1413             'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1414             'res': {'type': 'multiple', 'field': ('height', 'width'), 'function': min},
1415
1416             # Most of these exist only for compatibility reasons
1417             'dimension': {'type': 'alias', 'field': 'res'},
1418             'resolution': {'type': 'alias', 'field': 'res'},
1419             'extension': {'type': 'alias', 'field': 'ext'},
1420             'bitrate': {'type': 'alias', 'field': 'br'},
1421             'total_bitrate': {'type': 'alias', 'field': 'tbr'},
1422             'video_bitrate': {'type': 'alias', 'field': 'vbr'},
1423             'audio_bitrate': {'type': 'alias', 'field': 'abr'},
1424             'framerate': {'type': 'alias', 'field': 'fps'},
1425             'language_preference': {'type': 'alias', 'field': 'lang'},  # not named as 'language' because such a field exists
1426             'protocol': {'type': 'alias', 'field': 'proto'},
1427             'source_preference': {'type': 'alias', 'field': 'source'},
1428             'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1429             'filesize_estimate': {'type': 'alias', 'field': 'size'},
1430             'samplerate': {'type': 'alias', 'field': 'asr'},
1431             'video_ext': {'type': 'alias', 'field': 'vext'},
1432             'audio_ext': {'type': 'alias', 'field': 'aext'},
1433             'video_codec': {'type': 'alias', 'field': 'vcodec'},
1434             'audio_codec': {'type': 'alias', 'field': 'acodec'},
1435             'video': {'type': 'alias', 'field': 'hasvid'},
1436             'has_video': {'type': 'alias', 'field': 'hasvid'},
1437             'audio': {'type': 'alias', 'field': 'hasaud'},
1438             'has_audio': {'type': 'alias', 'field': 'hasaud'},
1439             'extractor': {'type': 'alias', 'field': 'ie_pref'},
1440             'preference': {'type': 'alias', 'field': 'ie_pref'},
1441             'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
1442             'format_id': {'type': 'alias', 'field': 'id'},
1443         }
1444
1445         _order = []
1446
1447         def _get_field_setting(self, field, key):
1448             if field not in self.settings:
1449                 self.settings[field] = {}
1450             propObj = self.settings[field]
1451             if key not in propObj:
1452                 type = propObj.get('type')
1453                 if key == 'field':
1454                     default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1455                 elif key == 'convert':
1456                     default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1457                 else:
1458                     default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,), 'function': max}.get(key, None)
1459                 propObj[key] = default
1460             return propObj[key]
1461
1462         def _resolve_field_value(self, field, value, convertNone=False):
1463             if value is None:
1464                 if not convertNone:
1465                     return None
1466             else:
1467                 value = value.lower()
1468             conversion = self._get_field_setting(field, 'convert')
1469             if conversion == 'ignore':
1470                 return None
1471             if conversion == 'string':
1472                 return value
1473             elif conversion == 'float_none':
1474                 return float_or_none(value)
1475             elif conversion == 'bytes':
1476                 return FileDownloader.parse_bytes(value)
1477             elif conversion == 'order':
1478                 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1479                 use_regex = self._get_field_setting(field, 'regex')
1480                 list_length = len(order_list)
1481                 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1482                 if use_regex and value is not None:
1483                     for i, regex in enumerate(order_list):
1484                         if regex and re.match(regex, value):
1485                             return list_length - i
1486                     return list_length - empty_pos  # not in list
1487                 else:  # not regex or  value = None
1488                     return list_length - (order_list.index(value) if value in order_list else empty_pos)
1489             else:
1490                 if value.isnumeric():
1491                     return float(value)
1492                 else:
1493                     self.settings[field]['convert'] = 'string'
1494                     return value
1495
1496         def evaluate_params(self, params, sort_extractor):
1497             self._use_free_order = params.get('prefer_free_formats', False)
1498             self._sort_user = params.get('format_sort', [])
1499             self._sort_extractor = sort_extractor
1500
1501             def add_item(field, reverse, closest, limit_text):
1502                 field = field.lower()
1503                 if field in self._order:
1504                     return
1505                 self._order.append(field)
1506                 limit = self._resolve_field_value(field, limit_text)
1507                 data = {
1508                     'reverse': reverse,
1509                     'closest': False if limit is None else closest,
1510                     'limit_text': limit_text,
1511                     'limit': limit}
1512                 if field in self.settings:
1513                     self.settings[field].update(data)
1514                 else:
1515                     self.settings[field] = data
1516
1517             sort_list = (
1518                 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1519                 + (tuple() if params.get('format_sort_force', False)
1520                    else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1521                 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1522
1523             for item in sort_list:
1524                 match = re.match(self.regex, item)
1525                 if match is None:
1526                     raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1527                 field = match.group('field')
1528                 if field is None:
1529                     continue
1530                 if self._get_field_setting(field, 'type') == 'alias':
1531                     field = self._get_field_setting(field, 'field')
1532                 reverse = match.group('reverse') is not None
1533                 closest = match.group('seperator') == '~'
1534                 limit_text = match.group('limit')
1535
1536                 has_limit = limit_text is not None
1537                 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1538                 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1539
1540                 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1541                 limits = limit_text.split(":") if has_multiple_limits else (limit_text,) if has_limit else tuple()
1542                 limit_count = len(limits)
1543                 for (i, f) in enumerate(fields):
1544                     add_item(f, reverse, closest,
1545                              limits[i] if i < limit_count
1546                              else limits[0] if has_limit and not has_multiple_limits
1547                              else None)
1548
1549         def print_verbose_info(self, to_screen):
1550             to_screen('[debug] Sort order given by user: %s' % ','.join(self._sort_user))
1551             if self._sort_extractor:
1552                 to_screen('[debug] Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1553             to_screen('[debug] Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1554                 '+' if self._get_field_setting(field, 'reverse') else '', field,
1555                 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1556                               self._get_field_setting(field, 'limit_text'),
1557                               self._get_field_setting(field, 'limit'))
1558                 if self._get_field_setting(field, 'limit_text') is not None else '')
1559                 for field in self._order if self._get_field_setting(field, 'visible')]))
1560
1561         def _calculate_field_preference_from_value(self, format, field, type, value):
1562             reverse = self._get_field_setting(field, 'reverse')
1563             closest = self._get_field_setting(field, 'closest')
1564             limit = self._get_field_setting(field, 'limit')
1565
1566             if type == 'extractor':
1567                 maximum = self._get_field_setting(field, 'max')
1568                 if value is None or (maximum is not None and value >= maximum):
1569                     value = -1
1570             elif type == 'boolean':
1571                 in_list = self._get_field_setting(field, 'in_list')
1572                 not_in_list = self._get_field_setting(field, 'not_in_list')
1573                 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1574             elif type == 'ordered':
1575                 value = self._resolve_field_value(field, value, True)
1576
1577             # try to convert to number
1578             val_num = float_or_none(value)
1579             is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1580             if is_num:
1581                 value = val_num
1582
1583             return ((-10, 0) if value is None
1584                     else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
1585                     else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1586                     else (0, value, 0) if not reverse and (limit is None or value <= limit)
1587                     else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1588                     else (-1, value, 0))
1589
1590         def _calculate_field_preference(self, format, field):
1591             type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
1592             get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1593             if type == 'multiple':
1594                 type = 'field'  # Only 'field' is allowed in multiple for now
1595                 actual_fields = self._get_field_setting(field, 'field')
1596
1597                 def wrapped_function(values):
1598                     values = tuple(filter(lambda x: x is not None, values))
1599                     return (self._get_field_setting(field, 'function')(*values) if len(values) > 1
1600                             else values[0] if values
1601                             else None)
1602
1603                 value = wrapped_function((get_value(f) for f in actual_fields))
1604             else:
1605                 value = get_value(field)
1606             return self._calculate_field_preference_from_value(format, field, type, value)
1607
1608         def calculate_preference(self, format):
1609             # Determine missing protocol
1610             if not format.get('protocol'):
1611                 format['protocol'] = determine_protocol(format)
1612
1613             # Determine missing ext
1614             if not format.get('ext') and 'url' in format:
1615                 format['ext'] = determine_ext(format['url'])
1616             if format.get('vcodec') == 'none':
1617                 format['audio_ext'] = format['ext']
1618                 format['video_ext'] = 'none'
1619             else:
1620                 format['video_ext'] = format['ext']
1621                 format['audio_ext'] = 'none'
1622             # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
1623             #    format['preference'] = -1000
1624
1625             # Determine missing bitrates
1626             if format.get('tbr') is None:
1627                 if format.get('vbr') is not None and format.get('abr') is not None:
1628                     format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1629             else:
1630                 if format.get('vcodec') != "none" and format.get('vbr') is None:
1631                     format['vbr'] = format.get('tbr') - format.get('abr', 0)
1632                 if format.get('acodec') != "none" and format.get('abr') is None:
1633                     format['abr'] = format.get('tbr') - format.get('vbr', 0)
1634
1635             return tuple(self._calculate_field_preference(format, field) for field in self._order)
1636
1637     def _sort_formats(self, formats, field_preference=[]):
1638         if not formats:
1639             raise ExtractorError('No video formats found')
1640         format_sort = self.FormatSort()  # params and to_screen are taken from the downloader
1641         format_sort.evaluate_params(self._downloader.params, field_preference)
1642         if self._downloader.params.get('verbose', False):
1643             format_sort.print_verbose_info(self._downloader.to_screen)
1644         formats.sort(key=lambda f: format_sort.calculate_preference(f))
1645
1646     def _check_formats(self, formats, video_id):
1647         if formats:
1648             formats[:] = filter(
1649                 lambda f: self._is_valid_url(
1650                     f['url'], video_id,
1651                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1652                 formats)
1653
1654     @staticmethod
1655     def _remove_duplicate_formats(formats):
1656         format_urls = set()
1657         unique_formats = []
1658         for f in formats:
1659             if f['url'] not in format_urls:
1660                 format_urls.add(f['url'])
1661                 unique_formats.append(f)
1662         formats[:] = unique_formats
1663
1664     def _is_valid_url(self, url, video_id, item='video', headers={}):
1665         url = self._proto_relative_url(url, scheme='http:')
1666         # For now assume non HTTP(S) URLs always valid
1667         if not (url.startswith('http://') or url.startswith('https://')):
1668             return True
1669         try:
1670             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1671             return True
1672         except ExtractorError as e:
1673             self.to_screen(
1674                 '%s: %s URL is invalid, skipping: %s'
1675                 % (video_id, item, error_to_compat_str(e.cause)))
1676             return False
1677
1678     def http_scheme(self):
1679         """ Either "http:" or "https:", depending on the user's preferences """
1680         return (
1681             'http:'
1682             if self._downloader.params.get('prefer_insecure', False)
1683             else 'https:')
1684
1685     def _proto_relative_url(self, url, scheme=None):
1686         if url is None:
1687             return url
1688         if url.startswith('//'):
1689             if scheme is None:
1690                 scheme = self.http_scheme()
1691             return scheme + url
1692         else:
1693             return url
1694
1695     def _sleep(self, timeout, video_id, msg_template=None):
1696         if msg_template is None:
1697             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1698         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1699         self.to_screen(msg)
1700         time.sleep(timeout)
1701
1702     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1703                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1704                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1705         manifest = self._download_xml(
1706             manifest_url, video_id, 'Downloading f4m manifest',
1707             'Unable to download f4m manifest',
1708             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1709             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1710             transform_source=transform_source,
1711             fatal=fatal, data=data, headers=headers, query=query)
1712
1713         if manifest is False:
1714             return []
1715
1716         return self._parse_f4m_formats(
1717             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1718             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1719
1720     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1721                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1722                            fatal=True, m3u8_id=None):
1723         if not isinstance(manifest, compat_etree_Element) and not fatal:
1724             return []
1725
1726         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1727         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1728         if akamai_pv is not None and ';' in akamai_pv.text:
1729             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1730             if playerVerificationChallenge.strip() != '':
1731                 return []
1732
1733         formats = []
1734         manifest_version = '1.0'
1735         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1736         if not media_nodes:
1737             manifest_version = '2.0'
1738             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1739         # Remove unsupported DRM protected media from final formats
1740         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1741         media_nodes = remove_encrypted_media(media_nodes)
1742         if not media_nodes:
1743             return formats
1744
1745         manifest_base_url = get_base_url(manifest)
1746
1747         bootstrap_info = xpath_element(
1748             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1749             'bootstrap info', default=None)
1750
1751         vcodec = None
1752         mime_type = xpath_text(
1753             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1754             'base URL', default=None)
1755         if mime_type and mime_type.startswith('audio/'):
1756             vcodec = 'none'
1757
1758         for i, media_el in enumerate(media_nodes):
1759             tbr = int_or_none(media_el.attrib.get('bitrate'))
1760             width = int_or_none(media_el.attrib.get('width'))
1761             height = int_or_none(media_el.attrib.get('height'))
1762             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1763             # If <bootstrapInfo> is present, the specified f4m is a
1764             # stream-level manifest, and only set-level manifests may refer to
1765             # external resources.  See section 11.4 and section 4 of F4M spec
1766             if bootstrap_info is None:
1767                 media_url = None
1768                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1769                 if manifest_version == '2.0':
1770                     media_url = media_el.attrib.get('href')
1771                 if media_url is None:
1772                     media_url = media_el.attrib.get('url')
1773                 if not media_url:
1774                     continue
1775                 manifest_url = (
1776                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1777                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1778                 # If media_url is itself a f4m manifest do the recursive extraction
1779                 # since bitrates in parent manifest (this one) and media_url manifest
1780                 # may differ leading to inability to resolve the format by requested
1781                 # bitrate in f4m downloader
1782                 ext = determine_ext(manifest_url)
1783                 if ext == 'f4m':
1784                     f4m_formats = self._extract_f4m_formats(
1785                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1786                         transform_source=transform_source, fatal=fatal)
1787                     # Sometimes stream-level manifest contains single media entry that
1788                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1789                     # At the same time parent's media entry in set-level manifest may
1790                     # contain it. We will copy it from parent in such cases.
1791                     if len(f4m_formats) == 1:
1792                         f = f4m_formats[0]
1793                         f.update({
1794                             'tbr': f.get('tbr') or tbr,
1795                             'width': f.get('width') or width,
1796                             'height': f.get('height') or height,
1797                             'format_id': f.get('format_id') if not tbr else format_id,
1798                             'vcodec': vcodec,
1799                         })
1800                     formats.extend(f4m_formats)
1801                     continue
1802                 elif ext == 'm3u8':
1803                     formats.extend(self._extract_m3u8_formats(
1804                         manifest_url, video_id, 'mp4', preference=preference,
1805                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1806                     continue
1807             formats.append({
1808                 'format_id': format_id,
1809                 'url': manifest_url,
1810                 'manifest_url': manifest_url,
1811                 'ext': 'flv' if bootstrap_info is not None else None,
1812                 'protocol': 'f4m',
1813                 'tbr': tbr,
1814                 'width': width,
1815                 'height': height,
1816                 'vcodec': vcodec,
1817                 'preference': preference,
1818                 'quality': quality,
1819             })
1820         return formats
1821
1822     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1823         return {
1824             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1825             'url': m3u8_url,
1826             'ext': ext,
1827             'protocol': 'm3u8',
1828             'preference': preference - 100 if preference else -100,
1829             'quality': quality,
1830             'resolution': 'multiple',
1831             'format_note': 'Quality selection URL',
1832         }
1833
1834     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1835                               entry_protocol='m3u8', preference=None, quality=None,
1836                               m3u8_id=None, live=False, note=None, errnote=None,
1837                               fatal=True, data=None, headers={}, query={}):
1838         res = self._download_webpage_handle(
1839             m3u8_url, video_id,
1840             note=note or 'Downloading m3u8 information',
1841             errnote=errnote or 'Failed to download m3u8 information',
1842             fatal=fatal, data=data, headers=headers, query=query)
1843
1844         if res is False:
1845             return []
1846
1847         m3u8_doc, urlh = res
1848         m3u8_url = urlh.geturl()
1849
1850         return self._parse_m3u8_formats(
1851             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1852             preference=preference, quality=quality, m3u8_id=m3u8_id,
1853             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
1854             headers=headers, query=query, video_id=video_id)
1855
1856     def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
1857                             entry_protocol='m3u8', preference=None, quality=None,
1858                             m3u8_id=None, live=False, note=None, errnote=None,
1859                             fatal=True, data=None, headers={}, query={}, video_id=None):
1860         if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
1861             return []
1862
1863         if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc):  # Apple FairPlay
1864             return []
1865
1866         formats = []
1867
1868         format_url = lambda u: (
1869             u
1870             if re.match(r'^https?://', u)
1871             else compat_urlparse.urljoin(m3u8_url, u))
1872
1873         split_discontinuity = self._downloader.params.get('hls_split_discontinuity', False)
1874
1875         # References:
1876         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
1877         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
1878         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
1879
1880         # We should try extracting formats only from master playlists [1, 4.3.4],
1881         # i.e. playlists that describe available qualities. On the other hand
1882         # media playlists [1, 4.3.3] should be returned as is since they contain
1883         # just the media without qualities renditions.
1884         # Fortunately, master playlist can be easily distinguished from media
1885         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
1886         # master playlist tags MUST NOT appear in a media playlist and vice versa.
1887         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
1888         # media playlist and MUST NOT appear in master playlist thus we can
1889         # clearly detect media playlist with this criterion.
1890
1891         def _extract_m3u8_playlist_formats(format_url, m3u8_doc=None):
1892             if not m3u8_doc:
1893                 res = self._download_webpage_handle(
1894                     format_url, video_id,
1895                     note=False,
1896                     errnote=errnote or 'Failed to download m3u8 playlist information',
1897                     fatal=fatal, data=data, headers=headers, query=query)
1898
1899                 if res is False:
1900                     return []
1901
1902                 m3u8_doc, urlh = res
1903                 format_url = urlh.geturl()
1904
1905             playlist_formats = []
1906             i = (
1907                 0
1908                 if split_discontinuity
1909                 else None)
1910             format_info = {
1911                 'index': i,
1912                 'key_data': None,
1913                 'files': [],
1914             }
1915             for line in m3u8_doc.splitlines():
1916                 if not line.startswith('#'):
1917                     format_info['files'].append(line)
1918                 elif split_discontinuity and line.startswith('#EXT-X-DISCONTINUITY'):
1919                     i += 1
1920                     playlist_formats.append(format_info)
1921                     format_info = {
1922                         'index': i,
1923                         'url': format_url,
1924                         'files': [],
1925                     }
1926             playlist_formats.append(format_info)
1927             return playlist_formats
1928
1929         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
1930
1931             playlist_formats = _extract_m3u8_playlist_formats(m3u8_doc, True)
1932
1933             for format in playlist_formats:
1934                 format_id = []
1935                 if m3u8_id:
1936                     format_id.append(m3u8_id)
1937                 format_index = format.get('index')
1938                 if format_index:
1939                     format_id.append(str(format_index))
1940                 f = {
1941                     'format_id': '-'.join(format_id),
1942                     'format_index': format_index,
1943                     'url': m3u8_url,
1944                     'ext': ext,
1945                     'protocol': entry_protocol,
1946                     'preference': preference,
1947                     'quality': quality,
1948                 }
1949                 formats.append(f)
1950
1951             return formats
1952
1953         groups = {}
1954         last_stream_inf = {}
1955
1956         def extract_media(x_media_line):
1957             media = parse_m3u8_attributes(x_media_line)
1958             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
1959             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
1960             if not (media_type and group_id and name):
1961                 return
1962             groups.setdefault(group_id, []).append(media)
1963             if media_type not in ('VIDEO', 'AUDIO'):
1964                 return
1965             media_url = media.get('URI')
1966             if media_url:
1967                 manifest_url = format_url(media_url)
1968                 format_id = []
1969                 playlist_formats = _extract_m3u8_playlist_formats(manifest_url)
1970
1971                 for format in playlist_formats:
1972                     format_index = format.get('index')
1973                     for v in (m3u8_id, group_id, name):
1974                         if v:
1975                             format_id.append(v)
1976                     if format_index:
1977                         format_id.append(str(format_index))
1978                     f = {
1979                         'format_id': '-'.join(format_id),
1980                         'format_index': format_index,
1981                         'url': manifest_url,
1982                         'manifest_url': m3u8_url,
1983                         'language': media.get('LANGUAGE'),
1984                         'ext': ext,
1985                         'protocol': entry_protocol,
1986                         'preference': preference,
1987                         'quality': quality,
1988                     }
1989                     if media_type == 'AUDIO':
1990                         f['vcodec'] = 'none'
1991                     formats.append(f)
1992
1993         def build_stream_name():
1994             # Despite specification does not mention NAME attribute for
1995             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
1996             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
1997             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
1998             stream_name = last_stream_inf.get('NAME')
1999             if stream_name:
2000                 return stream_name
2001             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2002             # from corresponding rendition group
2003             stream_group_id = last_stream_inf.get('VIDEO')
2004             if not stream_group_id:
2005                 return
2006             stream_group = groups.get(stream_group_id)
2007             if not stream_group:
2008                 return stream_group_id
2009             rendition = stream_group[0]
2010             return rendition.get('NAME') or stream_group_id
2011
2012         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2013         # chance to detect video only formats when EXT-X-STREAM-INF tags
2014         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2015         for line in m3u8_doc.splitlines():
2016             if line.startswith('#EXT-X-MEDIA:'):
2017                 extract_media(line)
2018
2019         for line in m3u8_doc.splitlines():
2020             if line.startswith('#EXT-X-STREAM-INF:'):
2021                 last_stream_inf = parse_m3u8_attributes(line)
2022             elif line.startswith('#') or not line.strip():
2023                 continue
2024             else:
2025                 tbr = float_or_none(
2026                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2027                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2028                 manifest_url = format_url(line.strip())
2029
2030                 playlist_formats = _extract_m3u8_playlist_formats(manifest_url)
2031
2032                 for format in playlist_formats:
2033                     format_id = []
2034                     if m3u8_id:
2035                         format_id.append(m3u8_id)
2036                     format_index = format.get('index')
2037                     stream_name = build_stream_name()
2038                     # Bandwidth of live streams may differ over time thus making
2039                     # format_id unpredictable. So it's better to keep provided
2040                     # format_id intact.
2041                     if not live:
2042                         format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
2043                     if format_index:
2044                         format_id.append(str(format_index))
2045                     f = {
2046                         'format_id': '-'.join(format_id),
2047                         'format_index': format_index,
2048                         'url': manifest_url,
2049                         'manifest_url': m3u8_url,
2050                         'tbr': tbr,
2051                         'ext': ext,
2052                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2053                         'protocol': entry_protocol,
2054                         'preference': preference,
2055                         'quality': quality,
2056                     }
2057                     resolution = last_stream_inf.get('RESOLUTION')
2058                     if resolution:
2059                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2060                         if mobj:
2061                             f['width'] = int(mobj.group('width'))
2062                             f['height'] = int(mobj.group('height'))
2063                     # Unified Streaming Platform
2064                     mobj = re.search(
2065                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2066                     if mobj:
2067                         abr, vbr = mobj.groups()
2068                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2069                         f.update({
2070                             'vbr': vbr,
2071                             'abr': abr,
2072                         })
2073                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2074                     f.update(codecs)
2075                     audio_group_id = last_stream_inf.get('AUDIO')
2076                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2077                     # references a rendition group MUST have a CODECS attribute.
2078                     # However, this is not always respected, for example, [2]
2079                     # contains EXT-X-STREAM-INF tag which references AUDIO
2080                     # rendition group but does not have CODECS and despite
2081                     # referencing an audio group it represents a complete
2082                     # (with audio and video) format. So, for such cases we will
2083                     # ignore references to rendition groups and treat them
2084                     # as complete formats.
2085                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2086                         audio_group = groups.get(audio_group_id)
2087                         if audio_group and audio_group[0].get('URI'):
2088                             # TODO: update acodec for audio only formats with
2089                             # the same GROUP-ID
2090                             f['acodec'] = 'none'
2091                     formats.append(f)
2092
2093                     # for DailyMotion
2094                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2095                     if progressive_uri:
2096                         http_f = f.copy()
2097                         del http_f['manifest_url']
2098                         http_f.update({
2099                             'format_id': f['format_id'].replace('hls-', 'http-'),
2100                             'protocol': 'http',
2101                             'url': progressive_uri,
2102                         })
2103                         formats.append(http_f)
2104
2105                 last_stream_inf = {}
2106         return formats
2107
2108     @staticmethod
2109     def _xpath_ns(path, namespace=None):
2110         if not namespace:
2111             return path
2112         out = []
2113         for c in path.split('/'):
2114             if not c or c == '.':
2115                 out.append(c)
2116             else:
2117                 out.append('{%s}%s' % (namespace, c))
2118         return '/'.join(out)
2119
2120     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2121         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2122
2123         if smil is False:
2124             assert not fatal
2125             return []
2126
2127         namespace = self._parse_smil_namespace(smil)
2128
2129         return self._parse_smil_formats(
2130             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2131
2132     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2133         smil = self._download_smil(smil_url, video_id, fatal=fatal)
2134         if smil is False:
2135             return {}
2136         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2137
2138     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2139         return self._download_xml(
2140             smil_url, video_id, 'Downloading SMIL file',
2141             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2142
2143     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2144         namespace = self._parse_smil_namespace(smil)
2145
2146         formats = self._parse_smil_formats(
2147             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2148         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2149
2150         video_id = os.path.splitext(url_basename(smil_url))[0]
2151         title = None
2152         description = None
2153         upload_date = None
2154         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2155             name = meta.attrib.get('name')
2156             content = meta.attrib.get('content')
2157             if not name or not content:
2158                 continue
2159             if not title and name == 'title':
2160                 title = content
2161             elif not description and name in ('description', 'abstract'):
2162                 description = content
2163             elif not upload_date and name == 'date':
2164                 upload_date = unified_strdate(content)
2165
2166         thumbnails = [{
2167             'id': image.get('type'),
2168             'url': image.get('src'),
2169             'width': int_or_none(image.get('width')),
2170             'height': int_or_none(image.get('height')),
2171         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2172
2173         return {
2174             'id': video_id,
2175             'title': title or video_id,
2176             'description': description,
2177             'upload_date': upload_date,
2178             'thumbnails': thumbnails,
2179             'formats': formats,
2180             'subtitles': subtitles,
2181         }
2182
2183     def _parse_smil_namespace(self, smil):
2184         return self._search_regex(
2185             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2186
2187     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2188         base = smil_url
2189         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2190             b = meta.get('base') or meta.get('httpBase')
2191             if b:
2192                 base = b
2193                 break
2194
2195         formats = []
2196         rtmp_count = 0
2197         http_count = 0
2198         m3u8_count = 0
2199
2200         srcs = []
2201         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2202         for medium in media:
2203             src = medium.get('src')
2204             if not src or src in srcs:
2205                 continue
2206             srcs.append(src)
2207
2208             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2209             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2210             width = int_or_none(medium.get('width'))
2211             height = int_or_none(medium.get('height'))
2212             proto = medium.get('proto')
2213             ext = medium.get('ext')
2214             src_ext = determine_ext(src)
2215             streamer = medium.get('streamer') or base
2216
2217             if proto == 'rtmp' or streamer.startswith('rtmp'):
2218                 rtmp_count += 1
2219                 formats.append({
2220                     'url': streamer,
2221                     'play_path': src,
2222                     'ext': 'flv',
2223                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2224                     'tbr': bitrate,
2225                     'filesize': filesize,
2226                     'width': width,
2227                     'height': height,
2228                 })
2229                 if transform_rtmp_url:
2230                     streamer, src = transform_rtmp_url(streamer, src)
2231                     formats[-1].update({
2232                         'url': streamer,
2233                         'play_path': src,
2234                     })
2235                 continue
2236
2237             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2238             src_url = src_url.strip()
2239
2240             if proto == 'm3u8' or src_ext == 'm3u8':
2241                 m3u8_formats = self._extract_m3u8_formats(
2242                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2243                 if len(m3u8_formats) == 1:
2244                     m3u8_count += 1
2245                     m3u8_formats[0].update({
2246                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2247                         'tbr': bitrate,
2248                         'width': width,
2249                         'height': height,
2250                     })
2251                 formats.extend(m3u8_formats)
2252             elif src_ext == 'f4m':
2253                 f4m_url = src_url
2254                 if not f4m_params:
2255                     f4m_params = {
2256                         'hdcore': '3.2.0',
2257                         'plugin': 'flowplayer-3.2.0.1',
2258                     }
2259                 f4m_url += '&' if '?' in f4m_url else '?'
2260                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2261                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2262             elif src_ext == 'mpd':
2263                 formats.extend(self._extract_mpd_formats(
2264                     src_url, video_id, mpd_id='dash', fatal=False))
2265             elif re.search(r'\.ism/[Mm]anifest', src_url):
2266                 formats.extend(self._extract_ism_formats(
2267                     src_url, video_id, ism_id='mss', fatal=False))
2268             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2269                 http_count += 1
2270                 formats.append({
2271                     'url': src_url,
2272                     'ext': ext or src_ext or 'flv',
2273                     'format_id': 'http-%d' % (bitrate or http_count),
2274                     'tbr': bitrate,
2275                     'filesize': filesize,
2276                     'width': width,
2277                     'height': height,
2278                 })
2279
2280         return formats
2281
2282     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2283         urls = []
2284         subtitles = {}
2285         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2286             src = textstream.get('src')
2287             if not src or src in urls:
2288                 continue
2289             urls.append(src)
2290             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2291             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2292             subtitles.setdefault(lang, []).append({
2293                 'url': src,
2294                 'ext': ext,
2295             })
2296         return subtitles
2297
2298     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2299         xspf = self._download_xml(
2300             xspf_url, playlist_id, 'Downloading xpsf playlist',
2301             'Unable to download xspf manifest', fatal=fatal)
2302         if xspf is False:
2303             return []
2304         return self._parse_xspf(
2305             xspf, playlist_id, xspf_url=xspf_url,
2306             xspf_base_url=base_url(xspf_url))
2307
2308     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2309         NS_MAP = {
2310             'xspf': 'http://xspf.org/ns/0/',
2311             's1': 'http://static.streamone.nl/player/ns/0',
2312         }
2313
2314         entries = []
2315         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2316             title = xpath_text(
2317                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2318             description = xpath_text(
2319                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2320             thumbnail = xpath_text(
2321                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2322             duration = float_or_none(
2323                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2324
2325             formats = []
2326             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2327                 format_url = urljoin(xspf_base_url, location.text)
2328                 if not format_url:
2329                     continue
2330                 formats.append({
2331                     'url': format_url,
2332                     'manifest_url': xspf_url,
2333                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2334                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2335                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2336                 })
2337             self._sort_formats(formats)
2338
2339             entries.append({
2340                 'id': playlist_id,
2341                 'title': title,
2342                 'description': description,
2343                 'thumbnail': thumbnail,
2344                 'duration': duration,
2345                 'formats': formats,
2346             })
2347         return entries
2348
2349     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2350         res = self._download_xml_handle(
2351             mpd_url, video_id,
2352             note=note or 'Downloading MPD manifest',
2353             errnote=errnote or 'Failed to download MPD manifest',
2354             fatal=fatal, data=data, headers=headers, query=query)
2355         if res is False:
2356             return []
2357         mpd_doc, urlh = res
2358         if mpd_doc is None:
2359             return []
2360         mpd_base_url = base_url(urlh.geturl())
2361
2362         return self._parse_mpd_formats(
2363             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2364
2365     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2366         """
2367         Parse formats from MPD manifest.
2368         References:
2369          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2370             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2371          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2372         """
2373         if not self._downloader.params.get('dynamic_mpd'):
2374             if mpd_doc.get('type') == 'dynamic':
2375                 return []
2376
2377         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2378
2379         def _add_ns(path):
2380             return self._xpath_ns(path, namespace)
2381
2382         def is_drm_protected(element):
2383             return element.find(_add_ns('ContentProtection')) is not None
2384
2385         def extract_multisegment_info(element, ms_parent_info):
2386             ms_info = ms_parent_info.copy()
2387
2388             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2389             # common attributes and elements.  We will only extract relevant
2390             # for us.
2391             def extract_common(source):
2392                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2393                 if segment_timeline is not None:
2394                     s_e = segment_timeline.findall(_add_ns('S'))
2395                     if s_e:
2396                         ms_info['total_number'] = 0
2397                         ms_info['s'] = []
2398                         for s in s_e:
2399                             r = int(s.get('r', 0))
2400                             ms_info['total_number'] += 1 + r
2401                             ms_info['s'].append({
2402                                 't': int(s.get('t', 0)),
2403                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2404                                 'd': int(s.attrib['d']),
2405                                 'r': r,
2406                             })
2407                 start_number = source.get('startNumber')
2408                 if start_number:
2409                     ms_info['start_number'] = int(start_number)
2410                 timescale = source.get('timescale')
2411                 if timescale:
2412                     ms_info['timescale'] = int(timescale)
2413                 segment_duration = source.get('duration')
2414                 if segment_duration:
2415                     ms_info['segment_duration'] = float(segment_duration)
2416
2417             def extract_Initialization(source):
2418                 initialization = source.find(_add_ns('Initialization'))
2419                 if initialization is not None:
2420                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2421
2422             segment_list = element.find(_add_ns('SegmentList'))
2423             if segment_list is not None:
2424                 extract_common(segment_list)
2425                 extract_Initialization(segment_list)
2426                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2427                 if segment_urls_e:
2428                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2429             else:
2430                 segment_template = element.find(_add_ns('SegmentTemplate'))
2431                 if segment_template is not None:
2432                     extract_common(segment_template)
2433                     media = segment_template.get('media')
2434                     if media:
2435                         ms_info['media'] = media
2436                     initialization = segment_template.get('initialization')
2437                     if initialization:
2438                         ms_info['initialization'] = initialization
2439                     else:
2440                         extract_Initialization(segment_template)
2441             return ms_info
2442
2443         skip_unplayable = not self._downloader.params.get('allow_unplayable_formats')
2444
2445         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2446         formats = []
2447         for period in mpd_doc.findall(_add_ns('Period')):
2448             period_duration = parse_duration(period.get('duration')) or mpd_duration
2449             period_ms_info = extract_multisegment_info(period, {
2450                 'start_number': 1,
2451                 'timescale': 1,
2452             })
2453             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2454                 if skip_unplayable and is_drm_protected(adaptation_set):
2455                     continue
2456                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2457                 for representation in adaptation_set.findall(_add_ns('Representation')):
2458                     if skip_unplayable and is_drm_protected(representation):
2459                         continue
2460                     representation_attrib = adaptation_set.attrib.copy()
2461                     representation_attrib.update(representation.attrib)
2462                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2463                     mime_type = representation_attrib['mimeType']
2464                     content_type = mime_type.split('/')[0]
2465                     if content_type == 'text':
2466                         # TODO implement WebVTT downloading
2467                         pass
2468                     elif content_type in ('video', 'audio'):
2469                         base_url = ''
2470                         for element in (representation, adaptation_set, period, mpd_doc):
2471                             base_url_e = element.find(_add_ns('BaseURL'))
2472                             if base_url_e is not None:
2473                                 base_url = base_url_e.text + base_url
2474                                 if re.match(r'^https?://', base_url):
2475                                     break
2476                         if mpd_base_url and not re.match(r'^https?://', base_url):
2477                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
2478                                 mpd_base_url += '/'
2479                             base_url = mpd_base_url + base_url
2480                         representation_id = representation_attrib.get('id')
2481                         lang = representation_attrib.get('lang')
2482                         url_el = representation.find(_add_ns('BaseURL'))
2483                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2484                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2485                         f = {
2486                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
2487                             'manifest_url': mpd_url,
2488                             'ext': mimetype2ext(mime_type),
2489                             'width': int_or_none(representation_attrib.get('width')),
2490                             'height': int_or_none(representation_attrib.get('height')),
2491                             'tbr': float_or_none(bandwidth, 1000),
2492                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2493                             'fps': int_or_none(representation_attrib.get('frameRate')),
2494                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2495                             'format_note': 'DASH %s' % content_type,
2496                             'filesize': filesize,
2497                             'container': mimetype2ext(mime_type) + '_dash',
2498                         }
2499                         f.update(parse_codecs(representation_attrib.get('codecs')))
2500                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2501
2502                         def prepare_template(template_name, identifiers):
2503                             tmpl = representation_ms_info[template_name]
2504                             # First of, % characters outside $...$ templates
2505                             # must be escaped by doubling for proper processing
2506                             # by % operator string formatting used further (see
2507                             # https://github.com/ytdl-org/youtube-dl/issues/16867).
2508                             t = ''
2509                             in_template = False
2510                             for c in tmpl:
2511                                 t += c
2512                                 if c == '$':
2513                                     in_template = not in_template
2514                                 elif c == '%' and not in_template:
2515                                     t += c
2516                             # Next, $...$ templates are translated to their
2517                             # %(...) counterparts to be used with % operator
2518                             t = t.replace('$RepresentationID$', representation_id)
2519                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2520                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2521                             t.replace('$$', '$')
2522                             return t
2523
2524                         # @initialization is a regular template like @media one
2525                         # so it should be handled just the same way (see
2526                         # https://github.com/ytdl-org/youtube-dl/issues/11605)
2527                         if 'initialization' in representation_ms_info:
2528                             initialization_template = prepare_template(
2529                                 'initialization',
2530                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2531                                 # $Time$ shall not be included for @initialization thus
2532                                 # only $Bandwidth$ remains
2533                                 ('Bandwidth', ))
2534                             representation_ms_info['initialization_url'] = initialization_template % {
2535                                 'Bandwidth': bandwidth,
2536                             }
2537
2538                         def location_key(location):
2539                             return 'url' if re.match(r'^https?://', location) else 'path'
2540
2541                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2542
2543                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2544                             media_location_key = location_key(media_template)
2545
2546                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2547                             # can't be used at the same time
2548                             if '%(Number' in media_template and 's' not in representation_ms_info:
2549                                 segment_duration = None
2550                                 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2551                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2552                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2553                                 representation_ms_info['fragments'] = [{
2554                                     media_location_key: media_template % {
2555                                         'Number': segment_number,
2556                                         'Bandwidth': bandwidth,
2557                                     },
2558                                     'duration': segment_duration,
2559                                 } for segment_number in range(
2560                                     representation_ms_info['start_number'],
2561                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2562                             else:
2563                                 # $Number*$ or $Time$ in media template with S list available
2564                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2565                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2566                                 representation_ms_info['fragments'] = []
2567                                 segment_time = 0
2568                                 segment_d = None
2569                                 segment_number = representation_ms_info['start_number']
2570
2571                                 def add_segment_url():
2572                                     segment_url = media_template % {
2573                                         'Time': segment_time,
2574                                         'Bandwidth': bandwidth,
2575                                         'Number': segment_number,
2576                                     }
2577                                     representation_ms_info['fragments'].append({
2578                                         media_location_key: segment_url,
2579                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2580                                     })
2581
2582                                 for num, s in enumerate(representation_ms_info['s']):
2583                                     segment_time = s.get('t') or segment_time
2584                                     segment_d = s['d']
2585                                     add_segment_url()
2586                                     segment_number += 1
2587                                     for r in range(s.get('r', 0)):
2588                                         segment_time += segment_d
2589                                         add_segment_url()
2590                                         segment_number += 1
2591                                     segment_time += segment_d
2592                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2593                             # No media template
2594                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2595                             # or any YouTube dashsegments video
2596                             fragments = []
2597                             segment_index = 0
2598                             timescale = representation_ms_info['timescale']
2599                             for s in representation_ms_info['s']:
2600                                 duration = float_or_none(s['d'], timescale)
2601                                 for r in range(s.get('r', 0) + 1):
2602                                     segment_uri = representation_ms_info['segment_urls'][segment_index]
2603                                     fragments.append({
2604                                         location_key(segment_uri): segment_uri,
2605                                         'duration': duration,
2606                                     })
2607                                     segment_index += 1
2608                             representation_ms_info['fragments'] = fragments
2609                         elif 'segment_urls' in representation_ms_info:
2610                             # Segment URLs with no SegmentTimeline
2611                             # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2612                             # https://github.com/ytdl-org/youtube-dl/pull/14844
2613                             fragments = []
2614                             segment_duration = float_or_none(
2615                                 representation_ms_info['segment_duration'],
2616                                 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2617                             for segment_url in representation_ms_info['segment_urls']:
2618                                 fragment = {
2619                                     location_key(segment_url): segment_url,
2620                                 }
2621                                 if segment_duration:
2622                                     fragment['duration'] = segment_duration
2623                                 fragments.append(fragment)
2624                             representation_ms_info['fragments'] = fragments
2625                         # If there is a fragments key available then we correctly recognized fragmented media.
2626                         # Otherwise we will assume unfragmented media with direct access. Technically, such
2627                         # assumption is not necessarily correct since we may simply have no support for
2628                         # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2629                         if 'fragments' in representation_ms_info:
2630                             f.update({
2631                                 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2632                                 'url': mpd_url or base_url,
2633                                 'fragment_base_url': base_url,
2634                                 'fragments': [],
2635                                 'protocol': 'http_dash_segments',
2636                             })
2637                             if 'initialization_url' in representation_ms_info:
2638                                 initialization_url = representation_ms_info['initialization_url']
2639                                 if not f.get('url'):
2640                                     f['url'] = initialization_url
2641                                 f['fragments'].append({location_key(initialization_url): initialization_url})
2642                             f['fragments'].extend(representation_ms_info['fragments'])
2643                         else:
2644                             # Assuming direct URL to unfragmented media.
2645                             f['url'] = base_url
2646                         formats.append(f)
2647                     else:
2648                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2649         return formats
2650
2651     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2652         res = self._download_xml_handle(
2653             ism_url, video_id,
2654             note=note or 'Downloading ISM manifest',
2655             errnote=errnote or 'Failed to download ISM manifest',
2656             fatal=fatal, data=data, headers=headers, query=query)
2657         if res is False:
2658             return []
2659         ism_doc, urlh = res
2660         if ism_doc is None:
2661             return []
2662
2663         return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
2664
2665     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
2666         """
2667         Parse formats from ISM manifest.
2668         References:
2669          1. [MS-SSTR]: Smooth Streaming Protocol,
2670             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2671         """
2672         if ism_doc.get('IsLive') == 'TRUE':
2673             return []
2674         if (not self._downloader.params.get('allow_unplayable_formats')
2675                 and ism_doc.find('Protection') is not None):
2676             return []
2677
2678         duration = int(ism_doc.attrib['Duration'])
2679         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2680
2681         formats = []
2682         for stream in ism_doc.findall('StreamIndex'):
2683             stream_type = stream.get('Type')
2684             if stream_type not in ('video', 'audio'):
2685                 continue
2686             url_pattern = stream.attrib['Url']
2687             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2688             stream_name = stream.get('Name')
2689             for track in stream.findall('QualityLevel'):
2690                 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
2691                 # TODO: add support for WVC1 and WMAP
2692                 if fourcc not in ('H264', 'AVC1', 'AACL'):
2693                     self.report_warning('%s is not a supported codec' % fourcc)
2694                     continue
2695                 tbr = int(track.attrib['Bitrate']) // 1000
2696                 # [1] does not mention Width and Height attributes. However,
2697                 # they're often present while MaxWidth and MaxHeight are
2698                 # missing, so should be used as fallbacks
2699                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2700                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2701                 sampling_rate = int_or_none(track.get('SamplingRate'))
2702
2703                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2704                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2705
2706                 fragments = []
2707                 fragment_ctx = {
2708                     'time': 0,
2709                 }
2710                 stream_fragments = stream.findall('c')
2711                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2712                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2713                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2714                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2715                     if not fragment_ctx['duration']:
2716                         try:
2717                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2718                         except IndexError:
2719                             next_fragment_time = duration
2720                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2721                     for _ in range(fragment_repeat):
2722                         fragments.append({
2723                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2724                             'duration': fragment_ctx['duration'] / stream_timescale,
2725                         })
2726                         fragment_ctx['time'] += fragment_ctx['duration']
2727
2728                 format_id = []
2729                 if ism_id:
2730                     format_id.append(ism_id)
2731                 if stream_name:
2732                     format_id.append(stream_name)
2733                 format_id.append(compat_str(tbr))
2734
2735                 formats.append({
2736                     'format_id': '-'.join(format_id),
2737                     'url': ism_url,
2738                     'manifest_url': ism_url,
2739                     'ext': 'ismv' if stream_type == 'video' else 'isma',
2740                     'width': width,
2741                     'height': height,
2742                     'tbr': tbr,
2743                     'asr': sampling_rate,
2744                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
2745                     'acodec': 'none' if stream_type == 'video' else fourcc,
2746                     'protocol': 'ism',
2747                     'fragments': fragments,
2748                     '_download_params': {
2749                         'duration': duration,
2750                         'timescale': stream_timescale,
2751                         'width': width or 0,
2752                         'height': height or 0,
2753                         'fourcc': fourcc,
2754                         'codec_private_data': track.get('CodecPrivateData'),
2755                         'sampling_rate': sampling_rate,
2756                         'channels': int_or_none(track.get('Channels', 2)),
2757                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2758                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2759                     },
2760                 })
2761         return formats
2762
2763     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
2764         def absolute_url(item_url):
2765             return urljoin(base_url, item_url)
2766
2767         def parse_content_type(content_type):
2768             if not content_type:
2769                 return {}
2770             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2771             if ctr:
2772                 mimetype, codecs = ctr.groups()
2773                 f = parse_codecs(codecs)
2774                 f['ext'] = mimetype2ext(mimetype)
2775                 return f
2776             return {}
2777
2778         def _media_formats(src, cur_media_type, type_info={}):
2779             full_url = absolute_url(src)
2780             ext = type_info.get('ext') or determine_ext(full_url)
2781             if ext == 'm3u8':
2782                 is_plain_url = False
2783                 formats = self._extract_m3u8_formats(
2784                     full_url, video_id, ext='mp4',
2785                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2786                     preference=preference, quality=quality, fatal=False)
2787             elif ext == 'mpd':
2788                 is_plain_url = False
2789                 formats = self._extract_mpd_formats(
2790                     full_url, video_id, mpd_id=mpd_id, fatal=False)
2791             else:
2792                 is_plain_url = True
2793                 formats = [{
2794                     'url': full_url,
2795                     'vcodec': 'none' if cur_media_type == 'audio' else None,
2796                 }]
2797             return is_plain_url, formats
2798
2799         entries = []
2800         # amp-video and amp-audio are very similar to their HTML5 counterparts
2801         # so we wll include them right here (see
2802         # https://www.ampproject.org/docs/reference/components/amp-video)
2803         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
2804         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
2805         media_tags = [(media_tag, media_tag_name, media_type, '')
2806                       for media_tag, media_tag_name, media_type
2807                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
2808         media_tags.extend(re.findall(
2809             # We only allow video|audio followed by a whitespace or '>'.
2810             # Allowing more characters may end up in significant slow down (see
2811             # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
2812             # http://www.porntrex.com/maps/videositemap.xml).
2813             r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
2814         for media_tag, _, media_type, media_content in media_tags:
2815             media_info = {
2816                 'formats': [],
2817                 'subtitles': {},
2818             }
2819             media_attributes = extract_attributes(media_tag)
2820             src = strip_or_none(media_attributes.get('src'))
2821             if src:
2822                 _, formats = _media_formats(src, media_type)
2823                 media_info['formats'].extend(formats)
2824             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
2825             if media_content:
2826                 for source_tag in re.findall(r'<source[^>]+>', media_content):
2827                     s_attr = extract_attributes(source_tag)
2828                     # data-video-src and data-src are non standard but seen
2829                     # several times in the wild
2830                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
2831                     if not src:
2832                         continue
2833                     f = parse_content_type(s_attr.get('type'))
2834                     is_plain_url, formats = _media_formats(src, media_type, f)
2835                     if is_plain_url:
2836                         # width, height, res, label and title attributes are
2837                         # all not standard but seen several times in the wild
2838                         labels = [
2839                             s_attr.get(lbl)
2840                             for lbl in ('label', 'title')
2841                             if str_or_none(s_attr.get(lbl))
2842                         ]
2843                         width = int_or_none(s_attr.get('width'))
2844                         height = (int_or_none(s_attr.get('height'))
2845                                   or int_or_none(s_attr.get('res')))
2846                         if not width or not height:
2847                             for lbl in labels:
2848                                 resolution = parse_resolution(lbl)
2849                                 if not resolution:
2850                                     continue
2851                                 width = width or resolution.get('width')
2852                                 height = height or resolution.get('height')
2853                         for lbl in labels:
2854                             tbr = parse_bitrate(lbl)
2855                             if tbr:
2856                                 break
2857                         else:
2858                             tbr = None
2859                         f.update({
2860                             'width': width,
2861                             'height': height,
2862                             'tbr': tbr,
2863                             'format_id': s_attr.get('label') or s_attr.get('title'),
2864                         })
2865                         f.update(formats[0])
2866                         media_info['formats'].append(f)
2867                     else:
2868                         media_info['formats'].extend(formats)
2869                 for track_tag in re.findall(r'<track[^>]+>', media_content):
2870                     track_attributes = extract_attributes(track_tag)
2871                     kind = track_attributes.get('kind')
2872                     if not kind or kind in ('subtitles', 'captions'):
2873                         src = strip_or_none(track_attributes.get('src'))
2874                         if not src:
2875                             continue
2876                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
2877                         media_info['subtitles'].setdefault(lang, []).append({
2878                             'url': absolute_url(src),
2879                         })
2880             for f in media_info['formats']:
2881                 f.setdefault('http_headers', {})['Referer'] = base_url
2882             if media_info['formats'] or media_info['subtitles']:
2883                 entries.append(media_info)
2884         return entries
2885
2886     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2887         signed = 'hdnea=' in manifest_url
2888         if not signed:
2889             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
2890             manifest_url = re.sub(
2891                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
2892                 '', manifest_url).strip('?')
2893
2894         formats = []
2895
2896         hdcore_sign = 'hdcore=3.7.0'
2897         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2898         hds_host = hosts.get('hds')
2899         if hds_host:
2900             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2901         if 'hdcore=' not in f4m_url:
2902             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2903         f4m_formats = self._extract_f4m_formats(
2904             f4m_url, video_id, f4m_id='hds', fatal=False)
2905         for entry in f4m_formats:
2906             entry.update({'extra_param_to_segment_url': hdcore_sign})
2907         formats.extend(f4m_formats)
2908
2909         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2910         hls_host = hosts.get('hls')
2911         if hls_host:
2912             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2913         m3u8_formats = self._extract_m3u8_formats(
2914             m3u8_url, video_id, 'mp4', 'm3u8_native',
2915             m3u8_id='hls', fatal=False)
2916         formats.extend(m3u8_formats)
2917
2918         http_host = hosts.get('http')
2919         if http_host and m3u8_formats and not signed:
2920             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
2921             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
2922             qualities_length = len(qualities)
2923             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
2924                 i = 0
2925                 for f in m3u8_formats:
2926                     if f['vcodec'] != 'none':
2927                         for protocol in ('http', 'https'):
2928                             http_f = f.copy()
2929                             del http_f['manifest_url']
2930                             http_url = re.sub(
2931                                 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
2932                             http_f.update({
2933                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
2934                                 'url': http_url,
2935                                 'protocol': protocol,
2936                             })
2937                             formats.append(http_f)
2938                         i += 1
2939
2940         return formats
2941
2942     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2943         query = compat_urlparse.urlparse(url).query
2944         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2945         mobj = re.search(
2946             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
2947         url_base = mobj.group('url')
2948         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
2949         formats = []
2950
2951         def manifest_url(manifest):
2952             m_url = '%s/%s' % (http_base_url, manifest)
2953             if query:
2954                 m_url += '?%s' % query
2955             return m_url
2956
2957         if 'm3u8' not in skip_protocols:
2958             formats.extend(self._extract_m3u8_formats(
2959                 manifest_url('playlist.m3u8'), video_id, 'mp4',
2960                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2961         if 'f4m' not in skip_protocols:
2962             formats.extend(self._extract_f4m_formats(
2963                 manifest_url('manifest.f4m'),
2964                 video_id, f4m_id='hds', fatal=False))
2965         if 'dash' not in skip_protocols:
2966             formats.extend(self._extract_mpd_formats(
2967                 manifest_url('manifest.mpd'),
2968                 video_id, mpd_id='dash', fatal=False))
2969         if re.search(r'(?:/smil:|\.smil)', url_base):
2970             if 'smil' not in skip_protocols:
2971                 rtmp_formats = self._extract_smil_formats(
2972                     manifest_url('jwplayer.smil'),
2973                     video_id, fatal=False)
2974                 for rtmp_format in rtmp_formats:
2975                     rtsp_format = rtmp_format.copy()
2976                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2977                     del rtsp_format['play_path']
2978                     del rtsp_format['ext']
2979                     rtsp_format.update({
2980                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2981                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2982                         'protocol': 'rtsp',
2983                     })
2984                     formats.extend([rtmp_format, rtsp_format])
2985         else:
2986             for protocol in ('rtmp', 'rtsp'):
2987                 if protocol not in skip_protocols:
2988                     formats.append({
2989                         'url': '%s:%s' % (protocol, url_base),
2990                         'format_id': protocol,
2991                         'protocol': protocol,
2992                     })
2993         return formats
2994
2995     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
2996         mobj = re.search(
2997             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
2998             webpage)
2999         if mobj:
3000             try:
3001                 jwplayer_data = self._parse_json(mobj.group('options'),
3002                                                  video_id=video_id,
3003                                                  transform_source=transform_source)
3004             except ExtractorError:
3005                 pass
3006             else:
3007                 if isinstance(jwplayer_data, dict):
3008                     return jwplayer_data
3009
3010     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3011         jwplayer_data = self._find_jwplayer_data(
3012             webpage, video_id, transform_source=js_to_json)
3013         return self._parse_jwplayer_data(
3014             jwplayer_data, video_id, *args, **kwargs)
3015
3016     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3017                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3018         # JWPlayer backward compatibility: flattened playlists
3019         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3020         if 'playlist' not in jwplayer_data:
3021             jwplayer_data = {'playlist': [jwplayer_data]}
3022
3023         entries = []
3024
3025         # JWPlayer backward compatibility: single playlist item
3026         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3027         if not isinstance(jwplayer_data['playlist'], list):
3028             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3029
3030         for video_data in jwplayer_data['playlist']:
3031             # JWPlayer backward compatibility: flattened sources
3032             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3033             if 'sources' not in video_data:
3034                 video_data['sources'] = [video_data]
3035
3036             this_video_id = video_id or video_data['mediaid']
3037
3038             formats = self._parse_jwplayer_formats(
3039                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3040                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3041
3042             subtitles = {}
3043             tracks = video_data.get('tracks')
3044             if tracks and isinstance(tracks, list):
3045                 for track in tracks:
3046                     if not isinstance(track, dict):
3047                         continue
3048                     track_kind = track.get('kind')
3049                     if not track_kind or not isinstance(track_kind, compat_str):
3050                         continue
3051                     if track_kind.lower() not in ('captions', 'subtitles'):
3052                         continue
3053                     track_url = urljoin(base_url, track.get('file'))
3054                     if not track_url:
3055                         continue
3056                     subtitles.setdefault(track.get('label') or 'en', []).append({
3057                         'url': self._proto_relative_url(track_url)
3058                     })
3059
3060             entry = {
3061                 'id': this_video_id,
3062                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3063                 'description': clean_html(video_data.get('description')),
3064                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3065                 'timestamp': int_or_none(video_data.get('pubdate')),
3066                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3067                 'subtitles': subtitles,
3068             }
3069             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3070             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3071                 entry.update({
3072                     '_type': 'url_transparent',
3073                     'url': formats[0]['url'],
3074                 })
3075             else:
3076                 self._sort_formats(formats)
3077                 entry['formats'] = formats
3078             entries.append(entry)
3079         if len(entries) == 1:
3080             return entries[0]
3081         else:
3082             return self.playlist_result(entries)
3083
3084     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3085                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3086         urls = []
3087         formats = []
3088         for source in jwplayer_sources_data:
3089             if not isinstance(source, dict):
3090                 continue
3091             source_url = urljoin(
3092                 base_url, self._proto_relative_url(source.get('file')))
3093             if not source_url or source_url in urls:
3094                 continue
3095             urls.append(source_url)
3096             source_type = source.get('type') or ''
3097             ext = mimetype2ext(source_type) or determine_ext(source_url)
3098             if source_type == 'hls' or ext == 'm3u8':
3099                 formats.extend(self._extract_m3u8_formats(
3100                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3101                     m3u8_id=m3u8_id, fatal=False))
3102             elif source_type == 'dash' or ext == 'mpd':
3103                 formats.extend(self._extract_mpd_formats(
3104                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3105             elif ext == 'smil':
3106                 formats.extend(self._extract_smil_formats(
3107                     source_url, video_id, fatal=False))
3108             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3109             elif source_type.startswith('audio') or ext in (
3110                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3111                 formats.append({
3112                     'url': source_url,
3113                     'vcodec': 'none',
3114                     'ext': ext,
3115                 })
3116             else:
3117                 height = int_or_none(source.get('height'))
3118                 if height is None:
3119                     # Often no height is provided but there is a label in
3120                     # format like "1080p", "720p SD", or 1080.
3121                     height = int_or_none(self._search_regex(
3122                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3123                         'height', default=None))
3124                 a_format = {
3125                     'url': source_url,
3126                     'width': int_or_none(source.get('width')),
3127                     'height': height,
3128                     'tbr': int_or_none(source.get('bitrate')),
3129                     'ext': ext,
3130                 }
3131                 if source_url.startswith('rtmp'):
3132                     a_format['ext'] = 'flv'
3133                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3134                     # of jwplayer.flash.swf
3135                     rtmp_url_parts = re.split(
3136                         r'((?:mp4|mp3|flv):)', source_url, 1)
3137                     if len(rtmp_url_parts) == 3:
3138                         rtmp_url, prefix, play_path = rtmp_url_parts
3139                         a_format.update({
3140                             'url': rtmp_url,
3141                             'play_path': prefix + play_path,
3142                         })
3143                     if rtmp_params:
3144                         a_format.update(rtmp_params)
3145                 formats.append(a_format)
3146         return formats
3147
3148     def _live_title(self, name):
3149         """ Generate the title for a live video """
3150         now = datetime.datetime.now()
3151         now_str = now.strftime('%Y-%m-%d %H:%M')
3152         return name + ' ' + now_str
3153
3154     def _int(self, v, name, fatal=False, **kwargs):
3155         res = int_or_none(v, **kwargs)
3156         if 'get_attr' in kwargs:
3157             print(getattr(v, kwargs['get_attr']))
3158         if res is None:
3159             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3160             if fatal:
3161                 raise ExtractorError(msg)
3162             else:
3163                 self._downloader.report_warning(msg)
3164         return res
3165
3166     def _float(self, v, name, fatal=False, **kwargs):
3167         res = float_or_none(v, **kwargs)
3168         if res is None:
3169             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3170             if fatal:
3171                 raise ExtractorError(msg)
3172             else:
3173                 self._downloader.report_warning(msg)
3174         return res
3175
3176     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3177                     path='/', secure=False, discard=False, rest={}, **kwargs):
3178         cookie = compat_cookiejar_Cookie(
3179             0, name, value, port, port is not None, domain, True,
3180             domain.startswith('.'), path, True, secure, expire_time,
3181             discard, None, None, rest)
3182         self._downloader.cookiejar.set_cookie(cookie)
3183
3184     def _get_cookies(self, url):
3185         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
3186         req = sanitized_Request(url)
3187         self._downloader.cookiejar.add_cookie_header(req)
3188         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
3189
3190     def _apply_first_set_cookie_header(self, url_handle, cookie):
3191         """
3192         Apply first Set-Cookie header instead of the last. Experimental.
3193
3194         Some sites (e.g. [1-3]) may serve two cookies under the same name
3195         in Set-Cookie header and expect the first (old) one to be set rather
3196         than second (new). However, as of RFC6265 the newer one cookie
3197         should be set into cookie store what actually happens.
3198         We will workaround this issue by resetting the cookie to
3199         the first one manually.
3200         1. https://new.vk.com/
3201         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3202         3. https://learning.oreilly.com/
3203         """
3204         for header, cookies in url_handle.headers.items():
3205             if header.lower() != 'set-cookie':
3206                 continue
3207             if sys.version_info[0] >= 3:
3208                 cookies = cookies.encode('iso-8859-1')
3209             cookies = cookies.decode('utf-8')
3210             cookie_value = re.search(
3211                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3212             if cookie_value:
3213                 value, domain = cookie_value.groups()
3214                 self._set_cookie(domain, cookie, value)
3215                 break
3216
3217     def get_testcases(self, include_onlymatching=False):
3218         t = getattr(self, '_TEST', None)
3219         if t:
3220             assert not hasattr(self, '_TESTS'), \
3221                 '%s has _TEST and _TESTS' % type(self).__name__
3222             tests = [t]
3223         else:
3224             tests = getattr(self, '_TESTS', [])
3225         for t in tests:
3226             if not include_onlymatching and t.get('only_matching', False):
3227                 continue
3228             t['name'] = type(self).__name__[:-len('IE')]
3229             yield t
3230
3231     def is_suitable(self, age_limit):
3232         """ Test whether the extractor is generally suitable for the given
3233         age limit (i.e. pornographic sites are not, all others usually are) """
3234
3235         any_restricted = False
3236         for tc in self.get_testcases(include_onlymatching=False):
3237             if tc.get('playlist', []):
3238                 tc = tc['playlist'][0]
3239             is_restricted = age_restricted(
3240                 tc.get('info_dict', {}).get('age_limit'), age_limit)
3241             if not is_restricted:
3242                 return True
3243             any_restricted = any_restricted or is_restricted
3244         return not any_restricted
3245
3246     def extract_subtitles(self, *args, **kwargs):
3247         if (self._downloader.params.get('writesubtitles', False)
3248                 or self._downloader.params.get('listsubtitles')):
3249             return self._get_subtitles(*args, **kwargs)
3250         return {}
3251
3252     def _get_subtitles(self, *args, **kwargs):
3253         raise NotImplementedError('This method must be implemented by subclasses')
3254
3255     @staticmethod
3256     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3257         """ Merge subtitle items for one language. Items with duplicated URLs
3258         will be dropped. """
3259         list1_urls = set([item['url'] for item in subtitle_list1])
3260         ret = list(subtitle_list1)
3261         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
3262         return ret
3263
3264     @classmethod
3265     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
3266         """ Merge two subtitle dictionaries, language by language. """
3267         ret = dict(subtitle_dict1)
3268         for lang in subtitle_dict2:
3269             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
3270         return ret
3271
3272     def extract_automatic_captions(self, *args, **kwargs):
3273         if (self._downloader.params.get('writeautomaticsub', False)
3274                 or self._downloader.params.get('listsubtitles')):
3275             return self._get_automatic_captions(*args, **kwargs)
3276         return {}
3277
3278     def _get_automatic_captions(self, *args, **kwargs):
3279         raise NotImplementedError('This method must be implemented by subclasses')
3280
3281     def mark_watched(self, *args, **kwargs):
3282         if (self._downloader.params.get('mark_watched', False)
3283                 and (self._get_login_info()[0] is not None
3284                      or self._downloader.params.get('cookiefile') is not None)):
3285             self._mark_watched(*args, **kwargs)
3286
3287     def _mark_watched(self, *args, **kwargs):
3288         raise NotImplementedError('This method must be implemented by subclasses')
3289
3290     def geo_verification_headers(self):
3291         headers = {}
3292         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
3293         if geo_verification_proxy:
3294             headers['Ytdl-request-proxy'] = geo_verification_proxy
3295         return headers
3296
3297     def _generic_id(self, url):
3298         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3299
3300     def _generic_title(self, url):
3301         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3302
3303
3304 class SearchInfoExtractor(InfoExtractor):
3305     """
3306     Base class for paged search queries extractors.
3307     They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
3308     Instances should define _SEARCH_KEY and _MAX_RESULTS.
3309     """
3310
3311     @classmethod
3312     def _make_valid_url(cls):
3313         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3314
3315     @classmethod
3316     def suitable(cls, url):
3317         return re.match(cls._make_valid_url(), url) is not None
3318
3319     def _real_extract(self, query):
3320         mobj = re.match(self._make_valid_url(), query)
3321         if mobj is None:
3322             raise ExtractorError('Invalid search query "%s"' % query)
3323
3324         prefix = mobj.group('prefix')
3325         query = mobj.group('query')
3326         if prefix == '':
3327             return self._get_n_results(query, 1)
3328         elif prefix == 'all':
3329             return self._get_n_results(query, self._MAX_RESULTS)
3330         else:
3331             n = int(prefix)
3332             if n <= 0:
3333                 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
3334             elif n > self._MAX_RESULTS:
3335                 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3336                 n = self._MAX_RESULTS
3337             return self._get_n_results(query, n)
3338
3339     def _get_n_results(self, query, n):
3340         """Get a specified number of results for a query"""
3341         raise NotImplementedError('This method must be implemented by subclasses')
3342
3343     @property
3344     def SEARCH_KEY(self):
3345         return self._SEARCH_KEY