youtube_dlc/extractor/common.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import base64
   5 import datetime
   6 import hashlib
   7 import json
   8 import netrc
   9 import os
  10 import random
  11 import re
  12 import socket
  13 import ssl
  14 import sys
  15 import time
  16 import math
  17
  18 from ..compat import (
  19     compat_cookiejar_Cookie,
  20     compat_cookies,
  21     compat_etree_Element,
  22     compat_etree_fromstring,
  23     compat_getpass,
  24     compat_integer_types,
  25     compat_http_client,
  26     compat_os_name,
  27     compat_str,
  28     compat_urllib_error,
  29     compat_urllib_parse_unquote,
  30     compat_urllib_parse_urlencode,
  31     compat_urllib_request,
  32     compat_urlparse,
  33     compat_xml_parse_error,
  34 )
  35 from ..downloader import FileDownloader
  36 from ..downloader.f4m import (
  37     get_base_url,
  38     remove_encrypted_media,
  39 )
  40 from ..utils import (
  41     NO_DEFAULT,
  42     age_restricted,
  43     base_url,
  44     bug_reports_message,
  45     clean_html,
  46     compiled_regex_type,
  47     determine_ext,
  48     determine_protocol,
  49     dict_get,
  50     error_to_compat_str,
  51     ExtractorError,
  52     extract_attributes,
  53     fix_xml_ampersands,
  54     float_or_none,
  55     GeoRestrictedError,
  56     GeoUtils,
  57     int_or_none,
  58     js_to_json,
  59     JSON_LD_RE,
  60     mimetype2ext,
  61     orderedSet,
  62     parse_bitrate,
  63     parse_codecs,
  64     parse_duration,
  65     parse_iso8601,
  66     parse_m3u8_attributes,
  67     parse_resolution,
  68     RegexNotFoundError,
  69     sanitized_Request,
  70     sanitize_filename,
  71     str_or_none,
  72     str_to_int,
  73     strip_or_none,
  74     unescapeHTML,
  75     unified_strdate,
  76     unified_timestamp,
  77     update_Request,
  78     update_url_query,
  79     urljoin,
  80     url_basename,
  81     url_or_none,
  82     xpath_element,
  83     xpath_text,
  84     xpath_with_ns,
  85 )
  86
  87
  88 class InfoExtractor(object):
  89     """Information Extractor class.
  90
  91     Information extractors are the classes that, given a URL, extract
  92     information about the video (or videos) the URL refers to. This
  93     information includes the real video URL, the video title, author and
  94     others. The information is stored in a dictionary which is then
  95     passed to the YoutubeDL. The YoutubeDL processes this
  96     information possibly downloading the video to the file system, among
  97     other possible outcomes.
  98
  99     The type field determines the type of the result.
 100     By far the most common value (and the default if _type is missing) is
 101     "video", which indicates a single video.
 102
 103     For a video, the dictionaries must include the following fields:
 104
 105     id:             Video identifier.
 106     title:          Video title, unescaped.
 107
 108     Additionally, it must contain either a formats entry or a url one:
 109
 110     formats:        A list of dictionaries for each format available, ordered
 111                     from worst to best quality.
 112
 113                     Potential fields:
 114                     * url        The mandatory URL representing the media:
 115                                    for plain file media - HTTP URL of this file,
 116                                    for RTMP - RTMP URL,
 117                                    for HLS - URL of the M3U8 media playlist,
 118                                    for HDS - URL of the F4M manifest,
 119                                    for DASH
 120                                      - HTTP URL to plain file media (in case of
 121                                        unfragmented media)
 122                                      - URL of the MPD manifest or base URL
 123                                        representing the media if MPD manifest
 124                                        is parsed from a string (in case of
 125                                        fragmented media)
 126                                    for MSS - URL of the ISM manifest.
 127                     * manifest_url
 128                                  The URL of the manifest file in case of
 129                                  fragmented media:
 130                                    for HLS - URL of the M3U8 master playlist,
 131                                    for HDS - URL of the F4M manifest,
 132                                    for DASH - URL of the MPD manifest,
 133                                    for MSS - URL of the ISM manifest.
 134                     * ext        Will be calculated from URL if missing
 135                     * format     A human-readable description of the format
 136                                  ("mp4 container with h264/opus").
  137                                  Calculated from the format_id, width, height,
 138                                  and format_note fields if missing.
 139                     * format_id  A short description of the format
 140                                  ("mp4_h264_opus" or "19").
 141                                 Technically optional, but strongly recommended.
 142                     * format_note Additional info about the format
 143                                  ("3D" or "DASH video")
 144                     * width      Width of the video, if known
 145                     * height     Height of the video, if known
 146                     * resolution Textual description of width and height
 147                     * tbr        Average bitrate of audio and video in KBit/s
 148                     * abr        Average audio bitrate in KBit/s
 149                     * acodec     Name of the audio codec in use
 150                     * asr        Audio sampling rate in Hertz
 151                     * vbr        Average video bitrate in KBit/s
 152                     * fps        Frame rate
 153                     * vcodec     Name of the video codec in use
 154                     * container  Name of the container format
 155                     * filesize   The number of bytes, if known in advance
 156                     * filesize_approx  An estimate for the number of bytes
 157                     * player_url SWF Player URL (used for rtmpdump).
 158                     * protocol   The protocol that will be used for the actual
 159                                  download, lower-case.
 160                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 161                                  "m3u8", "m3u8_native" or "http_dash_segments".
 162                     * fragment_base_url
 163                                  Base URL for fragments. Each fragment's path
 164                                  value (if present) will be relative to
 165                                  this URL.
 166                     * fragments  A list of fragments of a fragmented media.
 167                                  Each fragment entry must contain either an url
 168                                  or a path. If an url is present it should be
 169                                  considered by a client. Otherwise both path and
 170                                  fragment_base_url must be present. Here is
 171                                  the list of all potential fields:
 172                                  * "url" - fragment's URL
 173                                  * "path" - fragment's path relative to
 174                                             fragment_base_url
 175                                  * "duration" (optional, int or float)
 176                                  * "filesize" (optional, int)
 177                     * preference Order number of this format. If this field is
 178                                  present and not None, the formats get sorted
 179                                  by this field, regardless of all other values.
 180                                  -1 for default (order by other properties),
 181                                  -2 or smaller for less than default.
 182                                  < -1000 to hide the format (if there is
 183                                     another one which is strictly better)
 184                     * language   Language code, e.g. "de" or "en-US".
 185                     * language_preference  Is this in the language mentioned in
 186                                  the URL?
 187                                  10 if it's what the URL is about,
 188                                  -1 for default (don't know),
 189                                  -10 otherwise, other values reserved for now.
 190                     * quality    Order number of the video quality of this
 191                                  format, irrespective of the file format.
 192                                  -1 for default (order by other properties),
 193                                  -2 or smaller for less than default.
 194                     * source_preference  Order number for this video source
 195                                   (quality takes higher priority)
 196                                  -1 for default (order by other properties),
 197                                  -2 or smaller for less than default.
 198                     * http_headers  A dictionary of additional HTTP headers
 199                                  to add to the request.
 200                     * stretched_ratio  If given and not 1, indicates that the
 201                                  video's pixels are not square.
 202                                  width : height ratio as float.
 203                     * no_resume  The server does not support resuming the
 204                                  (HTTP or RTMP) download. Boolean.
 205                     * downloader_options  A dictionary of downloader options as
 206                                  described in FileDownloader
 207
 208     url:            Final video URL.
 209     ext:            Video filename extension.
 210     format:         The video format, defaults to ext (used for --get-format)
 211     player_url:     SWF Player URL (used for rtmpdump).
 212
 213     The following fields are optional:
 214
 215     alt_title:      A secondary title of the video.
 216     display_id      An alternative identifier for the video, not necessarily
 217                     unique, but available before title. Typically, id is
 218                     something like "4234987", title "Dancing naked mole rats",
 219                     and display_id "dancing-naked-mole-rats"
 220     thumbnails:     A list of dictionaries, with the following entries:
 221                         * "id" (optional, string) - Thumbnail format ID
 222                         * "url"
 223                         * "preference" (optional, int) - quality of the image
 224                         * "width" (optional, int)
 225                         * "height" (optional, int)
 226                         * "resolution" (optional, string "{width}x{height}",
 227                                         deprecated)
 228                         * "filesize" (optional, int)
 229     thumbnail:      Full URL to a video thumbnail image.
 230     description:    Full video description.
 231     uploader:       Full name of the video uploader.
 232     license:        License name the video is licensed under.
 233     creator:        The creator of the video.
 234     release_date:   The date (YYYYMMDD) when the video was released.
 235     timestamp:      UNIX timestamp of the moment the video became available.
 236     upload_date:    Video upload date (YYYYMMDD).
 237                     If not explicitly set, calculated from timestamp.
 238     uploader_id:    Nickname or id of the video uploader.
 239     uploader_url:   Full URL to a personal webpage of the video uploader.
 240     channel:        Full name of the channel the video is uploaded on.
 241                     Note that channel fields may or may not repeat uploader
 242                     fields. This depends on a particular extractor.
 243     channel_id:     Id of the channel.
 244     channel_url:    Full URL to a channel webpage.
 245     location:       Physical location where the video was filmed.
 246     subtitles:      The available subtitles as a dictionary in the format
 247                     {tag: subformats}. "tag" is usually a language code, and
 248                     "subformats" is a list sorted from lower to higher
 249                     preference, each element is a dictionary with the "ext"
 250                     entry and one of:
 251                         * "data": The subtitles file contents
 252                         * "url": A URL pointing to the subtitles file
 253                     "ext" will be calculated from URL if missing
 254     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 255                     automatically generated captions
 256     duration:       Length of the video in seconds, as an integer or float.
 257     view_count:     How many users have watched the video on the platform.
 258     like_count:     Number of positive ratings of the video
 259     dislike_count:  Number of negative ratings of the video
 260     repost_count:   Number of reposts of the video
  261     average_rating: Average rating given by users, the scale used depends on the webpage
 262     comment_count:  Number of comments on the video
 263     comments:       A list of comments, each with one or more of the following
 264                     properties (all but one of text or html optional):
 265                         * "author" - human-readable name of the comment author
 266                         * "author_id" - user ID of the comment author
 267                         * "id" - Comment ID
 268                         * "html" - Comment as HTML
 269                         * "text" - Plain text of the comment
 270                         * "timestamp" - UNIX timestamp of comment
 271                         * "parent" - ID of the comment this one is replying to.
 272                                      Set to "root" to indicate that this is a
 273                                      comment to the original video.
 274     age_limit:      Age restriction for the video, as an integer (years)
 275     webpage_url:    The URL to the video webpage, if given to youtube-dlc it
 276                     should allow to get the same result again. (It will be set
 277                     by YoutubeDL if it's missing)
 278     categories:     A list of categories that the video falls in, for example
 279                     ["Sports", "Berlin"]
 280     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 281     is_live:        True, False, or None (=unknown). Whether this video is a
 282                     live stream that goes on instead of a fixed-length video.
 283     start_time:     Time in seconds where the reproduction should start, as
 284                     specified in the URL.
 285     end_time:       Time in seconds where the reproduction should end, as
 286                     specified in the URL.
 287     chapters:       A list of dictionaries, with the following entries:
 288                         * "start_time" - The start time of the chapter in seconds
 289                         * "end_time" - The end time of the chapter in seconds
 290                         * "title" (optional, string)
 291
 292     The following fields should only be used when the video belongs to some logical
 293     chapter or section:
 294
 295     chapter:        Name or title of the chapter the video belongs to.
 296     chapter_number: Number of the chapter the video belongs to, as an integer.
 297     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 298
 299     The following fields should only be used when the video is an episode of some
 300     series, programme or podcast:
 301
 302     series:         Title of the series or programme the video episode belongs to.
 303     season:         Title of the season the video episode belongs to.
 304     season_number:  Number of the season the video episode belongs to, as an integer.
 305     season_id:      Id of the season the video episode belongs to, as a unicode string.
 306     episode:        Title of the video episode. Unlike mandatory video title field,
 307                     this field should denote the exact title of the video episode
 308                     without any kind of decoration.
 309     episode_number: Number of the video episode within a season, as an integer.
 310     episode_id:     Id of the video episode, as a unicode string.
 311
 312     The following fields should only be used when the media is a track or a part of
 313     a music album:
 314
 315     track:          Title of the track.
 316     track_number:   Number of the track within an album or a disc, as an integer.
 317     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 318                     as a unicode string.
 319     artist:         Artist(s) of the track.
 320     genre:          Genre(s) of the track.
 321     album:          Title of the album the track belongs to.
 322     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 323     album_artist:   List of all artists appeared on the album (e.g.
 324                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 325                     and compilations).
 326     disc_number:    Number of the disc or other physical medium the track belongs to,
 327                     as an integer.
 328     release_year:   Year (YYYY) when the album was released.
 329
 330     Unless mentioned otherwise, the fields should be Unicode strings.
 331
 332     Unless mentioned otherwise, None is equivalent to absence of information.
 333
 334
 335     _type "playlist" indicates multiple videos.
 336     There must be a key "entries", which is a list, an iterable, or a PagedList
 337     object, each element of which is a valid dictionary by this specification.
 338
 339     Additionally, playlists can have "id", "title", "description", "uploader",
 340     "uploader_id", "uploader_url" attributes with the same semantics as videos
 341     (see above).
 342
 343
 344     _type "multi_video" indicates that there are multiple videos that
  345     form a single show, for example multiple acts of an opera or TV episode.
 346     It must have an entries key like a playlist and contain all the keys
 347     required for a video at the same time.
 348
 349
 350     _type "url" indicates that the video must be extracted from another
 351     location, possibly by a different extractor. Its only required key is:
 352     "url" - the next URL to extract.
 353     The key "ie_key" can be set to the class name (minus the trailing "IE",
 354     e.g. "Youtube") if the extractor class is known in advance.
 355     Additionally, the dictionary may have any properties of the resolved entity
 356     known in advance, for example "title" if the title of the referred video is
 357     known ahead of time.
 358
 359
 360     _type "url_transparent" entities have the same specification as "url", but
 361     indicate that the given additional information is more precise than the one
 362     associated with the resolved URL.
 363     This is useful when a site employs a video service that hosts the video and
 364     its technical metadata, but that video service does not embed a useful
 365     title, description etc.
 366
 367
 368     Subclasses of this one should re-define the _real_initialize() and
 369     _real_extract() methods and define a _VALID_URL regexp.
 370     Probably, they should also be added to the list of extractors.
 371
 372     _GEO_BYPASS attribute may be set to False in order to disable
 373     geo restriction bypass mechanisms for a particular extractor.
 374     Though it won't disable explicit geo restriction bypass based on
 375     country code provided with geo_bypass_country.
 376
 377     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 378     countries for this extractor. One of these countries will be used by
 379     geo restriction bypass mechanism right away in order to bypass
 380     geo restriction, of course, if the mechanism is not disabled.
 381
 382     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 383     IP blocks in CIDR notation for this extractor. One of these IP blocks
 384     will be used by geo restriction bypass mechanism similarly
 385     to _GEO_COUNTRIES.
 386
 387     Finally, the _WORKING attribute should be set to False for broken IEs
 388     in order to warn the users and skip the tests.
 389     """
 390
 391     _ready = False
 392     _downloader = None
 393     _x_forwarded_for_ip = None
 394     _GEO_BYPASS = True
 395     _GEO_COUNTRIES = None
 396     _GEO_IP_BLOCKS = None
 397     _WORKING = True
 398
 399     def __init__(self, downloader=None):
 400         """Constructor. Receives an optional downloader."""
 401         self._ready = False
 402         self._x_forwarded_for_ip = None
 403         self.set_downloader(downloader)
 404
 405     @classmethod
 406     def suitable(cls, url):
 407         """Receives a URL and returns True if suitable for this IE."""
 408
 409         # This does not use has/getattr intentionally - we want to know whether
 410         # we have cached the regexp for *this* class, whereas getattr would also
 411         # match the superclass
 412         if '_VALID_URL_RE' not in cls.__dict__:
 413             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 414         return cls._VALID_URL_RE.match(url) is not None
 415
 416     @classmethod
 417     def _match_id(cls, url):
 418         if '_VALID_URL_RE' not in cls.__dict__:
 419             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 420         m = cls._VALID_URL_RE.match(url)
 421         assert m
 422         return compat_str(m.group('id'))
 423
 424     @classmethod
 425     def working(cls):
 426         """Getter method for _WORKING."""
 427         return cls._WORKING
 428
 429     def initialize(self):
 430         """Initializes an instance (authentication, etc)."""
 431         self._initialize_geo_bypass({
 432             'countries': self._GEO_COUNTRIES,
 433             'ip_blocks': self._GEO_IP_BLOCKS,
 434         })
 435         if not self._ready:
 436             self._real_initialize()
 437             self._ready = True
 438
 439     def _initialize_geo_bypass(self, geo_bypass_context):
 440         """
 441         Initialize geo restriction bypass mechanism.
 442
 443         This method is used to initialize geo bypass mechanism based on faking
 444         X-Forwarded-For HTTP header. A random country from provided country list
 445         is selected and a random IP belonging to this country is generated. This
 446         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 447         HTTP requests.
 448
 449         This method will be used for initial geo bypass mechanism initialization
 450         during the instance initialization with _GEO_COUNTRIES and
 451         _GEO_IP_BLOCKS.
 452
 453         You may also manually call it from extractor's code if geo bypass
 454         information is not available beforehand (e.g. obtained during
 455         extraction) or due to some other reason. In this case you should pass
 456         this information in geo bypass context passed as first argument. It may
 457         contain following fields:
 458
 459         countries:  List of geo unrestricted countries (similar
 460                     to _GEO_COUNTRIES)
 461         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 462                     (similar to _GEO_IP_BLOCKS)
 463
 464         """
 465         if not self._x_forwarded_for_ip:
 466
 467             # Geo bypass mechanism is explicitly disabled by user
 468             if not self._downloader.params.get('geo_bypass', True):
 469                 return
 470
 471             if not geo_bypass_context:
 472                 geo_bypass_context = {}
 473
 474             # Backward compatibility: previously _initialize_geo_bypass
 475             # expected a list of countries, some 3rd party code may still use
 476             # it this way
 477             if isinstance(geo_bypass_context, (list, tuple)):
 478                 geo_bypass_context = {
 479                     'countries': geo_bypass_context,
 480                 }
 481
 482             # The whole point of geo bypass mechanism is to fake IP
 483             # as X-Forwarded-For HTTP header based on some IP block or
 484             # country code.
 485
 486             # Path 1: bypassing based on IP block in CIDR notation
 487
 488             # Explicit IP block specified by user, use it right away
 489             # regardless of whether extractor is geo bypassable or not
 490             ip_block = self._downloader.params.get('geo_bypass_ip_block', None)
 491
 492             # Otherwise use random IP block from geo bypass context but only
 493             # if extractor is known as geo bypassable
 494             if not ip_block:
 495                 ip_blocks = geo_bypass_context.get('ip_blocks')
 496                 if self._GEO_BYPASS and ip_blocks:
 497                     ip_block = random.choice(ip_blocks)
 498
 499             if ip_block:
 500                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 501                 if self._downloader.params.get('verbose', False):
 502                     self._downloader.to_screen(
 503                         '[debug] Using fake IP %s as X-Forwarded-For.'
 504                         % self._x_forwarded_for_ip)
 505                 return
 506
 507             # Path 2: bypassing based on country code
 508
 509             # Explicit country code specified by user, use it right away
 510             # regardless of whether extractor is geo bypassable or not
 511             country = self._downloader.params.get('geo_bypass_country', None)
 512
 513             # Otherwise use random country code from geo bypass context but
 514             # only if extractor is known as geo bypassable
 515             if not country:
 516                 countries = geo_bypass_context.get('countries')
 517                 if self._GEO_BYPASS and countries:
 518                     country = random.choice(countries)
 519
 520             if country:
 521                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 522                 if self._downloader.params.get('verbose', False):
 523                     self._downloader.to_screen(
 524                         '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
 525                         % (self._x_forwarded_for_ip, country.upper()))
 526
 527     def extract(self, url):
 528         """Extracts URL information and returns it in list of dicts."""
 529         try:
 530             for _ in range(2):
 531                 try:
 532                     self.initialize()
 533                     ie_result = self._real_extract(url)
 534                     if self._x_forwarded_for_ip:
 535                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 536                     return ie_result
 537                 except GeoRestrictedError as e:
 538                     if self.__maybe_fake_ip_and_retry(e.countries):
 539                         continue
 540                     raise
 541         except ExtractorError:
 542             raise
 543         except compat_http_client.IncompleteRead as e:
 544             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 545         except (KeyError, StopIteration) as e:
 546             raise ExtractorError('An extractor error has occurred.', cause=e)
 547
 548     def __maybe_fake_ip_and_retry(self, countries):
 549         if (not self._downloader.params.get('geo_bypass_country', None)
 550                 and self._GEO_BYPASS
 551                 and self._downloader.params.get('geo_bypass', True)
 552                 and not self._x_forwarded_for_ip
 553                 and countries):
 554             country_code = random.choice(countries)
 555             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 556             if self._x_forwarded_for_ip:
 557                 self.report_warning(
 558                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 559                     % (self._x_forwarded_for_ip, country_code.upper()))
 560                 return True
 561         return False
 562
 563     def set_downloader(self, downloader):
 564         """Sets the downloader for this IE."""
 565         self._downloader = downloader
 566
 567     def _real_initialize(self):
 568         """Real initialization process. Redefine in subclasses."""
 569         pass
 570
 571     def _real_extract(self, url):
 572         """Real extraction process. Redefine in subclasses."""
 573         pass
 574
 575     @classmethod
 576     def ie_key(cls):
 577         """A string for getting the InfoExtractor with get_info_extractor"""
 578         return compat_str(cls.__name__[:-2])
 579
 580     @property
 581     def IE_NAME(self):
 582         return compat_str(type(self).__name__[:-2])
 583
 584     @staticmethod
 585     def __can_accept_status_code(err, expected_status):
 586         assert isinstance(err, compat_urllib_error.HTTPError)
 587         if expected_status is None:
 588             return False
 589         if isinstance(expected_status, compat_integer_types):
 590             return err.code == expected_status
 591         elif isinstance(expected_status, (list, tuple)):
 592             return err.code in expected_status
 593         elif callable(expected_status):
 594             return expected_status(err.code) is True
 595         else:
 596             assert False
 597
 598     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
 599         """
 600         Return the response handle.
 601
 602         See _download_webpage docstring for arguments specification.
 603         """
 604         if note is None:
 605             self.report_download_webpage(video_id)
 606         elif note is not False:
 607             if video_id is None:
 608                 self.to_screen('%s' % (note,))
 609             else:
 610                 self.to_screen('%s: %s' % (video_id, note))
 611
 612         # Some sites check X-Forwarded-For HTTP header in order to figure out
 613         # the origin of the client behind proxy. This allows bypassing geo
 614         # restriction by faking this header's value to IP that belongs to some
 615         # geo unrestricted country. We will do so once we encounter any
 616         # geo restriction error.
 617         if self._x_forwarded_for_ip:
 618             if 'X-Forwarded-For' not in headers:
 619                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 620
 621         if isinstance(url_or_request, compat_urllib_request.Request):
 622             url_or_request = update_Request(
 623                 url_or_request, data=data, headers=headers, query=query)
 624         else:
 625             if query:
 626                 url_or_request = update_url_query(url_or_request, query)
 627             if data is not None or headers:
 628                 url_or_request = sanitized_Request(url_or_request, data, headers)
 629         exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
 630         if hasattr(ssl, 'CertificateError'):
 631             exceptions.append(ssl.CertificateError)
 632         try:
 633             return self._downloader.urlopen(url_or_request)
 634         except tuple(exceptions) as err:
 635             if isinstance(err, compat_urllib_error.HTTPError):
 636                 if self.__can_accept_status_code(err, expected_status):
 637                     # Retain reference to error to prevent file object from
 638                     # being closed before it can be read. Works around the
 639                     # effects of <https://bugs.python.org/issue15002>
 640                     # introduced in Python 3.4.1.
 641                     err.fp._error = err
 642                     return err.fp
 643
 644             if errnote is False:
 645                 return False
 646             if errnote is None:
 647                 errnote = 'Unable to download webpage'
 648
 649             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 650             if fatal:
 651                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 652             else:
 653                 self._downloader.report_warning(errmsg)
 654                 return False
 655
 656     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
 657         """
 658         Return a tuple (page content as string, URL handle).
 659
 660         See _download_webpage docstring for arguments specification.
 661         """
 662         # Strip hashes from the URL (#1038)
 663         if isinstance(url_or_request, (compat_str, str)):
 664             url_or_request = url_or_request.partition('#')[0]
 665
 666         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 667         if urlh is False:
 668             assert not fatal
 669             return False
 670         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 671         return (content, urlh)
 672
 673     @staticmethod
 674     def _guess_encoding_from_content(content_type, webpage_bytes):
 675         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 676         if m:
 677             encoding = m.group(1)
 678         else:
 679             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 680                           webpage_bytes[:1024])
 681             if m:
 682                 encoding = m.group(1).decode('ascii')
 683             elif webpage_bytes.startswith(b'\xff\xfe'):
 684                 encoding = 'utf-16'
 685             else:
 686                 encoding = 'utf-8'
 687
 688         return encoding
 689
 690     def __check_blocked(self, content):
 691         first_block = content[:512]
 692         if ('<title>Access to this site is blocked</title>' in content
 693                 and 'Websense' in first_block):
 694             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 695             blocked_iframe = self._html_search_regex(
 696                 r'<iframe src="([^"]+)"', content,
 697                 'Websense information URL', default=None)
 698             if blocked_iframe:
 699                 msg += ' Visit %s for more details' % blocked_iframe
 700             raise ExtractorError(msg, expected=True)
 701         if '<title>The URL you requested has been blocked</title>' in first_block:
 702             msg = (
 703                 'Access to this webpage has been blocked by Indian censorship. '
 704                 'Use a VPN or proxy server (with --proxy) to route around it.')
 705             block_msg = self._html_search_regex(
 706                 r'</h1><p>(.*?)</p>',
 707                 content, 'block message', default=None)
 708             if block_msg:
 709                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 710             raise ExtractorError(msg, expected=True)
 711         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 712                 and 'blocklist.rkn.gov.ru' in content):
 713             raise ExtractorError(
 714                 'Access to this webpage has been blocked by decision of the Russian government. '
 715                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 716                 expected=True)
 717
 718     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 719         content_type = urlh.headers.get('Content-Type', '')
 720         webpage_bytes = urlh.read()
 721         if prefix is not None:
 722             webpage_bytes = prefix + webpage_bytes
 723         if not encoding:
 724             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 725         if self._downloader.params.get('dump_intermediate_pages', False):
 726             self.to_screen('Dumping request to ' + urlh.geturl())
 727             dump = base64.b64encode(webpage_bytes).decode('ascii')
 728             self._downloader.to_screen(dump)
 729         if self._downloader.params.get('write_pages', False):
 730             basen = '%s_%s' % (video_id, urlh.geturl())
 731             if len(basen) > 240:
 732                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 733                 basen = basen[:240 - len(h)] + h
 734             raw_filename = basen + '.dump'
 735             filename = sanitize_filename(raw_filename, restricted=True)
 736             self.to_screen('Saving request to ' + filename)
 737             # Working around MAX_PATH limitation on Windows (see
 738             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 739             if compat_os_name == 'nt':
 740                 absfilepath = os.path.abspath(filename)
 741                 if len(absfilepath) > 259:
 742                     filename = '\\\\?\\' + absfilepath
 743             with open(filename, 'wb') as outf:
 744                 outf.write(webpage_bytes)
 745
 746         try:
 747             content = webpage_bytes.decode(encoding, 'replace')
 748         except LookupError:
 749             content = webpage_bytes.decode('utf-8', 'replace')
 750
 751         self.__check_blocked(content)
 752
 753         return content
 754
 755     def _download_webpage(
 756             self, url_or_request, video_id, note=None, errnote=None,
 757             fatal=True, tries=1, timeout=5, encoding=None, data=None,
 758             headers={}, query={}, expected_status=None):
 759         """
 760         Return the data of the page as a string.
 761
 762         Arguments:
 763         url_or_request -- plain text URL as a string or
 764             a compat_urllib_request.Requestobject
 765         video_id -- Video/playlist/item identifier (string)
 766
 767         Keyword arguments:
 768         note -- note printed before downloading (string)
 769         errnote -- note printed in case of an error (string)
 770         fatal -- flag denoting whether error should be considered fatal,
 771             i.e. whether it should cause ExtractionError to be raised,
 772             otherwise a warning will be reported and extraction continued
 773         tries -- number of tries
 774         timeout -- sleep interval between tries
 775         encoding -- encoding for a page content decoding, guessed automatically
 776             when not explicitly specified
 777         data -- POST data (bytes)
 778         headers -- HTTP headers (dict)
 779         query -- URL query (dict)
 780         expected_status -- allows to accept failed HTTP requests (non 2xx
 781             status code) by explicitly specifying a set of accepted status
 782             codes. Can be any of the following entities:
 783                 - an integer type specifying an exact failed status code to
 784                   accept
 785                 - a list or a tuple of integer types specifying a list of
 786                   failed status codes to accept
 787                 - a callable accepting an actual failed status code and
 788                   returning True if it should be accepted
 789             Note that this argument does not affect success status codes (2xx)
 790             which are always accepted.
 791         """
 792
 793         success = False
 794         try_count = 0
 795         while success is False:
 796             try:
 797                 res = self._download_webpage_handle(
 798                     url_or_request, video_id, note, errnote, fatal,
 799                     encoding=encoding, data=data, headers=headers, query=query,
 800                     expected_status=expected_status)
 801                 success = True
 802             except compat_http_client.IncompleteRead as e:
 803                 try_count += 1
 804                 if try_count >= tries:
 805                     raise e
 806                 self._sleep(timeout, video_id)
 807         if res is False:
 808             return res
 809         else:
 810             content, _ = res
 811             return content
 812
 813     def _download_xml_handle(
 814             self, url_or_request, video_id, note='Downloading XML',
 815             errnote='Unable to download XML', transform_source=None,
 816             fatal=True, encoding=None, data=None, headers={}, query={},
 817             expected_status=None):
 818         """
 819         Return a tuple (xml as an compat_etree_Element, URL handle).
 820
 821         See _download_webpage docstring for arguments specification.
 822         """
 823         res = self._download_webpage_handle(
 824             url_or_request, video_id, note, errnote, fatal=fatal,
 825             encoding=encoding, data=data, headers=headers, query=query,
 826             expected_status=expected_status)
 827         if res is False:
 828             return res
 829         xml_string, urlh = res
 830         return self._parse_xml(
 831             xml_string, video_id, transform_source=transform_source,
 832             fatal=fatal), urlh
 833
 834     def _download_xml(
 835             self, url_or_request, video_id,
 836             note='Downloading XML', errnote='Unable to download XML',
 837             transform_source=None, fatal=True, encoding=None,
 838             data=None, headers={}, query={}, expected_status=None):
 839         """
 840         Return the xml as an compat_etree_Element.
 841
 842         See _download_webpage docstring for arguments specification.
 843         """
 844         res = self._download_xml_handle(
 845             url_or_request, video_id, note=note, errnote=errnote,
 846             transform_source=transform_source, fatal=fatal, encoding=encoding,
 847             data=data, headers=headers, query=query,
 848             expected_status=expected_status)
 849         return res if res is False else res[0]
 850
 851     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 852         if transform_source:
 853             xml_string = transform_source(xml_string)
 854         try:
 855             return compat_etree_fromstring(xml_string.encode('utf-8'))
 856         except compat_xml_parse_error as ve:
 857             errmsg = '%s: Failed to parse XML ' % video_id
 858             if fatal:
 859                 raise ExtractorError(errmsg, cause=ve)
 860             else:
 861                 self.report_warning(errmsg + str(ve))
 862
 863     def _download_json_handle(
 864             self, url_or_request, video_id, note='Downloading JSON metadata',
 865             errnote='Unable to download JSON metadata', transform_source=None,
 866             fatal=True, encoding=None, data=None, headers={}, query={},
 867             expected_status=None):
 868         """
 869         Return a tuple (JSON object, URL handle).
 870
 871         See _download_webpage docstring for arguments specification.
 872         """
 873         res = self._download_webpage_handle(
 874             url_or_request, video_id, note, errnote, fatal=fatal,
 875             encoding=encoding, data=data, headers=headers, query=query,
 876             expected_status=expected_status)
 877         if res is False:
 878             return res
 879         json_string, urlh = res
 880         return self._parse_json(
 881             json_string, video_id, transform_source=transform_source,
 882             fatal=fatal), urlh
 883
 884     def _download_json(
 885             self, url_or_request, video_id, note='Downloading JSON metadata',
 886             errnote='Unable to download JSON metadata', transform_source=None,
 887             fatal=True, encoding=None, data=None, headers={}, query={},
 888             expected_status=None):
 889         """
 890         Return the JSON object as a dict.
 891
 892         See _download_webpage docstring for arguments specification.
 893         """
 894         res = self._download_json_handle(
 895             url_or_request, video_id, note=note, errnote=errnote,
 896             transform_source=transform_source, fatal=fatal, encoding=encoding,
 897             data=data, headers=headers, query=query,
 898             expected_status=expected_status)
 899         return res if res is False else res[0]
 900
 901     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 902         if transform_source:
 903             json_string = transform_source(json_string)
 904         try:
 905             return json.loads(json_string)
 906         except ValueError as ve:
 907             errmsg = '%s: Failed to parse JSON ' % video_id
 908             if fatal:
 909                 raise ExtractorError(errmsg, cause=ve)
 910             else:
 911                 self.report_warning(errmsg + str(ve))
 912
 913     def report_warning(self, msg, video_id=None):
 914         idstr = '' if video_id is None else '%s: ' % video_id
 915         self._downloader.report_warning(
 916             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 917
    def to_screen(self, msg):
        """
        Print msg to screen, prefixing it with '[ie_name]'.

        The prefix is taken from self.IE_NAME and the resulting line is
        handed to the downloader's to_screen().
        """
        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 921
    def report_extraction(self, id_or_name):
        """
        Report information extraction.

        Prints '<id_or_name>: Extracting information' via to_screen().
        """
        self.to_screen('%s: Extracting information' % id_or_name)
 925
    def report_download_webpage(self, video_id):
        """
        Report webpage download.

        Prints '<video_id>: Downloading webpage' via to_screen().
        """
        self.to_screen('%s: Downloading webpage' % video_id)
 929
    def report_age_confirmation(self):
        """
        Report attempt to confirm age.

        Prints 'Confirming age' via to_screen().
        """
        self.to_screen('Confirming age')
 933
    def report_login(self):
        """
        Report attempt to log in.

        Prints 'Logging in' via to_screen().
        """
        self.to_screen('Logging in')
 937
    @staticmethod
    def raise_login_required(msg='This video is only available for registered users'):
        """
        Raise an expected ExtractorError asking for account credentials.

        msg -- reason put in front of the hint about the --username,
            --password and --netrc options
        """
        raise ExtractorError(
            '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
            expected=True)
 943
    @staticmethod
    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
        """
        Raise GeoRestrictedError.

        msg -- error message
        countries -- passed on unchanged to GeoRestrictedError
        """
        raise GeoRestrictedError(msg, countries=countries)
 947
 948     # Methods for following #608
 949     @staticmethod
 950     def url_result(url, ie=None, video_id=None, video_title=None):
 951         """Returns a URL that points to a page that should be processed"""
 952         # TODO: ie should be the class used for getting the info
 953         video_info = {'_type': 'url',
 954                       'url': url,
 955                       'ie_key': ie}
 956         if video_id is not None:
 957             video_info['id'] = video_id
 958         if video_title is not None:
 959             video_info['title'] = video_title
 960         return video_info
 961
 962     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
 963         urls = orderedSet(
 964             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
 965             for m in matches)
 966         return self.playlist_result(
 967             urls, playlist_id=playlist_id, playlist_title=playlist_title)
 968
 969     @staticmethod
 970     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 971         """Returns a playlist"""
 972         video_info = {'_type': 'playlist',
 973                       'entries': entries}
 974         if playlist_id:
 975             video_info['id'] = playlist_id
 976         if playlist_title:
 977             video_info['title'] = playlist_title
 978         if playlist_description:
 979             video_info['description'] = playlist_description
 980         return video_info
 981
 982     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 983         """
 984         Perform a regex search on the given string, using a single or a list of
 985         patterns returning the first matching group.
 986         In case of failure return a default value or raise a WARNING or a
 987         RegexNotFoundError, depending on fatal, specifying the field name.
 988         """
 989         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 990             mobj = re.search(pattern, string, flags)
 991         else:
 992             for p in pattern:
 993                 mobj = re.search(p, string, flags)
 994                 if mobj:
 995                     break
 996
 997         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
 998             _name = '\033[0;34m%s\033[0m' % name
 999         else:
1000             _name = name
1001
1002         if mobj:
1003             if group is None:
1004                 # return the first matching group
1005                 return next(g for g in mobj.groups() if g is not None)
1006             else:
1007                 return mobj.group(group)
1008         elif default is not NO_DEFAULT:
1009             return default
1010         elif fatal:
1011             raise RegexNotFoundError('Unable to extract %s' % _name)
1012         else:
1013             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
1014             return None
1015
1016     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1017         """
1018         Like _search_regex, but strips HTML tags and unescapes entities.
1019         """
1020         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1021         if res:
1022             return clean_html(res).strip()
1023         else:
1024             return res
1025
1026     def _get_netrc_login_info(self, netrc_machine=None):
1027         username = None
1028         password = None
1029         netrc_machine = netrc_machine or self._NETRC_MACHINE
1030
1031         if self._downloader.params.get('usenetrc', False):
1032             try:
1033                 info = netrc.netrc().authenticators(netrc_machine)
1034                 if info is not None:
1035                     username = info[0]
1036                     password = info[2]
1037                 else:
1038                     raise netrc.NetrcParseError(
1039                         'No authenticators for %s' % netrc_machine)
1040             except (IOError, netrc.NetrcParseError) as err:
1041                 self._downloader.report_warning(
1042                     'parsing .netrc: %s' % error_to_compat_str(err))
1043
1044         return username, password
1045
1046     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1047         """
1048         Get the login info as (username, password)
1049         First look for the manually specified credentials using username_option
1050         and password_option as keys in params dictionary. If no such credentials
1051         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1052         value.
1053         If there's no info available, return (None, None)
1054         """
1055         if self._downloader is None:
1056             return (None, None)
1057
1058         downloader_params = self._downloader.params
1059
1060         # Attempt to use provided username and password or .netrc data
1061         if downloader_params.get(username_option) is not None:
1062             username = downloader_params[username_option]
1063             password = downloader_params[password_option]
1064         else:
1065             username, password = self._get_netrc_login_info(netrc_machine)
1066
1067         return username, password
1068
1069     def _get_tfa_info(self, note='two-factor verification code'):
1070         """
1071         Get the two-factor authentication info
1072         TODO - asking the user will be required for sms/phone verify
1073         currently just uses the command line option
1074         If there's no info available, return None
1075         """
1076         if self._downloader is None:
1077             return None
1078         downloader_params = self._downloader.params
1079
1080         if downloader_params.get('twofactor') is not None:
1081             return downloader_params['twofactor']
1082
1083         return compat_getpass('Type %s and press [Return]: ' % note)
1084
1085     # Helper functions for extracting OpenGraph info
1086     @staticmethod
1087     def _og_regexes(prop):
1088         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1089         property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
1090                        % {'prop': re.escape(prop)})
1091         template = r'<meta[^>]+?%s[^>]+?%s'
1092         return [
1093             template % (property_re, content_re),
1094             template % (content_re, property_re),
1095         ]
1096
    @staticmethod
    def _meta_regex(prop):
        """
        Return a regex matching a <meta> tag that refers to prop.

        The tag has to carry prop (escaped, optionally quoted) as the value
        of an itemprop, name, property, id or http-equiv attribute. The
        quoted value of its content attribute is captured by the named group
        'content'. The pattern sets the i, s and x flags inline.
        """
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1102
1103     def _og_search_property(self, prop, html, name=None, **kargs):
1104         if not isinstance(prop, (list, tuple)):
1105             prop = [prop]
1106         if name is None:
1107             name = 'OpenGraph %s' % prop[0]
1108         og_regexes = []
1109         for p in prop:
1110             og_regexes.extend(self._og_regexes(p))
1111         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1112         if escaped is None:
1113             return None
1114         return unescapeHTML(escaped)
1115
    def _og_search_thumbnail(self, html, **kargs):
        """
        Return the OpenGraph 'image' property of html.

        The lookup is non-fatal and labelled 'thumbnail URL'; kargs are
        forwarded to _og_search_property.
        """
        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1118
    def _og_search_description(self, html, **kargs):
        """
        Return the OpenGraph 'description' property of html.

        The lookup is non-fatal; kargs are forwarded to _og_search_property.
        """
        return self._og_search_property('description', html, fatal=False, **kargs)
1121
    def _og_search_title(self, html, **kargs):
        """
        Return the OpenGraph 'title' property of html.

        kargs are forwarded to _og_search_property.
        """
        return self._og_search_property('title', html, **kargs)
1124
1125     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1126         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1127         if secure:
1128             regexes = self._og_regexes('video:secure_url') + regexes
1129         return self._html_search_regex(regexes, html, name, **kargs)
1130
    def _og_search_url(self, html, **kargs):
        """
        Return the OpenGraph 'url' property of html.

        kargs are forwarded to _og_search_property.
        """
        return self._og_search_property('url', html, **kargs)
1133
1134     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1135         if not isinstance(name, (list, tuple)):
1136             name = [name]
1137         if display_name is None:
1138             display_name = name[0]
1139         return self._html_search_regex(
1140             [self._meta_regex(n) for n in name],
1141             html, display_name, fatal=fatal, group='content', **kwargs)
1142
    def _dc_search_uploader(self, html):
        """Return the content of the 'dc.creator' meta tag, labelled 'uploader'."""
        return self._html_search_meta('dc.creator', html, 'uploader')
1145
1146     def _rta_search(self, html):
1147         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1148         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1149                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1150                      html):
1151             return 18
1152         return 0
1153
1154     def _media_rating_search(self, html):
1155         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1156         rating = self._html_search_meta('rating', html)
1157
1158         if not rating:
1159             return None
1160
1161         RATING_TABLE = {
1162             'safe for kids': 0,
1163             'general': 8,
1164             '14 years': 14,
1165             'mature': 17,
1166             'restricted': 19,
1167         }
1168         return RATING_TABLE.get(rating.lower())
1169
1170     def _family_friendly_search(self, html):
1171         # See http://schema.org/VideoObject
1172         family_friendly = self._html_search_meta(
1173             'isFamilyFriendly', html, default=None)
1174
1175         if not family_friendly:
1176             return None
1177
1178         RATING_TABLE = {
1179             '1': 0,
1180             'true': 0,
1181             '0': 18,
1182             'false': 18,
1183         }
1184         return RATING_TABLE.get(family_friendly.lower())
1185
    def _twitter_search_player(self, html):
        """Return the content of the 'twitter:player' meta tag, labelled 'twitter card player'."""
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')
1189
1190     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1191         json_ld_list = list(re.finditer(JSON_LD_RE, html))
1192         default = kwargs.get('default', NO_DEFAULT)
1193         # JSON-LD may be malformed and thus `fatal` should be respected.
1194         # At the same time `default` may be passed that assumes `fatal=False`
1195         # for _search_regex. Let's simulate the same behavior here as well.
1196         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
1197         json_ld = []
1198         for mobj in json_ld_list:
1199             json_ld_item = self._parse_json(
1200                 mobj.group('json_ld'), video_id, fatal=fatal)
1201             if not json_ld_item:
1202                 continue
1203             if isinstance(json_ld_item, dict):
1204                 json_ld.append(json_ld_item)
1205             elif isinstance(json_ld_item, (list, tuple)):
1206                 json_ld.extend(json_ld_item)
1207         if json_ld:
1208             json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1209         if json_ld:
1210             return json_ld
1211         if default is not NO_DEFAULT:
1212             return default
1213         elif fatal:
1214             raise RegexNotFoundError('Unable to extract JSON-LD')
1215         else:
1216             self._downloader.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1217             return {}
1218
1219     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1220         if isinstance(json_ld, compat_str):
1221             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1222         if not json_ld:
1223             return {}
1224         info = {}
1225         if not isinstance(json_ld, (list, tuple, dict)):
1226             return info
1227         if isinstance(json_ld, dict):
1228             json_ld = [json_ld]
1229
1230         INTERACTION_TYPE_MAP = {
1231             'CommentAction': 'comment',
1232             'AgreeAction': 'like',
1233             'DisagreeAction': 'dislike',
1234             'LikeAction': 'like',
1235             'DislikeAction': 'dislike',
1236             'ListenAction': 'view',
1237             'WatchAction': 'view',
1238             'ViewAction': 'view',
1239         }
1240
1241         def extract_interaction_statistic(e):
1242             interaction_statistic = e.get('interactionStatistic')
1243             if not isinstance(interaction_statistic, list):
1244                 return
1245             for is_e in interaction_statistic:
1246                 if not isinstance(is_e, dict):
1247                     continue
1248                 if is_e.get('@type') != 'InteractionCounter':
1249                     continue
1250                 interaction_type = is_e.get('interactionType')
1251                 if not isinstance(interaction_type, compat_str):
1252                     continue
1253                 # For interaction count some sites provide string instead of
1254                 # an integer (as per spec) with non digit characters (e.g. ",")
1255                 # so extracting count with more relaxed str_to_int
1256                 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1257                 if interaction_count is None:
1258                     continue
1259                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1260                 if not count_kind:
1261                     continue
1262                 count_key = '%s_count' % count_kind
1263                 if info.get(count_key) is not None:
1264                     continue
1265                 info[count_key] = interaction_count
1266
1267         def extract_video_object(e):
1268             assert e['@type'] == 'VideoObject'
1269             info.update({
1270                 'url': url_or_none(e.get('contentUrl')),
1271                 'title': unescapeHTML(e.get('name')),
1272                 'description': unescapeHTML(e.get('description')),
1273                 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
1274                 'duration': parse_duration(e.get('duration')),
1275                 'timestamp': unified_timestamp(e.get('uploadDate')),
1276                 'uploader': str_or_none(e.get('author')),
1277                 'filesize': float_or_none(e.get('contentSize')),
1278                 'tbr': int_or_none(e.get('bitrate')),
1279                 'width': int_or_none(e.get('width')),
1280                 'height': int_or_none(e.get('height')),
1281                 'view_count': int_or_none(e.get('interactionCount')),
1282             })
1283             extract_interaction_statistic(e)
1284
1285         for e in json_ld:
1286             if '@context' in e:
1287                 item_type = e.get('@type')
1288                 if expected_type is not None and expected_type != item_type:
1289                     continue
1290                 if item_type in ('TVEpisode', 'Episode'):
1291                     episode_name = unescapeHTML(e.get('name'))
1292                     info.update({
1293                         'episode': episode_name,
1294                         'episode_number': int_or_none(e.get('episodeNumber')),
1295                         'description': unescapeHTML(e.get('description')),
1296                     })
1297                     if not info.get('title') and episode_name:
1298                         info['title'] = episode_name
1299                     part_of_season = e.get('partOfSeason')
1300                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1301                         info.update({
1302                             'season': unescapeHTML(part_of_season.get('name')),
1303                             'season_number': int_or_none(part_of_season.get('seasonNumber')),
1304                         })
1305                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1306                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1307                         info['series'] = unescapeHTML(part_of_series.get('name'))
1308                 elif item_type == 'Movie':
1309                     info.update({
1310                         'title': unescapeHTML(e.get('name')),
1311                         'description': unescapeHTML(e.get('description')),
1312                         'duration': parse_duration(e.get('duration')),
1313                         'timestamp': unified_timestamp(e.get('dateCreated')),
1314                     })
1315                 elif item_type in ('Article', 'NewsArticle'):
1316                     info.update({
1317                         'timestamp': parse_iso8601(e.get('datePublished')),
1318                         'title': unescapeHTML(e.get('headline')),
1319                         'description': unescapeHTML(e.get('articleBody')),
1320                     })
1321                 elif item_type == 'VideoObject':
1322                     extract_video_object(e)
1323                     if expected_type is None:
1324                         continue
1325                     else:
1326                         break
1327                 video = e.get('video')
1328                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1329                     extract_video_object(video)
1330                 if expected_type is None:
1331                     continue
1332                 else:
1333                     break
1334         return dict((k, v) for k, v in info.items() if v is not None)
1335
1336     @staticmethod
1337     def _hidden_inputs(html):
1338         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1339         hidden_inputs = {}
1340         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1341             attrs = extract_attributes(input)
1342             if not input:
1343                 continue
1344             if attrs.get('type') not in ('hidden', 'submit'):
1345                 continue
1346             name = attrs.get('name') or attrs.get('id')
1347             value = attrs.get('value')
1348             if name and value is not None:
1349                 hidden_inputs[name] = value
1350         return hidden_inputs
1351
1352     def _form_hidden_inputs(self, form_id, html):
1353         form = self._search_regex(
1354             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1355             html, '%s form' % form_id, group='form')
1356         return self._hidden_inputs(form)
1357
1358     class FormatSort:
1359         regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<seperator>[~:])(?P<limit>.*?))?)? *$'
1360
1361         default = ('hidden', 'has_video', 'extractor', 'lang', 'quality',
1362                    'tbr', 'filesize', 'vbr', 'height', 'width', 'protocol', 'vext',
1363                    'abr', 'aext', 'fps', 'filesize_approx', 'source_preference', 'format_id')
1364
1365         settings = {
1366             'vcodec': {'type': 'ordered', 'regex': True,
1367                        'order': ['av01', 'vp9', '(h265|he?vc?)', '(h264|avc)', 'vp8', '(mp4v|h263)', 'theora', '', None, 'none']},
1368             'acodec': {'type': 'ordered', 'regex': True,
1369                        'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']},
1370             'protocol': {'type': 'ordered', 'regex': True,
1371                          'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', 'm3u8', '.*dash', '', 'mms|rtsp', 'none', 'f4']},
1372             'vext': {'type': 'ordered', 'field': 'video_ext',
1373                      'order': ('mp4', 'flv', 'webm', '', 'none'),  # Why is flv prefered over webm???
1374                      'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1375             'aext': {'type': 'ordered', 'field': 'audio_ext',
1376                      'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1377                      'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1378             'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1379             'extractor_preference': {'priority': True, 'type': 'extractor'},
1380             'has_video': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1381             'has_audio': {'priority': False, 'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1382             'language_preference': {'priority': True, 'convert': 'ignore'},
1383             'quality': {'priority': True, 'convert': 'float_none'},
1384             'filesize': {'convert': 'bytes'},
1385             'filesize_approx': {'convert': 'bytes'},
1386             'format_id': {'convert': 'string'},
1387             'height': {'convert': 'float_none'},
1388             'width': {'convert': 'float_none'},
1389             'fps': {'convert': 'float_none'},
1390             'tbr': {'convert': 'float_none'},
1391             'vbr': {'convert': 'float_none'},
1392             'abr': {'convert': 'float_none'},
1393             'asr': {'convert': 'float_none'},
1394             'source_preference': {'convert': 'ignore'},
1395             'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1396             'bitrate': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1397             'filesize_estimate': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'filesize_approx')},
1398             'extension': {'type': 'combined', 'field': ('vext', 'aext')},
1399             'dimension': {'type': 'multiple', 'field': ('height', 'width'), 'function': min},  # not named as 'resolution' because such a field exists
1400             'res': {'type': 'alias', 'field': 'dimension'},
1401             'ext': {'type': 'alias', 'field': 'extension'},
1402             'br': {'type': 'alias', 'field': 'bitrate'},
1403             'total_bitrate': {'type': 'alias', 'field': 'tbr'},
1404             'video_bitrate': {'type': 'alias', 'field': 'vbr'},
1405             'audio_bitrate': {'type': 'alias', 'field': 'abr'},
1406             'framerate': {'type': 'alias', 'field': 'fps'},
1407             'lang': {'type': 'alias', 'field': 'language_preference'},  # not named as 'language' because such a field exists
1408             'proto': {'type': 'alias', 'field': 'protocol'},
1409             'source': {'type': 'alias', 'field': 'source_preference'},
1410             'size': {'type': 'alias', 'field': 'filesize_estimate'},
1411             'samplerate': {'type': 'alias', 'field': 'asr'},
1412             'video_ext': {'type': 'alias', 'field': 'vext'},
1413             'audio_ext': {'type': 'alias', 'field': 'aext'},
1414             'video_codec': {'type': 'alias', 'field': 'vcodec'},
1415             'audio_codec': {'type': 'alias', 'field': 'acodec'},
1416             'video': {'type': 'alias', 'field': 'has_video'},
1417             'audio': {'type': 'alias', 'field': 'has_audio'},
1418             'extractor': {'type': 'alias', 'field': 'extractor_preference'},
1419             'preference': {'type': 'alias', 'field': 'extractor_preference'}}
1420
1421         _order = []
1422
1423         def _get_field_setting(self, field, key):
1424             if field not in self.settings:
1425                 self.settings[field] = {}
1426             propObj = self.settings[field]
1427             if key not in propObj:
1428                 type = propObj.get('type')
1429                 if key == 'field':
1430                     default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1431                 elif key == 'convert':
1432                     default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1433                 else:
1434                     default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,), 'function': max}.get(key, None)
1435                 propObj[key] = default
1436             return propObj[key]
1437
1438         def _resolve_field_value(self, field, value, convertNone=False):
1439             if value is None:
1440                 if not convertNone:
1441                     return None
1442             else:
1443                 value = value.lower()
1444             conversion = self._get_field_setting(field, 'convert')
1445             if conversion == 'ignore':
1446                 return None
1447             if conversion == 'string':
1448                 return value
1449             elif conversion == 'float_none':
1450                 return float_or_none(value)
1451             elif conversion == 'bytes':
1452                 return FileDownloader.parse_bytes(value)
1453             elif conversion == 'order':
1454                 order_free = self._get_field_setting(field, 'order_free')
1455                 order_list = order_free if order_free and self._use_free_order else self._get_field_setting(field, 'order')
1456                 use_regex = self._get_field_setting(field, 'regex')
1457                 list_length = len(order_list)
1458                 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1459                 if use_regex and value is not None:
1460                     for (i, regex) in enumerate(order_list):
1461                         if regex and re.match(regex, value):
1462                             return list_length - i
1463                     return list_length - empty_pos  # not in list
1464                 else:  # not regex or  value = None
1465                     return list_length - (order_list.index(value) if value in order_list else empty_pos)
1466             else:
1467                 if value.isnumeric():
1468                     return float(value)
1469                 else:
1470                     self.settings[field]['convert'] = 'string'
1471                     return value
1472
1473         def evaluate_params(self, params, sort_extractor):
1474             self._use_free_order = params.get('prefer_free_formats', False)
1475             self._sort_user = params.get('format_sort', [])
1476             self._sort_extractor = sort_extractor
1477
1478             def add_item(field, reverse, closest, limit_text):
1479                 field = field.lower()
1480                 if field in self._order:
1481                     return
1482                 self._order.append(field)
1483                 limit = self._resolve_field_value(field, limit_text)
1484                 data = {
1485                     'reverse': reverse,
1486                     'closest': False if limit is None else closest,
1487                     'limit_text': limit_text,
1488                     'limit': limit}
1489                 if field in self.settings:
1490                     self.settings[field].update(data)
1491                 else:
1492                     self.settings[field] = data
1493
1494             sort_list = (
1495                 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1496                 + (tuple() if params.get('format_sort_force', False)
1497                    else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1498                 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1499
1500             for item in sort_list:
1501                 match = re.match(self.regex, item)
1502                 if match is None:
1503                     raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1504                 field = match.group('field')
1505                 if field is None:
1506                     continue
1507                 if self._get_field_setting(field, 'type') == 'alias':
1508                     field = self._get_field_setting(field, 'field')
1509                 reverse = match.group('reverse') is not None
1510                 closest = match.group('seperator') == '~'
1511                 limit_text = match.group('limit')
1512
1513                 has_limit = limit_text is not None
1514                 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1515                 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1516
1517                 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1518                 limits = limit_text.split(":") if has_multiple_limits else (limit_text,) if has_limit else tuple()
1519                 limit_count = len(limits)
1520                 for (i, f) in enumerate(fields):
1521                     add_item(f, reverse, closest,
1522                              limits[i] if i < limit_count
1523                              else limits[0] if has_limit and not has_multiple_limits
1524                              else None)
1525
1526         def print_verbose_info(self, to_screen):
1527             to_screen('[debug] Sort order given by user: %s' % ','.join(self._sort_user))
1528             if self._sort_extractor:
1529                 to_screen('[debug] Sort order given by extractor: %s' % ','.join(self._sort_extractor))
1530             to_screen('[debug] Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1531                 '+' if self._get_field_setting(field, 'reverse') else '', field,
1532                 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1533                               self._get_field_setting(field, 'limit_text'),
1534                               self._get_field_setting(field, 'limit'))
1535                 if self._get_field_setting(field, 'limit_text') is not None else '')
1536                 for field in self._order if self._get_field_setting(field, 'visible')]))
1537
1538         def _calculate_field_preference_from_value(self, format, field, type, value):
1539             reverse = self._get_field_setting(field, 'reverse')
1540             closest = self._get_field_setting(field, 'closest')
1541             limit = self._get_field_setting(field, 'limit')
1542
1543             if type == 'extractor':
1544                 maximum = self._get_field_setting(field, 'max')
1545                 if value is None or (maximum is not None and value >= maximum):
1546                     value = 0
1547             elif type == 'boolean':
1548                 in_list = self._get_field_setting(field, 'in_list')
1549                 not_in_list = self._get_field_setting(field, 'not_in_list')
1550                 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1551             elif type == 'ordered':
1552                 value = self._resolve_field_value(field, value, True)
1553
1554             # try to convert to number
1555             val_num = float_or_none(value)
1556             is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1557             if is_num:
1558                 value = val_num
1559
1560             return ((-10, 0) if value is None
1561                     else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
1562                     else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1563                     else (0, value, 0) if not reverse and (limit is None or value <= limit)
1564                     else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1565                     else (-1, value, 0))
1566
1567         def _calculate_field_preference(self, format, field):
1568             type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
1569             get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1570             if type == 'multiple':
1571                 type = 'field'  # Only 'field' is allowed in multiple for now
1572                 actual_fields = self._get_field_setting(field, 'field')
1573
1574                 def wrapped_function(values):
1575                     values = tuple(filter(lambda x: x is not None, values))
1576                     return (self._get_field_setting(field, 'function')(*values) if len(values) > 1
1577                             else values[0] if values
1578                             else None)
1579
1580                 value = wrapped_function((get_value(f) for f in actual_fields))
1581             else:
1582                 value = get_value(field)
1583             return self._calculate_field_preference_from_value(format, field, type, value)
1584
1585         def calculate_preference(self, format):
1586             # Determine missing protocol
1587             if not format.get('protocol'):
1588                 format['protocol'] = determine_protocol(format)
1589
1590             # Determine missing ext
1591             if not format.get('ext') and 'url' in format:
1592                 format['ext'] = determine_ext(format['url'])
1593             if format.get('vcodec') == 'none':
1594                 format['audio_ext'] = format['ext']
1595                 format['video_ext'] = 'none'
1596             else:
1597                 format['video_ext'] = format['ext']
1598                 format['audio_ext'] = 'none'
1599             # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
1600             #    format['preference'] = -1000
1601
1602             # Determine missing bitrates
1603             if format.get('tbr') is None:
1604                 if format.get('vbr') is not None and format.get('abr') is not None:
1605                     format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1606             else:
1607                 if format.get('vcodec') != "none" and format.get('vbr') is None:
1608                     format['vbr'] = format.get('tbr') - format.get('abr', 0)
1609                 if format.get('acodec') != "none" and format.get('abr') is None:
1610                     format['abr'] = format.get('tbr') - format.get('vbr', 0)
1611
1612             return tuple(self._calculate_field_preference(format, field) for field in self._order)
1613
1614     def _sort_formats(self, formats, field_preference=[]):
1615         if not formats:
1616             raise ExtractorError('No video formats found')
1617         format_sort = self.FormatSort()  # params and to_screen are taken from the downloader
1618         format_sort.evaluate_params(self._downloader.params, field_preference)
1619         if self._downloader.params.get('verbose', False):
1620             format_sort.print_verbose_info(self._downloader.to_screen)
1621         formats.sort(key=lambda f: format_sort.calculate_preference(f))
1622
1623     def _check_formats(self, formats, video_id):
1624         if formats:
1625             formats[:] = filter(
1626                 lambda f: self._is_valid_url(
1627                     f['url'], video_id,
1628                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1629                 formats)
1630
1631     @staticmethod
1632     def _remove_duplicate_formats(formats):
1633         format_urls = set()
1634         unique_formats = []
1635         for f in formats:
1636             if f['url'] not in format_urls:
1637                 format_urls.add(f['url'])
1638                 unique_formats.append(f)
1639         formats[:] = unique_formats
1640
1641     def _is_valid_url(self, url, video_id, item='video', headers={}):
1642         url = self._proto_relative_url(url, scheme='http:')
1643         # For now assume non HTTP(S) URLs always valid
1644         if not (url.startswith('http://') or url.startswith('https://')):
1645             return True
1646         try:
1647             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1648             return True
1649         except ExtractorError as e:
1650             self.to_screen(
1651                 '%s: %s URL is invalid, skipping: %s'
1652                 % (video_id, item, error_to_compat_str(e.cause)))
1653             return False
1654
1655     def http_scheme(self):
1656         """ Either "http:" or "https:", depending on the user's preferences """
1657         return (
1658             'http:'
1659             if self._downloader.params.get('prefer_insecure', False)
1660             else 'https:')
1661
1662     def _proto_relative_url(self, url, scheme=None):
1663         if url is None:
1664             return url
1665         if url.startswith('//'):
1666             if scheme is None:
1667                 scheme = self.http_scheme()
1668             return scheme + url
1669         else:
1670             return url
1671
1672     def _sleep(self, timeout, video_id, msg_template=None):
1673         if msg_template is None:
1674             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1675         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1676         self.to_screen(msg)
1677         time.sleep(timeout)
1678
1679     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1680                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1681                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1682         manifest = self._download_xml(
1683             manifest_url, video_id, 'Downloading f4m manifest',
1684             'Unable to download f4m manifest',
1685             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1686             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1687             transform_source=transform_source,
1688             fatal=fatal, data=data, headers=headers, query=query)
1689
1690         if manifest is False:
1691             return []
1692
1693         return self._parse_f4m_formats(
1694             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1695             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1696
1697     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1698                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1699                            fatal=True, m3u8_id=None):
1700         if not isinstance(manifest, compat_etree_Element) and not fatal:
1701             return []
1702
1703         # currently youtube-dlc cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1704         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1705         if akamai_pv is not None and ';' in akamai_pv.text:
1706             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1707             if playerVerificationChallenge.strip() != '':
1708                 return []
1709
1710         formats = []
1711         manifest_version = '1.0'
1712         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1713         if not media_nodes:
1714             manifest_version = '2.0'
1715             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1716         # Remove unsupported DRM protected media from final formats
1717         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1718         media_nodes = remove_encrypted_media(media_nodes)
1719         if not media_nodes:
1720             return formats
1721
1722         manifest_base_url = get_base_url(manifest)
1723
1724         bootstrap_info = xpath_element(
1725             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1726             'bootstrap info', default=None)
1727
1728         vcodec = None
1729         mime_type = xpath_text(
1730             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1731             'base URL', default=None)
1732         if mime_type and mime_type.startswith('audio/'):
1733             vcodec = 'none'
1734
1735         for i, media_el in enumerate(media_nodes):
1736             tbr = int_or_none(media_el.attrib.get('bitrate'))
1737             width = int_or_none(media_el.attrib.get('width'))
1738             height = int_or_none(media_el.attrib.get('height'))
1739             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1740             # If <bootstrapInfo> is present, the specified f4m is a
1741             # stream-level manifest, and only set-level manifests may refer to
1742             # external resources.  See section 11.4 and section 4 of F4M spec
1743             if bootstrap_info is None:
1744                 media_url = None
1745                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1746                 if manifest_version == '2.0':
1747                     media_url = media_el.attrib.get('href')
1748                 if media_url is None:
1749                     media_url = media_el.attrib.get('url')
1750                 if not media_url:
1751                     continue
1752                 manifest_url = (
1753                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1754                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1755                 # If media_url is itself a f4m manifest do the recursive extraction
1756                 # since bitrates in parent manifest (this one) and media_url manifest
1757                 # may differ leading to inability to resolve the format by requested
1758                 # bitrate in f4m downloader
1759                 ext = determine_ext(manifest_url)
1760                 if ext == 'f4m':
1761                     f4m_formats = self._extract_f4m_formats(
1762                         manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1763                         transform_source=transform_source, fatal=fatal)
1764                     # Sometimes stream-level manifest contains single media entry that
1765                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1766                     # At the same time parent's media entry in set-level manifest may
1767                     # contain it. We will copy it from parent in such cases.
1768                     if len(f4m_formats) == 1:
1769                         f = f4m_formats[0]
1770                         f.update({
1771                             'tbr': f.get('tbr') or tbr,
1772                             'width': f.get('width') or width,
1773                             'height': f.get('height') or height,
1774                             'format_id': f.get('format_id') if not tbr else format_id,
1775                             'vcodec': vcodec,
1776                         })
1777                     formats.extend(f4m_formats)
1778                     continue
1779                 elif ext == 'm3u8':
1780                     formats.extend(self._extract_m3u8_formats(
1781                         manifest_url, video_id, 'mp4', preference=preference,
1782                         m3u8_id=m3u8_id, fatal=fatal))
1783                     continue
1784             formats.append({
1785                 'format_id': format_id,
1786                 'url': manifest_url,
1787                 'manifest_url': manifest_url,
1788                 'ext': 'flv' if bootstrap_info is not None else None,
1789                 'protocol': 'f4m',
1790                 'tbr': tbr,
1791                 'width': width,
1792                 'height': height,
1793                 'vcodec': vcodec,
1794                 'preference': preference,
1795             })
1796         return formats
1797
1798     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1799         return {
1800             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1801             'url': m3u8_url,
1802             'ext': ext,
1803             'protocol': 'm3u8',
1804             'preference': preference - 100 if preference else -100,
1805             'resolution': 'multiple',
1806             'format_note': 'Quality selection URL',
1807         }
1808
1809     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1810                               entry_protocol='m3u8', preference=None,
1811                               m3u8_id=None, note=None, errnote=None,
1812                               fatal=True, live=False, data=None, headers={},
1813                               query={}):
1814         res = self._download_webpage_handle(
1815             m3u8_url, video_id,
1816             note=note or 'Downloading m3u8 information',
1817             errnote=errnote or 'Failed to download m3u8 information',
1818             fatal=fatal, data=data, headers=headers, query=query)
1819
1820         if res is False:
1821             return []
1822
1823         m3u8_doc, urlh = res
1824         m3u8_url = urlh.geturl()
1825
1826         return self._parse_m3u8_formats(
1827             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1828             preference=preference, m3u8_id=m3u8_id, live=live)
1829
    def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
                            entry_protocol='m3u8', preference=None,
                            m3u8_id=None, live=False):
        """
        Build a list of format dicts from the text of an HLS playlist.

        @param m3u8_doc        Playlist contents as a string.
        @param m3u8_url        URL the playlist was obtained from; relative
                               URIs are resolved against it and it is stored
                               as 'manifest_url' of the produced formats.
        @param ext             Value copied into the 'ext' field of every
                               format.
        @param entry_protocol  Value copied into the 'protocol' field of
                               every HLS format.
        @param preference      Value copied into the 'preference' field.
        @param m3u8_id         Optional prefix for generated format ids.
        @param live            When true, neither the stream name nor the
                               bitrate is appended to the format id of
                               variant streams.

        Returns an empty list for playlists carrying Adobe Flash Access or
        Apple FairPlay markers, a single format pointing at m3u8_url for a
        media playlist, and otherwise one format per EXT-X-MEDIA rendition
        (VIDEO/AUDIO with URI) and per EXT-X-STREAM-INF variant, plus an
        extra 'http' format for variants carrying a PROGRESSIVE-URI.
        """
        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
            return []

        if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc):  # Apple FairPlay
            return []

        formats = []

        # Absolute http(s) URLs are used as is, anything else is resolved
        # relative to the playlist URL
        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
        # 3. https://github.com/ytdl-org/youtube-dl/issues/18923

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without quality renditions.
        # Fortunately, a master playlist can be easily distinguished from a media
        # playlist based on the availability of particular tags. As of
        # [1, 4.3.3, 4.3.4] master playlist tags MUST NOT appear in a media
        # playlist and vice versa.
        # As of [1, 4.3.3.1] the #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in a master playlist, thus we can
        # clearly detect a media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            return [{
                'url': m3u8_url,
                'format_id': m3u8_id,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }]

        # GROUP-ID -> list of parsed EXT-X-MEDIA attribute dicts
        groups = {}
        # Attributes of the most recent EXT-X-STREAM-INF tag that has not yet
        # been paired with its URI line
        last_stream_inf = {}

        def extract_media(x_media_line):
            # Record the rendition in its group and, for VIDEO/AUDIO
            # renditions that have their own URI, emit a format for it
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                # format_id is the '-' joined non-empty parts of
                # (m3u8_id, GROUP-ID, NAME)
                format_id = []
                for v in (m3u8_id, group_id, name):
                    if v:
                        format_id.append(v)
                f = {
                    'format_id': '-'.join(format_id),
                    'url': format_url(media_url),
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                if media_type == 'AUDIO':
                    f['vcodec'] = 'none'
                formats.append(f)

        def build_stream_name():
            # Although the specification does not mention a NAME attribute
            # for the EXT-X-STREAM-INF tag it may still sometimes be present
            # (see [1] or vidio test in
            # TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from the corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            # Only the first rendition of the group is consulted
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
        # chance to detect video only formats when EXT-X-STREAM-INF tags
        # precede EXT-X-MEDIA tags in HLS manifest such as [3].
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)

        # Second pass: every non-blank, non-tag line is treated as the URI of
        # a variant stream described by the preceding EXT-X-STREAM-INF tag
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                # AVERAGE-BANDWIDTH takes precedence over BANDWIDTH
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH')
                    or last_stream_inf.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                stream_name = build_stream_name()
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                # format_id intact.
                if not live:
                    # Fall back to the bitrate, then to the number of formats
                    # collected so far, when no stream name is available
                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                manifest_url = format_url(line.strip())
                f = {
                    'format_id': '-'.join(format_id),
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'tbr': tbr,
                    'ext': ext,
                    'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                resolution = last_stream_inf.get('RESOLUTION')
                if resolution:
                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                    if mobj:
                        f['width'] = int(mobj.group('width'))
                        f['height'] = int(mobj.group('height'))
                # Unified Streaming Platform: audio (and optionally video)
                # bitrates are encoded in the variant URL itself
                mobj = re.search(
                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                if mobj:
                    abr, vbr = mobj.groups()
                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                    f.update({
                        'vbr': vbr,
                        'abr': abr,
                    })
                codecs = parse_codecs(last_stream_inf.get('CODECS'))
                f.update(codecs)
                audio_group_id = last_stream_inf.get('AUDIO')
                # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                # references a rendition group MUST have a CODECS attribute.
                # However, this is not always respected, for example, [2]
                # contains EXT-X-STREAM-INF tag which references AUDIO
                # rendition group but does not have CODECS and despite
                # referencing an audio group it represents a complete
                # (with audio and video) format. So, for such cases we will
                # ignore references to rendition groups and treat them
                # as complete formats.
                if audio_group_id and codecs and f.get('vcodec') != 'none':
                    audio_group = groups.get(audio_group_id)
                    if audio_group and audio_group[0].get('URI'):
                        # TODO: update acodec for audio only formats with
                        # the same GROUP-ID
                        f['acodec'] = 'none'
                formats.append(f)

                # for DailyMotion: a variant may additionally advertise a
                # direct URL, exposed here as a separate 'http' format
                progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
                if progressive_uri:
                    http_f = f.copy()
                    del http_f['manifest_url']
                    http_f.update({
                        'format_id': f['format_id'].replace('hls-', 'http-'),
                        'protocol': 'http',
                        'url': progressive_uri,
                    })
                    formats.append(http_f)

                # The tag has been consumed; do not let its attributes leak
                # into a following URI line that lacks its own tag
                last_stream_inf = {}
        return formats
2007
2008     @staticmethod
2009     def _xpath_ns(path, namespace=None):
2010         if not namespace:
2011             return path
2012         out = []
2013         for c in path.split('/'):
2014             if not c or c == '.':
2015                 out.append(c)
2016             else:
2017                 out.append('{%s}%s' % (namespace, c))
2018         return '/'.join(out)
2019
2020     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2021         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2022
2023         if smil is False:
2024             assert not fatal
2025             return []
2026
2027         namespace = self._parse_smil_namespace(smil)
2028
2029         return self._parse_smil_formats(
2030             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2031
2032     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2033         smil = self._download_smil(smil_url, video_id, fatal=fatal)
2034         if smil is False:
2035             return {}
2036         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2037
2038     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2039         return self._download_xml(
2040             smil_url, video_id, 'Downloading SMIL file',
2041             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2042
2043     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2044         namespace = self._parse_smil_namespace(smil)
2045
2046         formats = self._parse_smil_formats(
2047             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2048         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2049
2050         video_id = os.path.splitext(url_basename(smil_url))[0]
2051         title = None
2052         description = None
2053         upload_date = None
2054         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2055             name = meta.attrib.get('name')
2056             content = meta.attrib.get('content')
2057             if not name or not content:
2058                 continue
2059             if not title and name == 'title':
2060                 title = content
2061             elif not description and name in ('description', 'abstract'):
2062                 description = content
2063             elif not upload_date and name == 'date':
2064                 upload_date = unified_strdate(content)
2065
2066         thumbnails = [{
2067             'id': image.get('type'),
2068             'url': image.get('src'),
2069             'width': int_or_none(image.get('width')),
2070             'height': int_or_none(image.get('height')),
2071         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2072
2073         return {
2074             'id': video_id,
2075             'title': title or video_id,
2076             'description': description,
2077             'upload_date': upload_date,
2078             'thumbnails': thumbnails,
2079             'formats': formats,
2080             'subtitles': subtitles,
2081         }
2082
2083     def _parse_smil_namespace(self, smil):
2084         return self._search_regex(
2085             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2086
2087     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2088         base = smil_url
2089         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2090             b = meta.get('base') or meta.get('httpBase')
2091             if b:
2092                 base = b
2093                 break
2094
2095         formats = []
2096         rtmp_count = 0
2097         http_count = 0
2098         m3u8_count = 0
2099
2100         srcs = []
2101         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2102         for medium in media:
2103             src = medium.get('src')
2104             if not src or src in srcs:
2105                 continue
2106             srcs.append(src)
2107
2108             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2109             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2110             width = int_or_none(medium.get('width'))
2111             height = int_or_none(medium.get('height'))
2112             proto = medium.get('proto')
2113             ext = medium.get('ext')
2114             src_ext = determine_ext(src)
2115             streamer = medium.get('streamer') or base
2116
2117             if proto == 'rtmp' or streamer.startswith('rtmp'):
2118                 rtmp_count += 1
2119                 formats.append({
2120                     'url': streamer,
2121                     'play_path': src,
2122                     'ext': 'flv',
2123                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2124                     'tbr': bitrate,
2125                     'filesize': filesize,
2126                     'width': width,
2127                     'height': height,
2128                 })
2129                 if transform_rtmp_url:
2130                     streamer, src = transform_rtmp_url(streamer, src)
2131                     formats[-1].update({
2132                         'url': streamer,
2133                         'play_path': src,
2134                     })
2135                 continue
2136
2137             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2138             src_url = src_url.strip()
2139
2140             if proto == 'm3u8' or src_ext == 'm3u8':
2141                 m3u8_formats = self._extract_m3u8_formats(
2142                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2143                 if len(m3u8_formats) == 1:
2144                     m3u8_count += 1
2145                     m3u8_formats[0].update({
2146                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2147                         'tbr': bitrate,
2148                         'width': width,
2149                         'height': height,
2150                     })
2151                 formats.extend(m3u8_formats)
2152             elif src_ext == 'f4m':
2153                 f4m_url = src_url
2154                 if not f4m_params:
2155                     f4m_params = {
2156                         'hdcore': '3.2.0',
2157                         'plugin': 'flowplayer-3.2.0.1',
2158                     }
2159                 f4m_url += '&' if '?' in f4m_url else '?'
2160                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2161                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2162             elif src_ext == 'mpd':
2163                 formats.extend(self._extract_mpd_formats(
2164                     src_url, video_id, mpd_id='dash', fatal=False))
2165             elif re.search(r'\.ism/[Mm]anifest', src_url):
2166                 formats.extend(self._extract_ism_formats(
2167                     src_url, video_id, ism_id='mss', fatal=False))
2168             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2169                 http_count += 1
2170                 formats.append({
2171                     'url': src_url,
2172                     'ext': ext or src_ext or 'flv',
2173                     'format_id': 'http-%d' % (bitrate or http_count),
2174                     'tbr': bitrate,
2175                     'filesize': filesize,
2176                     'width': width,
2177                     'height': height,
2178                 })
2179
2180         return formats
2181
2182     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2183         urls = []
2184         subtitles = {}
2185         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2186             src = textstream.get('src')
2187             if not src or src in urls:
2188                 continue
2189             urls.append(src)
2190             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2191             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2192             subtitles.setdefault(lang, []).append({
2193                 'url': src,
2194                 'ext': ext,
2195             })
2196         return subtitles
2197
2198     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2199         xspf = self._download_xml(
2200             xspf_url, playlist_id, 'Downloading xpsf playlist',
2201             'Unable to download xspf manifest', fatal=fatal)
2202         if xspf is False:
2203             return []
2204         return self._parse_xspf(
2205             xspf, playlist_id, xspf_url=xspf_url,
2206             xspf_base_url=base_url(xspf_url))
2207
2208     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2209         NS_MAP = {
2210             'xspf': 'http://xspf.org/ns/0/',
2211             's1': 'http://static.streamone.nl/player/ns/0',
2212         }
2213
2214         entries = []
2215         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2216             title = xpath_text(
2217                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2218             description = xpath_text(
2219                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2220             thumbnail = xpath_text(
2221                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2222             duration = float_or_none(
2223                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2224
2225             formats = []
2226             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2227                 format_url = urljoin(xspf_base_url, location.text)
2228                 if not format_url:
2229                     continue
2230                 formats.append({
2231                     'url': format_url,
2232                     'manifest_url': xspf_url,
2233                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2234                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2235                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2236                 })
2237             self._sort_formats(formats)
2238
2239             entries.append({
2240                 'id': playlist_id,
2241                 'title': title,
2242                 'description': description,
2243                 'thumbnail': thumbnail,
2244                 'duration': duration,
2245                 'formats': formats,
2246             })
2247         return entries
2248
2249     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}, data=None, headers={}, query={}):
2250         res = self._download_xml_handle(
2251             mpd_url, video_id,
2252             note=note or 'Downloading MPD manifest',
2253             errnote=errnote or 'Failed to download MPD manifest',
2254             fatal=fatal, data=data, headers=headers, query=query)
2255         if res is False:
2256             return []
2257         mpd_doc, urlh = res
2258         if mpd_doc is None:
2259             return []
2260         mpd_base_url = base_url(urlh.geturl())
2261
2262         return self._parse_mpd_formats(
2263             mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url,
2264             formats_dict=formats_dict, mpd_url=mpd_url)
2265
2266     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
2267         """
2268         Parse formats from MPD manifest.
2269         References:
2270          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2271             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2272          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2273         """
2274         if not self._downloader.params.get('dynamic_mpd'):
2275             if mpd_doc.get('type') == 'dynamic':
2276                 return []
2277
2278         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2279
2280         def _add_ns(path):
2281             return self._xpath_ns(path, namespace)
2282
2283         def is_drm_protected(element):
2284             return element.find(_add_ns('ContentProtection')) is not None
2285
2286         def extract_multisegment_info(element, ms_parent_info):
2287             ms_info = ms_parent_info.copy()
2288
2289             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2290             # common attributes and elements.  We will only extract relevant
2291             # for us.
2292             def extract_common(source):
2293                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2294                 if segment_timeline is not None:
2295                     s_e = segment_timeline.findall(_add_ns('S'))
2296                     if s_e:
2297                         ms_info['total_number'] = 0
2298                         ms_info['s'] = []
2299                         for s in s_e:
2300                             r = int(s.get('r', 0))
2301                             ms_info['total_number'] += 1 + r
2302                             ms_info['s'].append({
2303                                 't': int(s.get('t', 0)),
2304                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2305                                 'd': int(s.attrib['d']),
2306                                 'r': r,
2307                             })
2308                 start_number = source.get('startNumber')
2309                 if start_number:
2310                     ms_info['start_number'] = int(start_number)
2311                 timescale = source.get('timescale')
2312                 if timescale:
2313                     ms_info['timescale'] = int(timescale)
2314                 segment_duration = source.get('duration')
2315                 if segment_duration:
2316                     ms_info['segment_duration'] = float(segment_duration)
2317
2318             def extract_Initialization(source):
2319                 initialization = source.find(_add_ns('Initialization'))
2320                 if initialization is not None:
2321                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2322
2323             segment_list = element.find(_add_ns('SegmentList'))
2324             if segment_list is not None:
2325                 extract_common(segment_list)
2326                 extract_Initialization(segment_list)
2327                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2328                 if segment_urls_e:
2329                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2330             else:
2331                 segment_template = element.find(_add_ns('SegmentTemplate'))
2332                 if segment_template is not None:
2333                     extract_common(segment_template)
2334                     media = segment_template.get('media')
2335                     if media:
2336                         ms_info['media'] = media
2337                     initialization = segment_template.get('initialization')
2338                     if initialization:
2339                         ms_info['initialization'] = initialization
2340                     else:
2341                         extract_Initialization(segment_template)
2342             return ms_info
2343
2344         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2345         formats = []
2346         for period in mpd_doc.findall(_add_ns('Period')):
2347             period_duration = parse_duration(period.get('duration')) or mpd_duration
2348             period_ms_info = extract_multisegment_info(period, {
2349                 'start_number': 1,
2350                 'timescale': 1,
2351             })
2352             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2353                 if is_drm_protected(adaptation_set):
2354                     continue
2355                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2356                 for representation in adaptation_set.findall(_add_ns('Representation')):
2357                     if is_drm_protected(representation):
2358                         continue
2359                     representation_attrib = adaptation_set.attrib.copy()
2360                     representation_attrib.update(representation.attrib)
2361                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2362                     mime_type = representation_attrib['mimeType']
2363                     content_type = mime_type.split('/')[0]
2364                     if content_type == 'text':
2365                         # TODO implement WebVTT downloading
2366                         pass
2367                     elif content_type in ('video', 'audio'):
2368                         base_url = ''
2369                         for element in (representation, adaptation_set, period, mpd_doc):
2370                             base_url_e = element.find(_add_ns('BaseURL'))
2371                             if base_url_e is not None:
2372                                 base_url = base_url_e.text + base_url
2373                                 if re.match(r'^https?://', base_url):
2374                                     break
2375                         if mpd_base_url and not re.match(r'^https?://', base_url):
2376                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
2377                                 mpd_base_url += '/'
2378                             base_url = mpd_base_url + base_url
2379                         representation_id = representation_attrib.get('id')
2380                         lang = representation_attrib.get('lang')
2381                         url_el = representation.find(_add_ns('BaseURL'))
2382                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2383                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2384                         f = {
2385                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
2386                             'manifest_url': mpd_url,
2387                             'ext': mimetype2ext(mime_type),
2388                             'width': int_or_none(representation_attrib.get('width')),
2389                             'height': int_or_none(representation_attrib.get('height')),
2390                             'tbr': float_or_none(bandwidth, 1000),
2391                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2392                             'fps': int_or_none(representation_attrib.get('frameRate')),
2393                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2394                             'format_note': 'DASH %s' % content_type,
2395                             'filesize': filesize,
2396                             'container': mimetype2ext(mime_type) + '_dash',
2397                         }
2398                         f.update(parse_codecs(representation_attrib.get('codecs')))
2399                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2400
2401                         def prepare_template(template_name, identifiers):
2402                             tmpl = representation_ms_info[template_name]
2403                             # First of, % characters outside $...$ templates
2404                             # must be escaped by doubling for proper processing
2405                             # by % operator string formatting used further (see
2406                             # https://github.com/ytdl-org/youtube-dl/issues/16867).
2407                             t = ''
2408                             in_template = False
2409                             for c in tmpl:
2410                                 t += c
2411                                 if c == '$':
2412                                     in_template = not in_template
2413                                 elif c == '%' and not in_template:
2414                                     t += c
2415                             # Next, $...$ templates are translated to their
2416                             # %(...) counterparts to be used with % operator
2417                             t = t.replace('$RepresentationID$', representation_id)
2418                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2419                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2420                             t.replace('$$', '$')
2421                             return t
2422
2423                         # @initialization is a regular template like @media one
2424                         # so it should be handled just the same way (see
2425                         # https://github.com/ytdl-org/youtube-dl/issues/11605)
2426                         if 'initialization' in representation_ms_info:
2427                             initialization_template = prepare_template(
2428                                 'initialization',
2429                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2430                                 # $Time$ shall not be included for @initialization thus
2431                                 # only $Bandwidth$ remains
2432                                 ('Bandwidth', ))
2433                             representation_ms_info['initialization_url'] = initialization_template % {
2434                                 'Bandwidth': bandwidth,
2435                             }
2436
2437                         def location_key(location):
2438                             return 'url' if re.match(r'^https?://', location) else 'path'
2439
2440                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2441
2442                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2443                             media_location_key = location_key(media_template)
2444
2445                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2446                             # can't be used at the same time
2447                             if '%(Number' in media_template and 's' not in representation_ms_info:
2448                                 segment_duration = None
2449                                 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2450                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2451                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2452                                 representation_ms_info['fragments'] = [{
2453                                     media_location_key: media_template % {
2454                                         'Number': segment_number,
2455                                         'Bandwidth': bandwidth,
2456                                     },
2457                                     'duration': segment_duration,
2458                                 } for segment_number in range(
2459                                     representation_ms_info['start_number'],
2460                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2461                             else:
2462                                 # $Number*$ or $Time$ in media template with S list available
2463                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2464                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2465                                 representation_ms_info['fragments'] = []
2466                                 segment_time = 0
2467                                 segment_d = None
2468                                 segment_number = representation_ms_info['start_number']
2469
2470                                 def add_segment_url():
2471                                     segment_url = media_template % {
2472                                         'Time': segment_time,
2473                                         'Bandwidth': bandwidth,
2474                                         'Number': segment_number,
2475                                     }
2476                                     representation_ms_info['fragments'].append({
2477                                         media_location_key: segment_url,
2478                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2479                                     })
2480
2481                                 for num, s in enumerate(representation_ms_info['s']):
2482                                     segment_time = s.get('t') or segment_time
2483                                     segment_d = s['d']
2484                                     add_segment_url()
2485                                     segment_number += 1
2486                                     for r in range(s.get('r', 0)):
2487                                         segment_time += segment_d
2488                                         add_segment_url()
2489                                         segment_number += 1
2490                                     segment_time += segment_d
2491                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2492                             # No media template
2493                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2494                             # or any YouTube dashsegments video
2495                             fragments = []
2496                             segment_index = 0
2497                             timescale = representation_ms_info['timescale']
2498                             for s in representation_ms_info['s']:
2499                                 duration = float_or_none(s['d'], timescale)
2500                                 for r in range(s.get('r', 0) + 1):
2501                                     segment_uri = representation_ms_info['segment_urls'][segment_index]
2502                                     fragments.append({
2503                                         location_key(segment_uri): segment_uri,
2504                                         'duration': duration,
2505                                     })
2506                                     segment_index += 1
2507                             representation_ms_info['fragments'] = fragments
2508                         elif 'segment_urls' in representation_ms_info:
2509                             # Segment URLs with no SegmentTimeline
2510                             # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2511                             # https://github.com/ytdl-org/youtube-dl/pull/14844
2512                             fragments = []
2513                             segment_duration = float_or_none(
2514                                 representation_ms_info['segment_duration'],
2515                                 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2516                             for segment_url in representation_ms_info['segment_urls']:
2517                                 fragment = {
2518                                     location_key(segment_url): segment_url,
2519                                 }
2520                                 if segment_duration:
2521                                     fragment['duration'] = segment_duration
2522                                 fragments.append(fragment)
2523                             representation_ms_info['fragments'] = fragments
2524                         # If there is a fragments key available then we correctly recognized fragmented media.
2525                         # Otherwise we will assume unfragmented media with direct access. Technically, such
2526                         # assumption is not necessarily correct since we may simply have no support for
2527                         # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2528                         if 'fragments' in representation_ms_info:
2529                             f.update({
2530                                 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2531                                 'url': mpd_url or base_url,
2532                                 'fragment_base_url': base_url,
2533                                 'fragments': [],
2534                                 'protocol': 'http_dash_segments',
2535                             })
2536                             if 'initialization_url' in representation_ms_info:
2537                                 initialization_url = representation_ms_info['initialization_url']
2538                                 if not f.get('url'):
2539                                     f['url'] = initialization_url
2540                                 f['fragments'].append({location_key(initialization_url): initialization_url})
2541                             f['fragments'].extend(representation_ms_info['fragments'])
2542                         else:
2543                             # Assuming direct URL to unfragmented media.
2544                             f['url'] = base_url
2545
2546                         # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation
2547                         # is not necessarily unique within a Period thus formats with
2548                         # the same `format_id` are quite possible. There are numerous examples
2549                         # of such manifests (see https://github.com/ytdl-org/youtube-dl/issues/15111,
2550                         # https://github.com/ytdl-org/youtube-dl/issues/13919)
2551                         full_info = formats_dict.get(representation_id, {}).copy()
2552                         full_info.update(f)
2553                         formats.append(full_info)
2554                     else:
2555                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2556         return formats
2557
2558     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2559         res = self._download_xml_handle(
2560             ism_url, video_id,
2561             note=note or 'Downloading ISM manifest',
2562             errnote=errnote or 'Failed to download ISM manifest',
2563             fatal=fatal, data=data, headers=headers, query=query)
2564         if res is False:
2565             return []
2566         ism_doc, urlh = res
2567         if ism_doc is None:
2568             return []
2569
2570         return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
2571
2572     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
2573         """
2574         Parse formats from ISM manifest.
2575         References:
2576          1. [MS-SSTR]: Smooth Streaming Protocol,
2577             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2578         """
2579         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
2580             return []
2581
2582         duration = int(ism_doc.attrib['Duration'])
2583         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2584
2585         formats = []
2586         for stream in ism_doc.findall('StreamIndex'):
2587             stream_type = stream.get('Type')
2588             if stream_type not in ('video', 'audio'):
2589                 continue
2590             url_pattern = stream.attrib['Url']
2591             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2592             stream_name = stream.get('Name')
2593             for track in stream.findall('QualityLevel'):
2594                 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
2595                 # TODO: add support for WVC1 and WMAP
2596                 if fourcc not in ('H264', 'AVC1', 'AACL'):
2597                     self.report_warning('%s is not a supported codec' % fourcc)
2598                     continue
2599                 tbr = int(track.attrib['Bitrate']) // 1000
2600                 # [1] does not mention Width and Height attributes. However,
2601                 # they're often present while MaxWidth and MaxHeight are
2602                 # missing, so should be used as fallbacks
2603                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2604                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2605                 sampling_rate = int_or_none(track.get('SamplingRate'))
2606
2607                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2608                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2609
2610                 fragments = []
2611                 fragment_ctx = {
2612                     'time': 0,
2613                 }
2614                 stream_fragments = stream.findall('c')
2615                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2616                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2617                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2618                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2619                     if not fragment_ctx['duration']:
2620                         try:
2621                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2622                         except IndexError:
2623                             next_fragment_time = duration
2624                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2625                     for _ in range(fragment_repeat):
2626                         fragments.append({
2627                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2628                             'duration': fragment_ctx['duration'] / stream_timescale,
2629                         })
2630                         fragment_ctx['time'] += fragment_ctx['duration']
2631
2632                 format_id = []
2633                 if ism_id:
2634                     format_id.append(ism_id)
2635                 if stream_name:
2636                     format_id.append(stream_name)
2637                 format_id.append(compat_str(tbr))
2638
2639                 formats.append({
2640                     'format_id': '-'.join(format_id),
2641                     'url': ism_url,
2642                     'manifest_url': ism_url,
2643                     'ext': 'ismv' if stream_type == 'video' else 'isma',
2644                     'width': width,
2645                     'height': height,
2646                     'tbr': tbr,
2647                     'asr': sampling_rate,
2648                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
2649                     'acodec': 'none' if stream_type == 'video' else fourcc,
2650                     'protocol': 'ism',
2651                     'fragments': fragments,
2652                     '_download_params': {
2653                         'duration': duration,
2654                         'timescale': stream_timescale,
2655                         'width': width or 0,
2656                         'height': height or 0,
2657                         'fourcc': fourcc,
2658                         'codec_private_data': track.get('CodecPrivateData'),
2659                         'sampling_rate': sampling_rate,
2660                         'channels': int_or_none(track.get('Channels', 2)),
2661                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2662                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2663                     },
2664                 })
2665         return formats
2666
2667     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
2668         def absolute_url(item_url):
2669             return urljoin(base_url, item_url)
2670
2671         def parse_content_type(content_type):
2672             if not content_type:
2673                 return {}
2674             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2675             if ctr:
2676                 mimetype, codecs = ctr.groups()
2677                 f = parse_codecs(codecs)
2678                 f['ext'] = mimetype2ext(mimetype)
2679                 return f
2680             return {}
2681
2682         def _media_formats(src, cur_media_type, type_info={}):
2683             full_url = absolute_url(src)
2684             ext = type_info.get('ext') or determine_ext(full_url)
2685             if ext == 'm3u8':
2686                 is_plain_url = False
2687                 formats = self._extract_m3u8_formats(
2688                     full_url, video_id, ext='mp4',
2689                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2690                     preference=preference, fatal=False)
2691             elif ext == 'mpd':
2692                 is_plain_url = False
2693                 formats = self._extract_mpd_formats(
2694                     full_url, video_id, mpd_id=mpd_id, fatal=False)
2695             else:
2696                 is_plain_url = True
2697                 formats = [{
2698                     'url': full_url,
2699                     'vcodec': 'none' if cur_media_type == 'audio' else None,
2700                 }]
2701             return is_plain_url, formats
2702
2703         entries = []
2704         # amp-video and amp-audio are very similar to their HTML5 counterparts
2705         # so we wll include them right here (see
2706         # https://www.ampproject.org/docs/reference/components/amp-video)
2707         media_tags = [(media_tag, media_type, '')
2708                       for media_tag, media_type
2709                       in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
2710         media_tags.extend(re.findall(
2711             # We only allow video|audio followed by a whitespace or '>'.
2712             # Allowing more characters may end up in significant slow down (see
2713             # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
2714             # http://www.porntrex.com/maps/videositemap.xml).
2715             r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
2716         for media_tag, media_type, media_content in media_tags:
2717             media_info = {
2718                 'formats': [],
2719                 'subtitles': {},
2720             }
2721             media_attributes = extract_attributes(media_tag)
2722             src = strip_or_none(media_attributes.get('src'))
2723             if src:
2724                 _, formats = _media_formats(src, media_type)
2725                 media_info['formats'].extend(formats)
2726             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
2727             if media_content:
2728                 for source_tag in re.findall(r'<source[^>]+>', media_content):
2729                     s_attr = extract_attributes(source_tag)
2730                     # data-video-src and data-src are non standard but seen
2731                     # several times in the wild
2732                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
2733                     if not src:
2734                         continue
2735                     f = parse_content_type(s_attr.get('type'))
2736                     is_plain_url, formats = _media_formats(src, media_type, f)
2737                     if is_plain_url:
2738                         # width, height, res, label and title attributes are
2739                         # all not standard but seen several times in the wild
2740                         labels = [
2741                             s_attr.get(lbl)
2742                             for lbl in ('label', 'title')
2743                             if str_or_none(s_attr.get(lbl))
2744                         ]
2745                         width = int_or_none(s_attr.get('width'))
2746                         height = (int_or_none(s_attr.get('height'))
2747                                   or int_or_none(s_attr.get('res')))
2748                         if not width or not height:
2749                             for lbl in labels:
2750                                 resolution = parse_resolution(lbl)
2751                                 if not resolution:
2752                                     continue
2753                                 width = width or resolution.get('width')
2754                                 height = height or resolution.get('height')
2755                         for lbl in labels:
2756                             tbr = parse_bitrate(lbl)
2757                             if tbr:
2758                                 break
2759                         else:
2760                             tbr = None
2761                         f.update({
2762                             'width': width,
2763                             'height': height,
2764                             'tbr': tbr,
2765                             'format_id': s_attr.get('label') or s_attr.get('title'),
2766                         })
2767                         f.update(formats[0])
2768                         media_info['formats'].append(f)
2769                     else:
2770                         media_info['formats'].extend(formats)
2771                 for track_tag in re.findall(r'<track[^>]+>', media_content):
2772                     track_attributes = extract_attributes(track_tag)
2773                     kind = track_attributes.get('kind')
2774                     if not kind or kind in ('subtitles', 'captions'):
2775                         src = strip_or_none(track_attributes.get('src'))
2776                         if not src:
2777                             continue
2778                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
2779                         media_info['subtitles'].setdefault(lang, []).append({
2780                             'url': absolute_url(src),
2781                         })
2782             for f in media_info['formats']:
2783                 f.setdefault('http_headers', {})['Referer'] = base_url
2784             if media_info['formats'] or media_info['subtitles']:
2785                 entries.append(media_info)
2786         return entries
2787
2788     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2789         formats = []
2790
2791         hdcore_sign = 'hdcore=3.7.0'
2792         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2793         hds_host = hosts.get('hds')
2794         if hds_host:
2795             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2796         if 'hdcore=' not in f4m_url:
2797             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2798         f4m_formats = self._extract_f4m_formats(
2799             f4m_url, video_id, f4m_id='hds', fatal=False)
2800         for entry in f4m_formats:
2801             entry.update({'extra_param_to_segment_url': hdcore_sign})
2802         formats.extend(f4m_formats)
2803
2804         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2805         hls_host = hosts.get('hls')
2806         if hls_host:
2807             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2808         formats.extend(self._extract_m3u8_formats(
2809             m3u8_url, video_id, 'mp4', 'm3u8_native',
2810             m3u8_id='hls', fatal=False))
2811
2812         http_host = hosts.get('http')
2813         if http_host and 'hdnea=' not in manifest_url:
2814             REPL_REGEX = r'https://[^/]+/i/([^,]+),([^/]+),([^/]+).csmil/.+'
2815             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
2816             qualities_length = len(qualities)
2817             if len(formats) in (qualities_length + 1, qualities_length * 2 + 1):
2818                 i = 0
2819                 http_formats = []
2820                 for f in formats:
2821                     if f['protocol'] == 'm3u8_native' and f['vcodec'] != 'none':
2822                         for protocol in ('http', 'https'):
2823                             http_f = f.copy()
2824                             del http_f['manifest_url']
2825                             http_url = re.sub(
2826                                 REPL_REGEX, protocol + r'://%s/\1%s\3' % (http_host, qualities[i]), f['url'])
2827                             http_f.update({
2828                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
2829                                 'url': http_url,
2830                                 'protocol': protocol,
2831                             })
2832                             http_formats.append(http_f)
2833                         i += 1
2834                 formats.extend(http_formats)
2835
2836         return formats
2837
2838     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2839         query = compat_urlparse.urlparse(url).query
2840         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2841         mobj = re.search(
2842             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
2843         url_base = mobj.group('url')
2844         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
2845         formats = []
2846
2847         def manifest_url(manifest):
2848             m_url = '%s/%s' % (http_base_url, manifest)
2849             if query:
2850                 m_url += '?%s' % query
2851             return m_url
2852
2853         if 'm3u8' not in skip_protocols:
2854             formats.extend(self._extract_m3u8_formats(
2855                 manifest_url('playlist.m3u8'), video_id, 'mp4',
2856                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2857         if 'f4m' not in skip_protocols:
2858             formats.extend(self._extract_f4m_formats(
2859                 manifest_url('manifest.f4m'),
2860                 video_id, f4m_id='hds', fatal=False))
2861         if 'dash' not in skip_protocols:
2862             formats.extend(self._extract_mpd_formats(
2863                 manifest_url('manifest.mpd'),
2864                 video_id, mpd_id='dash', fatal=False))
2865         if re.search(r'(?:/smil:|\.smil)', url_base):
2866             if 'smil' not in skip_protocols:
2867                 rtmp_formats = self._extract_smil_formats(
2868                     manifest_url('jwplayer.smil'),
2869                     video_id, fatal=False)
2870                 for rtmp_format in rtmp_formats:
2871                     rtsp_format = rtmp_format.copy()
2872                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2873                     del rtsp_format['play_path']
2874                     del rtsp_format['ext']
2875                     rtsp_format.update({
2876                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2877                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2878                         'protocol': 'rtsp',
2879                     })
2880                     formats.extend([rtmp_format, rtsp_format])
2881         else:
2882             for protocol in ('rtmp', 'rtsp'):
2883                 if protocol not in skip_protocols:
2884                     formats.append({
2885                         'url': '%s:%s' % (protocol, url_base),
2886                         'format_id': protocol,
2887                         'protocol': protocol,
2888                     })
2889         return formats
2890
2891     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
2892         mobj = re.search(
2893             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
2894             webpage)
2895         if mobj:
2896             try:
2897                 jwplayer_data = self._parse_json(mobj.group('options'),
2898                                                  video_id=video_id,
2899                                                  transform_source=transform_source)
2900             except ExtractorError:
2901                 pass
2902             else:
2903                 if isinstance(jwplayer_data, dict):
2904                     return jwplayer_data
2905
2906     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2907         jwplayer_data = self._find_jwplayer_data(
2908             webpage, video_id, transform_source=js_to_json)
2909         return self._parse_jwplayer_data(
2910             jwplayer_data, video_id, *args, **kwargs)
2911
2912     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
2913                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2914         # JWPlayer backward compatibility: flattened playlists
2915         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
2916         if 'playlist' not in jwplayer_data:
2917             jwplayer_data = {'playlist': [jwplayer_data]}
2918
2919         entries = []
2920
2921         # JWPlayer backward compatibility: single playlist item
2922         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
2923         if not isinstance(jwplayer_data['playlist'], list):
2924             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
2925
2926         for video_data in jwplayer_data['playlist']:
2927             # JWPlayer backward compatibility: flattened sources
2928             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
2929             if 'sources' not in video_data:
2930                 video_data['sources'] = [video_data]
2931
2932             this_video_id = video_id or video_data['mediaid']
2933
2934             formats = self._parse_jwplayer_formats(
2935                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
2936                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
2937
2938             subtitles = {}
2939             tracks = video_data.get('tracks')
2940             if tracks and isinstance(tracks, list):
2941                 for track in tracks:
2942                     if not isinstance(track, dict):
2943                         continue
2944                     track_kind = track.get('kind')
2945                     if not track_kind or not isinstance(track_kind, compat_str):
2946                         continue
2947                     if track_kind.lower() not in ('captions', 'subtitles'):
2948                         continue
2949                     track_url = urljoin(base_url, track.get('file'))
2950                     if not track_url:
2951                         continue
2952                     subtitles.setdefault(track.get('label') or 'en', []).append({
2953                         'url': self._proto_relative_url(track_url)
2954                     })
2955
2956             entry = {
2957                 'id': this_video_id,
2958                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
2959                 'description': clean_html(video_data.get('description')),
2960                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
2961                 'timestamp': int_or_none(video_data.get('pubdate')),
2962                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
2963                 'subtitles': subtitles,
2964             }
2965             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
2966             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
2967                 entry.update({
2968                     '_type': 'url_transparent',
2969                     'url': formats[0]['url'],
2970                 })
2971             else:
2972                 self._sort_formats(formats)
2973                 entry['formats'] = formats
2974             entries.append(entry)
2975         if len(entries) == 1:
2976             return entries[0]
2977         else:
2978             return self.playlist_result(entries)
2979
2980     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
2981                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2982         urls = []
2983         formats = []
2984         for source in jwplayer_sources_data:
2985             if not isinstance(source, dict):
2986                 continue
2987             source_url = urljoin(
2988                 base_url, self._proto_relative_url(source.get('file')))
2989             if not source_url or source_url in urls:
2990                 continue
2991             urls.append(source_url)
2992             source_type = source.get('type') or ''
2993             ext = mimetype2ext(source_type) or determine_ext(source_url)
2994             if source_type == 'hls' or ext == 'm3u8':
2995                 formats.extend(self._extract_m3u8_formats(
2996                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
2997                     m3u8_id=m3u8_id, fatal=False))
2998             elif source_type == 'dash' or ext == 'mpd':
2999                 formats.extend(self._extract_mpd_formats(
3000                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3001             elif ext == 'smil':
3002                 formats.extend(self._extract_smil_formats(
3003                     source_url, video_id, fatal=False))
3004             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3005             elif source_type.startswith('audio') or ext in (
3006                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3007                 formats.append({
3008                     'url': source_url,
3009                     'vcodec': 'none',
3010                     'ext': ext,
3011                 })
3012             else:
3013                 height = int_or_none(source.get('height'))
3014                 if height is None:
3015                     # Often no height is provided but there is a label in
3016                     # format like "1080p", "720p SD", or 1080.
3017                     height = int_or_none(self._search_regex(
3018                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3019                         'height', default=None))
3020                 a_format = {
3021                     'url': source_url,
3022                     'width': int_or_none(source.get('width')),
3023                     'height': height,
3024                     'tbr': int_or_none(source.get('bitrate')),
3025                     'ext': ext,
3026                 }
3027                 if source_url.startswith('rtmp'):
3028                     a_format['ext'] = 'flv'
3029                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3030                     # of jwplayer.flash.swf
3031                     rtmp_url_parts = re.split(
3032                         r'((?:mp4|mp3|flv):)', source_url, 1)
3033                     if len(rtmp_url_parts) == 3:
3034                         rtmp_url, prefix, play_path = rtmp_url_parts
3035                         a_format.update({
3036                             'url': rtmp_url,
3037                             'play_path': prefix + play_path,
3038                         })
3039                     if rtmp_params:
3040                         a_format.update(rtmp_params)
3041                 formats.append(a_format)
3042         return formats
3043
3044     def _live_title(self, name):
3045         """ Generate the title for a live video """
3046         now = datetime.datetime.now()
3047         now_str = now.strftime('%Y-%m-%d %H:%M')
3048         return name + ' ' + now_str
3049
3050     def _int(self, v, name, fatal=False, **kwargs):
3051         res = int_or_none(v, **kwargs)
3052         if 'get_attr' in kwargs:
3053             print(getattr(v, kwargs['get_attr']))
3054         if res is None:
3055             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3056             if fatal:
3057                 raise ExtractorError(msg)
3058             else:
3059                 self._downloader.report_warning(msg)
3060         return res
3061
3062     def _float(self, v, name, fatal=False, **kwargs):
3063         res = float_or_none(v, **kwargs)
3064         if res is None:
3065             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3066             if fatal:
3067                 raise ExtractorError(msg)
3068             else:
3069                 self._downloader.report_warning(msg)
3070         return res
3071
3072     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3073                     path='/', secure=False, discard=False, rest={}, **kwargs):
3074         cookie = compat_cookiejar_Cookie(
3075             0, name, value, port, port is not None, domain, True,
3076             domain.startswith('.'), path, True, secure, expire_time,
3077             discard, None, None, rest)
3078         self._downloader.cookiejar.set_cookie(cookie)
3079
3080     def _get_cookies(self, url):
3081         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
3082         req = sanitized_Request(url)
3083         self._downloader.cookiejar.add_cookie_header(req)
3084         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
3085
3086     def _apply_first_set_cookie_header(self, url_handle, cookie):
3087         """
3088         Apply first Set-Cookie header instead of the last. Experimental.
3089
3090         Some sites (e.g. [1-3]) may serve two cookies under the same name
3091         in Set-Cookie header and expect the first (old) one to be set rather
3092         than second (new). However, as of RFC6265 the newer one cookie
3093         should be set into cookie store what actually happens.
3094         We will workaround this issue by resetting the cookie to
3095         the first one manually.
3096         1. https://new.vk.com/
3097         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3098         3. https://learning.oreilly.com/
3099         """
3100         for header, cookies in url_handle.headers.items():
3101             if header.lower() != 'set-cookie':
3102                 continue
3103             if sys.version_info[0] >= 3:
3104                 cookies = cookies.encode('iso-8859-1')
3105             cookies = cookies.decode('utf-8')
3106             cookie_value = re.search(
3107                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3108             if cookie_value:
3109                 value, domain = cookie_value.groups()
3110                 self._set_cookie(domain, cookie, value)
3111                 break
3112
3113     def get_testcases(self, include_onlymatching=False):
3114         t = getattr(self, '_TEST', None)
3115         if t:
3116             assert not hasattr(self, '_TESTS'), \
3117                 '%s has _TEST and _TESTS' % type(self).__name__
3118             tests = [t]
3119         else:
3120             tests = getattr(self, '_TESTS', [])
3121         for t in tests:
3122             if not include_onlymatching and t.get('only_matching', False):
3123                 continue
3124             t['name'] = type(self).__name__[:-len('IE')]
3125             yield t
3126
3127     def is_suitable(self, age_limit):
3128         """ Test whether the extractor is generally suitable for the given
3129         age limit (i.e. pornographic sites are not, all others usually are) """
3130
3131         any_restricted = False
3132         for tc in self.get_testcases(include_onlymatching=False):
3133             if tc.get('playlist', []):
3134                 tc = tc['playlist'][0]
3135             is_restricted = age_restricted(
3136                 tc.get('info_dict', {}).get('age_limit'), age_limit)
3137             if not is_restricted:
3138                 return True
3139             any_restricted = any_restricted or is_restricted
3140         return not any_restricted
3141
3142     def extract_subtitles(self, *args, **kwargs):
3143         if (self._downloader.params.get('writesubtitles', False)
3144                 or self._downloader.params.get('listsubtitles')):
3145             return self._get_subtitles(*args, **kwargs)
3146         return {}
3147
3148     def _get_subtitles(self, *args, **kwargs):
3149         raise NotImplementedError('This method must be implemented by subclasses')
3150
3151     @staticmethod
3152     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3153         """ Merge subtitle items for one language. Items with duplicated URLs
3154         will be dropped. """
3155         list1_urls = set([item['url'] for item in subtitle_list1])
3156         ret = list(subtitle_list1)
3157         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
3158         return ret
3159
3160     @classmethod
3161     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
3162         """ Merge two subtitle dictionaries, language by language. """
3163         ret = dict(subtitle_dict1)
3164         for lang in subtitle_dict2:
3165             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
3166         return ret
3167
3168     def extract_automatic_captions(self, *args, **kwargs):
3169         if (self._downloader.params.get('writeautomaticsub', False)
3170                 or self._downloader.params.get('listsubtitles')):
3171             return self._get_automatic_captions(*args, **kwargs)
3172         return {}
3173
3174     def _get_automatic_captions(self, *args, **kwargs):
3175         raise NotImplementedError('This method must be implemented by subclasses')
3176
3177     def mark_watched(self, *args, **kwargs):
3178         if (self._downloader.params.get('mark_watched', False)
3179                 and (self._get_login_info()[0] is not None
3180                      or self._downloader.params.get('cookiefile') is not None)):
3181             self._mark_watched(*args, **kwargs)
3182
3183     def _mark_watched(self, *args, **kwargs):
3184         raise NotImplementedError('This method must be implemented by subclasses')
3185
3186     def geo_verification_headers(self):
3187         headers = {}
3188         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
3189         if geo_verification_proxy:
3190             headers['Ytdl-request-proxy'] = geo_verification_proxy
3191         return headers
3192
3193     def _generic_id(self, url):
3194         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3195
3196     def _generic_title(self, url):
3197         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3198
3199
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search query extractors.

    Handled "URLs" have the form ``<_SEARCH_KEY><prefix>:<query>``, where
    the prefix is empty (one result), a positive integer (that many
    results) or ``all`` (up to _MAX_RESULTS results).
    Subclasses should define _SEARCH_KEY and _MAX_RESULTS, and implement
    _get_n_results.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex source matching this extractor's search queries."""
        query_pattern = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return query_pattern % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether *url* is a search query for this extractor."""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the search query and delegate to _get_n_results."""
        parsed = re.match(self._make_valid_url(), query)
        if parsed is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = parsed.group('prefix')
        query = parsed.group('query')

        # An empty prefix asks for a single result, 'all' for the maximum.
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            # Defensive check: the default pattern only admits positive
            # numbers, so this should not trigger unless it is changed.
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            # Clamp oversized requests to the maximum and tell the user.
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        """Read-only public access to _SEARCH_KEY."""
        return self._SEARCH_KEY