youtube_dl/extractor/common.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import base64
   5 import datetime
   6 import hashlib
   7 import json
   8 import netrc
   9 import os
  10 import random
  11 import re
  12 import socket
  13 import sys
  14 import time
  15 import math
  16
  17 from ..compat import (
  18     compat_cookiejar,
  19     compat_cookies,
  20     compat_etree_Element,
  21     compat_etree_fromstring,
  22     compat_getpass,
  23     compat_integer_types,
  24     compat_http_client,
  25     compat_os_name,
  26     compat_str,
  27     compat_urllib_error,
  28     compat_urllib_parse_unquote,
  29     compat_urllib_parse_urlencode,
  30     compat_urllib_request,
  31     compat_urlparse,
  32     compat_xml_parse_error,
  33 )
  34 from ..downloader.f4m import (
  35     get_base_url,
  36     remove_encrypted_media,
  37 )
  38 from ..utils import (
  39     NO_DEFAULT,
  40     age_restricted,
  41     base_url,
  42     bug_reports_message,
  43     clean_html,
  44     compiled_regex_type,
  45     determine_ext,
  46     determine_protocol,
  47     error_to_compat_str,
  48     ExtractorError,
  49     extract_attributes,
  50     fix_xml_ampersands,
  51     float_or_none,
  52     GeoRestrictedError,
  53     GeoUtils,
  54     int_or_none,
  55     js_to_json,
  56     JSON_LD_RE,
  57     mimetype2ext,
  58     orderedSet,
  59     parse_codecs,
  60     parse_duration,
  61     parse_iso8601,
  62     parse_m3u8_attributes,
  63     RegexNotFoundError,
  64     sanitized_Request,
  65     sanitize_filename,
  66     unescapeHTML,
  67     unified_strdate,
  68     unified_timestamp,
  69     update_Request,
  70     update_url_query,
  71     urljoin,
  72     url_basename,
  73     url_or_none,
  74     xpath_element,
  75     xpath_text,
  76     xpath_with_ns,
  77 )
  78
  79
  80 class InfoExtractor(object):
  81     """Information Extractor class.
  82
  83     Information extractors are the classes that, given a URL, extract
  84     information about the video (or videos) the URL refers to. This
  85     information includes the real video URL, the video title, author and
  86     others. The information is stored in a dictionary which is then
  87     passed to the YoutubeDL. The YoutubeDL processes this
  88     information possibly downloading the video to the file system, among
  89     other possible outcomes.
  90
  91     The type field determines the type of the result.
  92     By far the most common value (and the default if _type is missing) is
  93     "video", which indicates a single video.
  94
  95     For a video, the dictionaries must include the following fields:
  96
  97     id:             Video identifier.
  98     title:          Video title, unescaped.
  99
 100     Additionally, it must contain either a formats entry or a url one:
 101
 102     formats:        A list of dictionaries for each format available, ordered
 103                     from worst to best quality.
 104
 105                     Potential fields:
 106                     * url        The mandatory URL representing the media:
 107                                    for plain file media - HTTP URL of this file,
 108                                    for RTMP - RTMP URL,
 109                                    for HLS - URL of the M3U8 media playlist,
 110                                    for HDS - URL of the F4M manifest,
 111                                    for DASH - URL of the MPD manifest or
 112                                               base URL representing the media
 113                                               if MPD manifest is parsed from
 114                                               a string,
 115                                    for MSS - URL of the ISM manifest.
 116                     * manifest_url
 117                                  The URL of the manifest file in case of
 118                                  fragmented media:
 119                                    for HLS - URL of the M3U8 master playlist,
 120                                    for HDS - URL of the F4M manifest,
 121                                    for DASH - URL of the MPD manifest,
 122                                    for MSS - URL of the ISM manifest.
 123                     * ext        Will be calculated from URL if missing
 124                     * format     A human-readable description of the format
 125                                  ("mp4 container with h264/opus").
 126                                  Calculated from the format_id, width, height.
 127                                  and format_note fields if missing.
 128                     * format_id  A short description of the format
 129                                  ("mp4_h264_opus" or "19").
 130                                 Technically optional, but strongly recommended.
 131                     * format_note Additional info about the format
 132                                  ("3D" or "DASH video")
 133                     * width      Width of the video, if known
 134                     * height     Height of the video, if known
 135                     * resolution Textual description of width and height
 136                     * tbr        Average bitrate of audio and video in KBit/s
 137                     * abr        Average audio bitrate in KBit/s
 138                     * acodec     Name of the audio codec in use
 139                     * asr        Audio sampling rate in Hertz
 140                     * vbr        Average video bitrate in KBit/s
 141                     * fps        Frame rate
 142                     * vcodec     Name of the video codec in use
 143                     * container  Name of the container format
 144                     * filesize   The number of bytes, if known in advance
 145                     * filesize_approx  An estimate for the number of bytes
 146                     * player_url SWF Player URL (used for rtmpdump).
 147                     * protocol   The protocol that will be used for the actual
 148                                  download, lower-case.
 149                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 150                                  "m3u8", "m3u8_native" or "http_dash_segments".
 151                     * fragment_base_url
 152                                  Base URL for fragments. Each fragment's path
 153                                  value (if present) will be relative to
 154                                  this URL.
 155                     * fragments  A list of fragments of a fragmented media.
 156                                  Each fragment entry must contain either an url
 157                                  or a path. If an url is present it should be
 158                                  considered by a client. Otherwise both path and
 159                                  fragment_base_url must be present. Here is
 160                                  the list of all potential fields:
 161                                  * "url" - fragment's URL
 162                                  * "path" - fragment's path relative to
 163                                             fragment_base_url
 164                                  * "duration" (optional, int or float)
 165                                  * "filesize" (optional, int)
 166                     * preference Order number of this format. If this field is
 167                                  present and not None, the formats get sorted
 168                                  by this field, regardless of all other values.
 169                                  -1 for default (order by other properties),
 170                                  -2 or smaller for less than default.
 171                                  < -1000 to hide the format (if there is
 172                                     another one which is strictly better)
 173                     * language   Language code, e.g. "de" or "en-US".
 174                     * language_preference  Is this in the language mentioned in
 175                                  the URL?
 176                                  10 if it's what the URL is about,
 177                                  -1 for default (don't know),
 178                                  -10 otherwise, other values reserved for now.
 179                     * quality    Order number of the video quality of this
 180                                  format, irrespective of the file format.
 181                                  -1 for default (order by other properties),
 182                                  -2 or smaller for less than default.
 183                     * source_preference  Order number for this video source
 184                                   (quality takes higher priority)
 185                                  -1 for default (order by other properties),
 186                                  -2 or smaller for less than default.
 187                     * http_headers  A dictionary of additional HTTP headers
 188                                  to add to the request.
 189                     * stretched_ratio  If given and not 1, indicates that the
 190                                  video's pixels are not square.
 191                                  width : height ratio as float.
 192                     * no_resume  The server does not support resuming the
 193                                  (HTTP or RTMP) download. Boolean.
 194                     * downloader_options  A dictionary of downloader options as
 195                                  described in FileDownloader
 196
 197     url:            Final video URL.
 198     ext:            Video filename extension.
 199     format:         The video format, defaults to ext (used for --get-format)
 200     player_url:     SWF Player URL (used for rtmpdump).
 201
 202     The following fields are optional:
 203
 204     alt_title:      A secondary title of the video.
 205     display_id      An alternative identifier for the video, not necessarily
 206                     unique, but available before title. Typically, id is
 207                     something like "4234987", title "Dancing naked mole rats",
 208                     and display_id "dancing-naked-mole-rats"
 209     thumbnails:     A list of dictionaries, with the following entries:
 210                         * "id" (optional, string) - Thumbnail format ID
 211                         * "url"
 212                         * "preference" (optional, int) - quality of the image
 213                         * "width" (optional, int)
 214                         * "height" (optional, int)
 215                         * "resolution" (optional, string "{width}x{height"},
 216                                         deprecated)
 217                         * "filesize" (optional, int)
 218     thumbnail:      Full URL to a video thumbnail image.
 219     description:    Full video description.
 220     uploader:       Full name of the video uploader.
 221     license:        License name the video is licensed under.
 222     creator:        The creator of the video.
 223     release_date:   The date (YYYYMMDD) when the video was released.
 224     timestamp:      UNIX timestamp of the moment the video became available.
 225     upload_date:    Video upload date (YYYYMMDD).
 226                     If not explicitly set, calculated from timestamp.
 227     uploader_id:    Nickname or id of the video uploader.
 228     uploader_url:   Full URL to a personal webpage of the video uploader.
 229     channel:        Full name of the channel the video is uploaded on.
 230                     Note that channel fields may or may not repeat uploader
 231                     fields. This depends on a particular extractor.
 232     channel_id:     Id of the channel.
 233     channel_url:    Full URL to a channel webpage.
 234     location:       Physical location where the video was filmed.
 235     subtitles:      The available subtitles as a dictionary in the format
 236                     {tag: subformats}. "tag" is usually a language code, and
 237                     "subformats" is a list sorted from lower to higher
 238                     preference, each element is a dictionary with the "ext"
 239                     entry and one of:
 240                         * "data": The subtitles file contents
 241                         * "url": A URL pointing to the subtitles file
 242                     "ext" will be calculated from URL if missing
 243     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 244                     automatically generated captions
 245     duration:       Length of the video in seconds, as an integer or float.
 246     view_count:     How many users have watched the video on the platform.
 247     like_count:     Number of positive ratings of the video
 248     dislike_count:  Number of negative ratings of the video
 249     repost_count:   Number of reposts of the video
 250     average_rating: Average rating give by users, the scale used depends on the webpage
 251     comment_count:  Number of comments on the video
 252     comments:       A list of comments, each with one or more of the following
 253                     properties (all but one of text or html optional):
 254                         * "author" - human-readable name of the comment author
 255                         * "author_id" - user ID of the comment author
 256                         * "id" - Comment ID
 257                         * "html" - Comment as HTML
 258                         * "text" - Plain text of the comment
 259                         * "timestamp" - UNIX timestamp of comment
 260                         * "parent" - ID of the comment this one is replying to.
 261                                      Set to "root" to indicate that this is a
 262                                      comment to the original video.
 263     age_limit:      Age restriction for the video, as an integer (years)
 264     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 265                     should allow to get the same result again. (It will be set
 266                     by YoutubeDL if it's missing)
 267     categories:     A list of categories that the video falls in, for example
 268                     ["Sports", "Berlin"]
 269     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 270     is_live:        True, False, or None (=unknown). Whether this video is a
 271                     live stream that goes on instead of a fixed-length video.
 272     start_time:     Time in seconds where the reproduction should start, as
 273                     specified in the URL.
 274     end_time:       Time in seconds where the reproduction should end, as
 275                     specified in the URL.
 276     chapters:       A list of dictionaries, with the following entries:
 277                         * "start_time" - The start time of the chapter in seconds
 278                         * "end_time" - The end time of the chapter in seconds
 279                         * "title" (optional, string)
 280
 281     The following fields should only be used when the video belongs to some logical
 282     chapter or section:
 283
 284     chapter:        Name or title of the chapter the video belongs to.
 285     chapter_number: Number of the chapter the video belongs to, as an integer.
 286     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 287
 288     The following fields should only be used when the video is an episode of some
 289     series, programme or podcast:
 290
 291     series:         Title of the series or programme the video episode belongs to.
 292     season:         Title of the season the video episode belongs to.
 293     season_number:  Number of the season the video episode belongs to, as an integer.
 294     season_id:      Id of the season the video episode belongs to, as a unicode string.
 295     episode:        Title of the video episode. Unlike mandatory video title field,
 296                     this field should denote the exact title of the video episode
 297                     without any kind of decoration.
 298     episode_number: Number of the video episode within a season, as an integer.
 299     episode_id:     Id of the video episode, as a unicode string.
 300
 301     The following fields should only be used when the media is a track or a part of
 302     a music album:
 303
 304     track:          Title of the track.
 305     track_number:   Number of the track within an album or a disc, as an integer.
 306     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 307                     as a unicode string.
 308     artist:         Artist(s) of the track.
 309     genre:          Genre(s) of the track.
 310     album:          Title of the album the track belongs to.
 311     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 312     album_artist:   List of all artists appeared on the album (e.g.
 313                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 314                     and compilations).
 315     disc_number:    Number of the disc or other physical medium the track belongs to,
 316                     as an integer.
 317     release_year:   Year (YYYY) when the album was released.
 318
 319     Unless mentioned otherwise, the fields should be Unicode strings.
 320
 321     Unless mentioned otherwise, None is equivalent to absence of information.
 322
 323
 324     _type "playlist" indicates multiple videos.
 325     There must be a key "entries", which is a list, an iterable, or a PagedList
 326     object, each element of which is a valid dictionary by this specification.
 327
 328     Additionally, playlists can have "id", "title", "description", "uploader",
 329     "uploader_id", "uploader_url" attributes with the same semantics as videos
 330     (see above).
 331
 332
 333     _type "multi_video" indicates that there are multiple videos that
 334     form a single show, for examples multiple acts of an opera or TV episode.
 335     It must have an entries key like a playlist and contain all the keys
 336     required for a video at the same time.
 337
 338
 339     _type "url" indicates that the video must be extracted from another
 340     location, possibly by a different extractor. Its only required key is:
 341     "url" - the next URL to extract.
 342     The key "ie_key" can be set to the class name (minus the trailing "IE",
 343     e.g. "Youtube") if the extractor class is known in advance.
 344     Additionally, the dictionary may have any properties of the resolved entity
 345     known in advance, for example "title" if the title of the referred video is
 346     known ahead of time.
 347
 348
 349     _type "url_transparent" entities have the same specification as "url", but
 350     indicate that the given additional information is more precise than the one
 351     associated with the resolved URL.
 352     This is useful when a site employs a video service that hosts the video and
 353     its technical metadata, but that video service does not embed a useful
 354     title, description etc.
 355
 356
 357     Subclasses of this one should re-define the _real_initialize() and
 358     _real_extract() methods and define a _VALID_URL regexp.
 359     Probably, they should also be added to the list of extractors.
 360
 361     _GEO_BYPASS attribute may be set to False in order to disable
 362     geo restriction bypass mechanisms for a particular extractor.
 363     Though it won't disable explicit geo restriction bypass based on
 364     country code provided with geo_bypass_country.
 365
 366     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 367     countries for this extractor. One of these countries will be used by
 368     geo restriction bypass mechanism right away in order to bypass
 369     geo restriction, of course, if the mechanism is not disabled.
 370
 371     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 372     IP blocks in CIDR notation for this extractor. One of these IP blocks
 373     will be used by geo restriction bypass mechanism similarly
 374     to _GEO_COUNTRIES.
 375
 376     Finally, the _WORKING attribute should be set to False for broken IEs
 377     in order to warn the users and skip the tests.
 378     """
 379
 380     _ready = False
 381     _downloader = None
 382     _x_forwarded_for_ip = None
 383     _GEO_BYPASS = True
 384     _GEO_COUNTRIES = None
 385     _GEO_IP_BLOCKS = None
 386     _WORKING = True
 387
 388     def __init__(self, downloader=None):
 389         """Constructor. Receives an optional downloader."""
 390         self._ready = False
 391         self._x_forwarded_for_ip = None
 392         self.set_downloader(downloader)
 393
 394     @classmethod
 395     def suitable(cls, url):
 396         """Receives a URL and returns True if suitable for this IE."""
 397
 398         # This does not use has/getattr intentionally - we want to know whether
 399         # we have cached the regexp for *this* class, whereas getattr would also
 400         # match the superclass
 401         if '_VALID_URL_RE' not in cls.__dict__:
 402             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 403         return cls._VALID_URL_RE.match(url) is not None
 404
 405     @classmethod
 406     def _match_id(cls, url):
 407         if '_VALID_URL_RE' not in cls.__dict__:
 408             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 409         m = cls._VALID_URL_RE.match(url)
 410         assert m
 411         return compat_str(m.group('id'))
 412
 413     @classmethod
 414     def working(cls):
 415         """Getter method for _WORKING."""
 416         return cls._WORKING
 417
 418     def initialize(self):
 419         """Initializes an instance (authentication, etc)."""
 420         self._initialize_geo_bypass({
 421             'countries': self._GEO_COUNTRIES,
 422             'ip_blocks': self._GEO_IP_BLOCKS,
 423         })
 424         if not self._ready:
 425             self._real_initialize()
 426             self._ready = True
 427
 428     def _initialize_geo_bypass(self, geo_bypass_context):
 429         """
 430         Initialize geo restriction bypass mechanism.
 431
 432         This method is used to initialize geo bypass mechanism based on faking
 433         X-Forwarded-For HTTP header. A random country from provided country list
 434         is selected and a random IP belonging to this country is generated. This
 435         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 436         HTTP requests.
 437
 438         This method will be used for initial geo bypass mechanism initialization
 439         during the instance initialization with _GEO_COUNTRIES and
 440         _GEO_IP_BLOCKS.
 441
 442         You may also manually call it from extractor's code if geo bypass
 443         information is not available beforehand (e.g. obtained during
 444         extraction) or due to some other reason. In this case you should pass
 445         this information in geo bypass context passed as first argument. It may
 446         contain following fields:
 447
 448         countries:  List of geo unrestricted countries (similar
 449                     to _GEO_COUNTRIES)
 450         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 451                     (similar to _GEO_IP_BLOCKS)
 452
 453         """
 454         if not self._x_forwarded_for_ip:
 455
 456             # Geo bypass mechanism is explicitly disabled by user
 457             if not self._downloader.params.get('geo_bypass', True):
 458                 return
 459
 460             if not geo_bypass_context:
 461                 geo_bypass_context = {}
 462
 463             # Backward compatibility: previously _initialize_geo_bypass
 464             # expected a list of countries, some 3rd party code may still use
 465             # it this way
 466             if isinstance(geo_bypass_context, (list, tuple)):
 467                 geo_bypass_context = {
 468                     'countries': geo_bypass_context,
 469                 }
 470
 471             # The whole point of geo bypass mechanism is to fake IP
 472             # as X-Forwarded-For HTTP header based on some IP block or
 473             # country code.
 474
 475             # Path 1: bypassing based on IP block in CIDR notation
 476
 477             # Explicit IP block specified by user, use it right away
 478             # regardless of whether extractor is geo bypassable or not
 479             ip_block = self._downloader.params.get('geo_bypass_ip_block', None)
 480
 481             # Otherwise use random IP block from geo bypass context but only
 482             # if extractor is known as geo bypassable
 483             if not ip_block:
 484                 ip_blocks = geo_bypass_context.get('ip_blocks')
 485                 if self._GEO_BYPASS and ip_blocks:
 486                     ip_block = random.choice(ip_blocks)
 487
 488             if ip_block:
 489                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 490                 if self._downloader.params.get('verbose', False):
 491                     self._downloader.to_screen(
 492                         '[debug] Using fake IP %s as X-Forwarded-For.'
 493                         % self._x_forwarded_for_ip)
 494                 return
 495
 496             # Path 2: bypassing based on country code
 497
 498             # Explicit country code specified by user, use it right away
 499             # regardless of whether extractor is geo bypassable or not
 500             country = self._downloader.params.get('geo_bypass_country', None)
 501
 502             # Otherwise use random country code from geo bypass context but
 503             # only if extractor is known as geo bypassable
 504             if not country:
 505                 countries = geo_bypass_context.get('countries')
 506                 if self._GEO_BYPASS and countries:
 507                     country = random.choice(countries)
 508
 509             if country:
 510                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 511                 if self._downloader.params.get('verbose', False):
 512                     self._downloader.to_screen(
 513                         '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
 514                         % (self._x_forwarded_for_ip, country.upper()))
 515
 516     def extract(self, url):
 517         """Extracts URL information and returns it in list of dicts."""
 518         try:
 519             for _ in range(2):
 520                 try:
 521                     self.initialize()
 522                     ie_result = self._real_extract(url)
 523                     if self._x_forwarded_for_ip:
 524                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 525                     return ie_result
 526                 except GeoRestrictedError as e:
 527                     if self.__maybe_fake_ip_and_retry(e.countries):
 528                         continue
 529                     raise
 530         except ExtractorError:
 531             raise
 532         except compat_http_client.IncompleteRead as e:
 533             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 534         except (KeyError, StopIteration) as e:
 535             raise ExtractorError('An extractor error has occurred.', cause=e)
 536
 537     def __maybe_fake_ip_and_retry(self, countries):
 538         if (not self._downloader.params.get('geo_bypass_country', None) and
 539                 self._GEO_BYPASS and
 540                 self._downloader.params.get('geo_bypass', True) and
 541                 not self._x_forwarded_for_ip and
 542                 countries):
 543             country_code = random.choice(countries)
 544             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 545             if self._x_forwarded_for_ip:
 546                 self.report_warning(
 547                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 548                     % (self._x_forwarded_for_ip, country_code.upper()))
 549                 return True
 550         return False
 551
 552     def set_downloader(self, downloader):
 553         """Sets the downloader for this IE."""
 554         self._downloader = downloader
 555
 556     def _real_initialize(self):
 557         """Real initialization process. Redefine in subclasses."""
 558         pass
 559
 560     def _real_extract(self, url):
 561         """Real extraction process. Redefine in subclasses."""
 562         pass
 563
 564     @classmethod
 565     def ie_key(cls):
 566         """A string for getting the InfoExtractor with get_info_extractor"""
 567         return compat_str(cls.__name__[:-2])
 568
 569     @property
 570     def IE_NAME(self):
 571         return compat_str(type(self).__name__[:-2])
 572
 573     @staticmethod
 574     def __can_accept_status_code(err, expected_status):
 575         assert isinstance(err, compat_urllib_error.HTTPError)
 576         if expected_status is None:
 577             return False
 578         if isinstance(expected_status, compat_integer_types):
 579             return err.code == expected_status
 580         elif isinstance(expected_status, (list, tuple)):
 581             return err.code in expected_status
 582         elif callable(expected_status):
 583             return expected_status(err.code) is True
 584         else:
 585             assert False
 586
 587     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
 588         """
 589         Return the response handle.
 590
 591         See _download_webpage docstring for arguments specification.
 592         """
 593         if note is None:
 594             self.report_download_webpage(video_id)
 595         elif note is not False:
 596             if video_id is None:
 597                 self.to_screen('%s' % (note,))
 598             else:
 599                 self.to_screen('%s: %s' % (video_id, note))
 600
 601         # Some sites check X-Forwarded-For HTTP header in order to figure out
 602         # the origin of the client behind proxy. This allows bypassing geo
 603         # restriction by faking this header's value to IP that belongs to some
 604         # geo unrestricted country. We will do so once we encounter any
 605         # geo restriction error.
 606         if self._x_forwarded_for_ip:
 607             if 'X-Forwarded-For' not in headers:
 608                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 609
 610         if isinstance(url_or_request, compat_urllib_request.Request):
 611             url_or_request = update_Request(
 612                 url_or_request, data=data, headers=headers, query=query)
 613         else:
 614             if query:
 615                 url_or_request = update_url_query(url_or_request, query)
 616             if data is not None or headers:
 617                 url_or_request = sanitized_Request(url_or_request, data, headers)
 618         try:
 619             return self._downloader.urlopen(url_or_request)
 620         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 621             if isinstance(err, compat_urllib_error.HTTPError):
 622                 if self.__can_accept_status_code(err, expected_status):
 623                     # Retain reference to error to prevent file object from
 624                     # being closed before it can be read. Works around the
 625                     # effects of <https://bugs.python.org/issue15002>
 626                     # introduced in Python 3.4.1.
 627                     err.fp._error = err
 628                     return err.fp
 629
 630             if errnote is False:
 631                 return False
 632             if errnote is None:
 633                 errnote = 'Unable to download webpage'
 634
 635             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 636             if fatal:
 637                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 638             else:
 639                 self._downloader.report_warning(errmsg)
 640                 return False
 641
 642     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
 643         """
 644         Return a tuple (page content as string, URL handle).
 645
 646         See _download_webpage docstring for arguments specification.
 647         """
 648         # Strip hashes from the URL (#1038)
 649         if isinstance(url_or_request, (compat_str, str)):
 650             url_or_request = url_or_request.partition('#')[0]
 651
 652         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 653         if urlh is False:
 654             assert not fatal
 655             return False
 656         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 657         return (content, urlh)
 658
 659     @staticmethod
 660     def _guess_encoding_from_content(content_type, webpage_bytes):
 661         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 662         if m:
 663             encoding = m.group(1)
 664         else:
 665             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 666                           webpage_bytes[:1024])
 667             if m:
 668                 encoding = m.group(1).decode('ascii')
 669             elif webpage_bytes.startswith(b'\xff\xfe'):
 670                 encoding = 'utf-16'
 671             else:
 672                 encoding = 'utf-8'
 673
 674         return encoding
 675
 676     def __check_blocked(self, content):
 677         first_block = content[:512]
 678         if ('<title>Access to this site is blocked</title>' in content and
 679                 'Websense' in first_block):
 680             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 681             blocked_iframe = self._html_search_regex(
 682                 r'<iframe src="([^"]+)"', content,
 683                 'Websense information URL', default=None)
 684             if blocked_iframe:
 685                 msg += ' Visit %s for more details' % blocked_iframe
 686             raise ExtractorError(msg, expected=True)
 687         if '<title>The URL you requested has been blocked</title>' in first_block:
 688             msg = (
 689                 'Access to this webpage has been blocked by Indian censorship. '
 690                 'Use a VPN or proxy server (with --proxy) to route around it.')
 691             block_msg = self._html_search_regex(
 692                 r'</h1><p>(.*?)</p>',
 693                 content, 'block message', default=None)
 694             if block_msg:
 695                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 696             raise ExtractorError(msg, expected=True)
 697         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
 698                 'blocklist.rkn.gov.ru' in content):
 699             raise ExtractorError(
 700                 'Access to this webpage has been blocked by decision of the Russian government. '
 701                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 702                 expected=True)
 703
 704     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 705         content_type = urlh.headers.get('Content-Type', '')
 706         webpage_bytes = urlh.read()
 707         if prefix is not None:
 708             webpage_bytes = prefix + webpage_bytes
 709         if not encoding:
 710             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 711         if self._downloader.params.get('dump_intermediate_pages', False):
 712             self.to_screen('Dumping request to ' + urlh.geturl())
 713             dump = base64.b64encode(webpage_bytes).decode('ascii')
 714             self._downloader.to_screen(dump)
 715         if self._downloader.params.get('write_pages', False):
 716             basen = '%s_%s' % (video_id, urlh.geturl())
 717             if len(basen) > 240:
 718                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 719                 basen = basen[:240 - len(h)] + h
 720             raw_filename = basen + '.dump'
 721             filename = sanitize_filename(raw_filename, restricted=True)
 722             self.to_screen('Saving request to ' + filename)
 723             # Working around MAX_PATH limitation on Windows (see
 724             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 725             if compat_os_name == 'nt':
 726                 absfilepath = os.path.abspath(filename)
 727                 if len(absfilepath) > 259:
 728                     filename = '\\\\?\\' + absfilepath
 729             with open(filename, 'wb') as outf:
 730                 outf.write(webpage_bytes)
 731
 732         try:
 733             content = webpage_bytes.decode(encoding, 'replace')
 734         except LookupError:
 735             content = webpage_bytes.decode('utf-8', 'replace')
 736
 737         self.__check_blocked(content)
 738
 739         return content
 740
 741     def _download_webpage(
 742             self, url_or_request, video_id, note=None, errnote=None,
 743             fatal=True, tries=1, timeout=5, encoding=None, data=None,
 744             headers={}, query={}, expected_status=None):
 745         """
 746         Return the data of the page as a string.
 747
 748         Arguments:
 749         url_or_request -- plain text URL as a string or
 750             a compat_urllib_request.Requestobject
 751         video_id -- Video/playlist/item identifier (string)
 752
 753         Keyword arguments:
 754         note -- note printed before downloading (string)
 755         errnote -- note printed in case of an error (string)
 756         fatal -- flag denoting whether error should be considered fatal,
 757             i.e. whether it should cause ExtractionError to be raised,
 758             otherwise a warning will be reported and extraction continued
 759         tries -- number of tries
 760         timeout -- sleep interval between tries
 761         encoding -- encoding for a page content decoding, guessed automatically
 762             when not explicitly specified
 763         data -- POST data (bytes)
 764         headers -- HTTP headers (dict)
 765         query -- URL query (dict)
 766         expected_status -- allows to accept failed HTTP requests (non 2xx
 767             status code) by explicitly specifying a set of accepted status
 768             codes. Can be any of the following entities:
 769                 - an integer type specifying an exact failed status code to
 770                   accept
 771                 - a list or a tuple of integer types specifying a list of
 772                   failed status codes to accept
 773                 - a callable accepting an actual failed status code and
 774                   returning True if it should be accepted
 775             Note that this argument does not affect success status codes (2xx)
 776             which are always accepted.
 777         """
 778
 779         success = False
 780         try_count = 0
 781         while success is False:
 782             try:
 783                 res = self._download_webpage_handle(
 784                     url_or_request, video_id, note, errnote, fatal,
 785                     encoding=encoding, data=data, headers=headers, query=query,
 786                     expected_status=expected_status)
 787                 success = True
 788             except compat_http_client.IncompleteRead as e:
 789                 try_count += 1
 790                 if try_count >= tries:
 791                     raise e
 792                 self._sleep(timeout, video_id)
 793         if res is False:
 794             return res
 795         else:
 796             content, _ = res
 797             return content
 798
 799     def _download_xml_handle(
 800             self, url_or_request, video_id, note='Downloading XML',
 801             errnote='Unable to download XML', transform_source=None,
 802             fatal=True, encoding=None, data=None, headers={}, query={},
 803             expected_status=None):
 804         """
 805         Return a tuple (xml as an compat_etree_Element, URL handle).
 806
 807         See _download_webpage docstring for arguments specification.
 808         """
 809         res = self._download_webpage_handle(
 810             url_or_request, video_id, note, errnote, fatal=fatal,
 811             encoding=encoding, data=data, headers=headers, query=query,
 812             expected_status=expected_status)
 813         if res is False:
 814             return res
 815         xml_string, urlh = res
 816         return self._parse_xml(
 817             xml_string, video_id, transform_source=transform_source,
 818             fatal=fatal), urlh
 819
 820     def _download_xml(
 821             self, url_or_request, video_id,
 822             note='Downloading XML', errnote='Unable to download XML',
 823             transform_source=None, fatal=True, encoding=None,
 824             data=None, headers={}, query={}, expected_status=None):
 825         """
 826         Return the xml as an compat_etree_Element.
 827
 828         See _download_webpage docstring for arguments specification.
 829         """
 830         res = self._download_xml_handle(
 831             url_or_request, video_id, note=note, errnote=errnote,
 832             transform_source=transform_source, fatal=fatal, encoding=encoding,
 833             data=data, headers=headers, query=query,
 834             expected_status=expected_status)
 835         return res if res is False else res[0]
 836
 837     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 838         if transform_source:
 839             xml_string = transform_source(xml_string)
 840         try:
 841             return compat_etree_fromstring(xml_string.encode('utf-8'))
 842         except compat_xml_parse_error as ve:
 843             errmsg = '%s: Failed to parse XML ' % video_id
 844             if fatal:
 845                 raise ExtractorError(errmsg, cause=ve)
 846             else:
 847                 self.report_warning(errmsg + str(ve))
 848
 849     def _download_json_handle(
 850             self, url_or_request, video_id, note='Downloading JSON metadata',
 851             errnote='Unable to download JSON metadata', transform_source=None,
 852             fatal=True, encoding=None, data=None, headers={}, query={},
 853             expected_status=None):
 854         """
 855         Return a tuple (JSON object, URL handle).
 856
 857         See _download_webpage docstring for arguments specification.
 858         """
 859         res = self._download_webpage_handle(
 860             url_or_request, video_id, note, errnote, fatal=fatal,
 861             encoding=encoding, data=data, headers=headers, query=query,
 862             expected_status=expected_status)
 863         if res is False:
 864             return res
 865         json_string, urlh = res
 866         return self._parse_json(
 867             json_string, video_id, transform_source=transform_source,
 868             fatal=fatal), urlh
 869
 870     def _download_json(
 871             self, url_or_request, video_id, note='Downloading JSON metadata',
 872             errnote='Unable to download JSON metadata', transform_source=None,
 873             fatal=True, encoding=None, data=None, headers={}, query={},
 874             expected_status=None):
 875         """
 876         Return the JSON object as a dict.
 877
 878         See _download_webpage docstring for arguments specification.
 879         """
 880         res = self._download_json_handle(
 881             url_or_request, video_id, note=note, errnote=errnote,
 882             transform_source=transform_source, fatal=fatal, encoding=encoding,
 883             data=data, headers=headers, query=query,
 884             expected_status=expected_status)
 885         return res if res is False else res[0]
 886
 887     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 888         if transform_source:
 889             json_string = transform_source(json_string)
 890         try:
 891             return json.loads(json_string)
 892         except ValueError as ve:
 893             errmsg = '%s: Failed to parse JSON ' % video_id
 894             if fatal:
 895                 raise ExtractorError(errmsg, cause=ve)
 896             else:
 897                 self.report_warning(errmsg + str(ve))
 898
 899     def report_warning(self, msg, video_id=None):
 900         idstr = '' if video_id is None else '%s: ' % video_id
 901         self._downloader.report_warning(
 902             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 903
 904     def to_screen(self, msg):
 905         """Print msg to screen, prefixing it with '[ie_name]'"""
 906         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 907
 908     def report_extraction(self, id_or_name):
 909         """Report information extraction."""
 910         self.to_screen('%s: Extracting information' % id_or_name)
 911
 912     def report_download_webpage(self, video_id):
 913         """Report webpage download."""
 914         self.to_screen('%s: Downloading webpage' % video_id)
 915
 916     def report_age_confirmation(self):
 917         """Report attempt to confirm age."""
 918         self.to_screen('Confirming age')
 919
 920     def report_login(self):
 921         """Report attempt to log in."""
 922         self.to_screen('Logging in')
 923
 924     @staticmethod
 925     def raise_login_required(msg='This video is only available for registered users'):
 926         raise ExtractorError(
 927             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 928             expected=True)
 929
 930     @staticmethod
 931     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
 932         raise GeoRestrictedError(msg, countries=countries)
 933
 934     # Methods for following #608
 935     @staticmethod
 936     def url_result(url, ie=None, video_id=None, video_title=None):
 937         """Returns a URL that points to a page that should be processed"""
 938         # TODO: ie should be the class used for getting the info
 939         video_info = {'_type': 'url',
 940                       'url': url,
 941                       'ie_key': ie}
 942         if video_id is not None:
 943             video_info['id'] = video_id
 944         if video_title is not None:
 945             video_info['title'] = video_title
 946         return video_info
 947
 948     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
 949         urls = orderedSet(
 950             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
 951             for m in matches)
 952         return self.playlist_result(
 953             urls, playlist_id=playlist_id, playlist_title=playlist_title)
 954
 955     @staticmethod
 956     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 957         """Returns a playlist"""
 958         video_info = {'_type': 'playlist',
 959                       'entries': entries}
 960         if playlist_id:
 961             video_info['id'] = playlist_id
 962         if playlist_title:
 963             video_info['title'] = playlist_title
 964         if playlist_description:
 965             video_info['description'] = playlist_description
 966         return video_info
 967
 968     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 969         """
 970         Perform a regex search on the given string, using a single or a list of
 971         patterns returning the first matching group.
 972         In case of failure return a default value or raise a WARNING or a
 973         RegexNotFoundError, depending on fatal, specifying the field name.
 974         """
 975         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 976             mobj = re.search(pattern, string, flags)
 977         else:
 978             for p in pattern:
 979                 mobj = re.search(p, string, flags)
 980                 if mobj:
 981                     break
 982
 983         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
 984             _name = '\033[0;34m%s\033[0m' % name
 985         else:
 986             _name = name
 987
 988         if mobj:
 989             if group is None:
 990                 # return the first matching group
 991                 return next(g for g in mobj.groups() if g is not None)
 992             else:
 993                 return mobj.group(group)
 994         elif default is not NO_DEFAULT:
 995             return default
 996         elif fatal:
 997             raise RegexNotFoundError('Unable to extract %s' % _name)
 998         else:
 999             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
1000             return None
1001
1002     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1003         """
1004         Like _search_regex, but strips HTML tags and unescapes entities.
1005         """
1006         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1007         if res:
1008             return clean_html(res).strip()
1009         else:
1010             return res
1011
1012     def _get_netrc_login_info(self, netrc_machine=None):
1013         username = None
1014         password = None
1015         netrc_machine = netrc_machine or self._NETRC_MACHINE
1016
1017         if self._downloader.params.get('usenetrc', False):
1018             try:
1019                 info = netrc.netrc().authenticators(netrc_machine)
1020                 if info is not None:
1021                     username = info[0]
1022                     password = info[2]
1023                 else:
1024                     raise netrc.NetrcParseError(
1025                         'No authenticators for %s' % netrc_machine)
1026             except (IOError, netrc.NetrcParseError) as err:
1027                 self._downloader.report_warning(
1028                     'parsing .netrc: %s' % error_to_compat_str(err))
1029
1030         return username, password
1031
1032     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1033         """
1034         Get the login info as (username, password)
1035         First look for the manually specified credentials using username_option
1036         and password_option as keys in params dictionary. If no such credentials
1037         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1038         value.
1039         If there's no info available, return (None, None)
1040         """
1041         if self._downloader is None:
1042             return (None, None)
1043
1044         downloader_params = self._downloader.params
1045
1046         # Attempt to use provided username and password or .netrc data
1047         if downloader_params.get(username_option) is not None:
1048             username = downloader_params[username_option]
1049             password = downloader_params[password_option]
1050         else:
1051             username, password = self._get_netrc_login_info(netrc_machine)
1052
1053         return username, password
1054
1055     def _get_tfa_info(self, note='two-factor verification code'):
1056         """
1057         Get the two-factor authentication info
1058         TODO - asking the user will be required for sms/phone verify
1059         currently just uses the command line option
1060         If there's no info available, return None
1061         """
1062         if self._downloader is None:
1063             return None
1064         downloader_params = self._downloader.params
1065
1066         if downloader_params.get('twofactor') is not None:
1067             return downloader_params['twofactor']
1068
1069         return compat_getpass('Type %s and press [Return]: ' % note)
1070
1071     # Helper functions for extracting OpenGraph info
1072     @staticmethod
1073     def _og_regexes(prop):
1074         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1075         property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
1076                        % {'prop': re.escape(prop)})
1077         template = r'<meta[^>]+?%s[^>]+?%s'
1078         return [
1079             template % (property_re, content_re),
1080             template % (content_re, property_re),
1081         ]
1082
1083     @staticmethod
1084     def _meta_regex(prop):
1085         return r'''(?isx)<meta
1086                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1087                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1088
1089     def _og_search_property(self, prop, html, name=None, **kargs):
1090         if not isinstance(prop, (list, tuple)):
1091             prop = [prop]
1092         if name is None:
1093             name = 'OpenGraph %s' % prop[0]
1094         og_regexes = []
1095         for p in prop:
1096             og_regexes.extend(self._og_regexes(p))
1097         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1098         if escaped is None:
1099             return None
1100         return unescapeHTML(escaped)
1101
1102     def _og_search_thumbnail(self, html, **kargs):
1103         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1104
1105     def _og_search_description(self, html, **kargs):
1106         return self._og_search_property('description', html, fatal=False, **kargs)
1107
1108     def _og_search_title(self, html, **kargs):
1109         return self._og_search_property('title', html, **kargs)
1110
1111     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1112         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1113         if secure:
1114             regexes = self._og_regexes('video:secure_url') + regexes
1115         return self._html_search_regex(regexes, html, name, **kargs)
1116
1117     def _og_search_url(self, html, **kargs):
1118         return self._og_search_property('url', html, **kargs)
1119
1120     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1121         if not isinstance(name, (list, tuple)):
1122             name = [name]
1123         if display_name is None:
1124             display_name = name[0]
1125         return self._html_search_regex(
1126             [self._meta_regex(n) for n in name],
1127             html, display_name, fatal=fatal, group='content', **kwargs)
1128
1129     def _dc_search_uploader(self, html):
1130         return self._html_search_meta('dc.creator', html, 'uploader')
1131
1132     def _rta_search(self, html):
1133         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1134         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1135                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1136                      html):
1137             return 18
1138         return 0
1139
1140     def _media_rating_search(self, html):
1141         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1142         rating = self._html_search_meta('rating', html)
1143
1144         if not rating:
1145             return None
1146
1147         RATING_TABLE = {
1148             'safe for kids': 0,
1149             'general': 8,
1150             '14 years': 14,
1151             'mature': 17,
1152             'restricted': 19,
1153         }
1154         return RATING_TABLE.get(rating.lower())
1155
1156     def _family_friendly_search(self, html):
1157         # See http://schema.org/VideoObject
1158         family_friendly = self._html_search_meta(
1159             'isFamilyFriendly', html, default=None)
1160
1161         if not family_friendly:
1162             return None
1163
1164         RATING_TABLE = {
1165             '1': 0,
1166             'true': 0,
1167             '0': 18,
1168             'false': 18,
1169         }
1170         return RATING_TABLE.get(family_friendly.lower())
1171
1172     def _twitter_search_player(self, html):
1173         return self._html_search_meta('twitter:player', html,
1174                                       'twitter card player')
1175
1176     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1177         json_ld = self._search_regex(
1178             JSON_LD_RE, html, 'JSON-LD', group='json_ld', **kwargs)
1179         default = kwargs.get('default', NO_DEFAULT)
1180         if not json_ld:
1181             return default if default is not NO_DEFAULT else {}
1182         # JSON-LD may be malformed and thus `fatal` should be respected.
1183         # At the same time `default` may be passed that assumes `fatal=False`
1184         # for _search_regex. Let's simulate the same behavior here as well.
1185         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
1186         return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1187
1188     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1189         if isinstance(json_ld, compat_str):
1190             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1191         if not json_ld:
1192             return {}
1193         info = {}
1194         if not isinstance(json_ld, (list, tuple, dict)):
1195             return info
1196         if isinstance(json_ld, dict):
1197             json_ld = [json_ld]
1198
1199         INTERACTION_TYPE_MAP = {
1200             'CommentAction': 'comment',
1201             'AgreeAction': 'like',
1202             'DisagreeAction': 'dislike',
1203             'LikeAction': 'like',
1204             'DislikeAction': 'dislike',
1205             'ListenAction': 'view',
1206             'WatchAction': 'view',
1207             'ViewAction': 'view',
1208         }
1209
1210         def extract_interaction_statistic(e):
1211             interaction_statistic = e.get('interactionStatistic')
1212             if not isinstance(interaction_statistic, list):
1213                 return
1214             for is_e in interaction_statistic:
1215                 if not isinstance(is_e, dict):
1216                     continue
1217                 if is_e.get('@type') != 'InteractionCounter':
1218                     continue
1219                 interaction_type = is_e.get('interactionType')
1220                 if not isinstance(interaction_type, compat_str):
1221                     continue
1222                 interaction_count = int_or_none(is_e.get('userInteractionCount'))
1223                 if interaction_count is None:
1224                     continue
1225                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1226                 if not count_kind:
1227                     continue
1228                 count_key = '%s_count' % count_kind
1229                 if info.get(count_key) is not None:
1230                     continue
1231                 info[count_key] = interaction_count
1232
1233         def extract_video_object(e):
1234             assert e['@type'] == 'VideoObject'
1235             info.update({
1236                 'url': url_or_none(e.get('contentUrl')),
1237                 'title': unescapeHTML(e.get('name')),
1238                 'description': unescapeHTML(e.get('description')),
1239                 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
1240                 'duration': parse_duration(e.get('duration')),
1241                 'timestamp': unified_timestamp(e.get('uploadDate')),
1242                 'filesize': float_or_none(e.get('contentSize')),
1243                 'tbr': int_or_none(e.get('bitrate')),
1244                 'width': int_or_none(e.get('width')),
1245                 'height': int_or_none(e.get('height')),
1246                 'view_count': int_or_none(e.get('interactionCount')),
1247             })
1248             extract_interaction_statistic(e)
1249
1250         for e in json_ld:
1251             if isinstance(e.get('@context'), compat_str) and re.match(r'^https?://schema.org/?$', e.get('@context')):
1252                 item_type = e.get('@type')
1253                 if expected_type is not None and expected_type != item_type:
1254                     return info
1255                 if item_type in ('TVEpisode', 'Episode'):
1256                     episode_name = unescapeHTML(e.get('name'))
1257                     info.update({
1258                         'episode': episode_name,
1259                         'episode_number': int_or_none(e.get('episodeNumber')),
1260                         'description': unescapeHTML(e.get('description')),
1261                     })
1262                     if not info.get('title') and episode_name:
1263                         info['title'] = episode_name
1264                     part_of_season = e.get('partOfSeason')
1265                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1266                         info.update({
1267                             'season': unescapeHTML(part_of_season.get('name')),
1268                             'season_number': int_or_none(part_of_season.get('seasonNumber')),
1269                         })
1270                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1271                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1272                         info['series'] = unescapeHTML(part_of_series.get('name'))
1273                 elif item_type == 'Movie':
1274                     info.update({
1275                         'title': unescapeHTML(e.get('name')),
1276                         'description': unescapeHTML(e.get('description')),
1277                         'duration': parse_duration(e.get('duration')),
1278                         'timestamp': unified_timestamp(e.get('dateCreated')),
1279                     })
1280                 elif item_type in ('Article', 'NewsArticle'):
1281                     info.update({
1282                         'timestamp': parse_iso8601(e.get('datePublished')),
1283                         'title': unescapeHTML(e.get('headline')),
1284                         'description': unescapeHTML(e.get('articleBody')),
1285                     })
1286                 elif item_type == 'VideoObject':
1287                     extract_video_object(e)
1288                     continue
1289                 video = e.get('video')
1290                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1291                     extract_video_object(video)
1292                 break
1293         return dict((k, v) for k, v in info.items() if v is not None)
1294
1295     @staticmethod
1296     def _hidden_inputs(html):
1297         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1298         hidden_inputs = {}
1299         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1300             attrs = extract_attributes(input)
1301             if not input:
1302                 continue
1303             if attrs.get('type') not in ('hidden', 'submit'):
1304                 continue
1305             name = attrs.get('name') or attrs.get('id')
1306             value = attrs.get('value')
1307             if name and value is not None:
1308                 hidden_inputs[name] = value
1309         return hidden_inputs
1310
1311     def _form_hidden_inputs(self, form_id, html):
1312         form = self._search_regex(
1313             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1314             html, '%s form' % form_id, group='form')
1315         return self._hidden_inputs(form)
1316
1317     def _sort_formats(self, formats, field_preference=None):
1318         if not formats:
1319             raise ExtractorError('No video formats found')
1320
1321         for f in formats:
1322             # Automatically determine tbr when missing based on abr and vbr (improves
1323             # formats sorting in some cases)
1324             if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
1325                 f['tbr'] = f['abr'] + f['vbr']
1326
1327         def _formats_key(f):
1328             # TODO remove the following workaround
1329             from ..utils import determine_ext
1330             if not f.get('ext') and 'url' in f:
1331                 f['ext'] = determine_ext(f['url'])
1332
1333             if isinstance(field_preference, (list, tuple)):
1334                 return tuple(
1335                     f.get(field)
1336                     if f.get(field) is not None
1337                     else ('' if field == 'format_id' else -1)
1338                     for field in field_preference)
1339
1340             preference = f.get('preference')
1341             if preference is None:
1342                 preference = 0
1343                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
1344                     preference -= 0.5
1345
1346             protocol = f.get('protocol') or determine_protocol(f)
1347             proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)
1348
1349             if f.get('vcodec') == 'none':  # audio only
1350                 preference -= 50
1351                 if self._downloader.params.get('prefer_free_formats'):
1352                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
1353                 else:
1354                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
1355                 ext_preference = 0
1356                 try:
1357                     audio_ext_preference = ORDER.index(f['ext'])
1358                 except ValueError:
1359                     audio_ext_preference = -1
1360             else:
1361                 if f.get('acodec') == 'none':  # video only
1362                     preference -= 40
1363                 if self._downloader.params.get('prefer_free_formats'):
1364                     ORDER = ['flv', 'mp4', 'webm']
1365                 else:
1366                     ORDER = ['webm', 'flv', 'mp4']
1367                 try:
1368                     ext_preference = ORDER.index(f['ext'])
1369                 except ValueError:
1370                     ext_preference = -1
1371                 audio_ext_preference = 0
1372
1373             return (
1374                 preference,
1375                 f.get('language_preference') if f.get('language_preference') is not None else -1,
1376                 f.get('quality') if f.get('quality') is not None else -1,
1377                 f.get('tbr') if f.get('tbr') is not None else -1,
1378                 f.get('filesize') if f.get('filesize') is not None else -1,
1379                 f.get('vbr') if f.get('vbr') is not None else -1,
1380                 f.get('height') if f.get('height') is not None else -1,
1381                 f.get('width') if f.get('width') is not None else -1,
1382                 proto_preference,
1383                 ext_preference,
1384                 f.get('abr') if f.get('abr') is not None else -1,
1385                 audio_ext_preference,
1386                 f.get('fps') if f.get('fps') is not None else -1,
1387                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
1388                 f.get('source_preference') if f.get('source_preference') is not None else -1,
1389                 f.get('format_id') if f.get('format_id') is not None else '',
1390             )
1391         formats.sort(key=_formats_key)
1392
1393     def _check_formats(self, formats, video_id):
1394         if formats:
1395             formats[:] = filter(
1396                 lambda f: self._is_valid_url(
1397                     f['url'], video_id,
1398                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1399                 formats)
1400
1401     @staticmethod
1402     def _remove_duplicate_formats(formats):
1403         format_urls = set()
1404         unique_formats = []
1405         for f in formats:
1406             if f['url'] not in format_urls:
1407                 format_urls.add(f['url'])
1408                 unique_formats.append(f)
1409         formats[:] = unique_formats
1410
1411     def _is_valid_url(self, url, video_id, item='video', headers={}):
1412         url = self._proto_relative_url(url, scheme='http:')
1413         # For now assume non HTTP(S) URLs always valid
1414         if not (url.startswith('http://') or url.startswith('https://')):
1415             return True
1416         try:
1417             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1418             return True
1419         except ExtractorError as e:
1420             if isinstance(e.cause, compat_urllib_error.URLError):
1421                 self.to_screen(
1422                     '%s: %s URL is invalid, skipping' % (video_id, item))
1423                 return False
1424             raise
1425
1426     def http_scheme(self):
1427         """ Either "http:" or "https:", depending on the user's preferences """
1428         return (
1429             'http:'
1430             if self._downloader.params.get('prefer_insecure', False)
1431             else 'https:')
1432
1433     def _proto_relative_url(self, url, scheme=None):
1434         if url is None:
1435             return url
1436         if url.startswith('//'):
1437             if scheme is None:
1438                 scheme = self.http_scheme()
1439             return scheme + url
1440         else:
1441             return url
1442
1443     def _sleep(self, timeout, video_id, msg_template=None):
1444         if msg_template is None:
1445             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1446         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1447         self.to_screen(msg)
1448         time.sleep(timeout)
1449
1450     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1451                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1452                              fatal=True, m3u8_id=None):
1453         manifest = self._download_xml(
1454             manifest_url, video_id, 'Downloading f4m manifest',
1455             'Unable to download f4m manifest',
1456             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1457             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1458             transform_source=transform_source,
1459             fatal=fatal)
1460
1461         if manifest is False:
1462             return []
1463
1464         return self._parse_f4m_formats(
1465             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1466             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1467
1468     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1469                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1470                            fatal=True, m3u8_id=None):
1471         if not isinstance(manifest, compat_etree_Element) and not fatal:
1472             return []
1473
1474         # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1475         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1476         if akamai_pv is not None and ';' in akamai_pv.text:
1477             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1478             if playerVerificationChallenge.strip() != '':
1479                 return []
1480
1481         formats = []
1482         manifest_version = '1.0'
1483         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1484         if not media_nodes:
1485             manifest_version = '2.0'
1486             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1487         # Remove unsupported DRM protected media from final formats
1488         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1489         media_nodes = remove_encrypted_media(media_nodes)
1490         if not media_nodes:
1491             return formats
1492
1493         manifest_base_url = get_base_url(manifest)
1494
1495         bootstrap_info = xpath_element(
1496             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1497             'bootstrap info', default=None)
1498
1499         vcodec = None
1500         mime_type = xpath_text(
1501             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1502             'base URL', default=None)
1503         if mime_type and mime_type.startswith('audio/'):
1504             vcodec = 'none'
1505
1506         for i, media_el in enumerate(media_nodes):
1507             tbr = int_or_none(media_el.attrib.get('bitrate'))
1508             width = int_or_none(media_el.attrib.get('width'))
1509             height = int_or_none(media_el.attrib.get('height'))
1510             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1511             # If <bootstrapInfo> is present, the specified f4m is a
1512             # stream-level manifest, and only set-level manifests may refer to
1513             # external resources.  See section 11.4 and section 4 of F4M spec
1514             if bootstrap_info is None:
1515                 media_url = None
1516                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1517                 if manifest_version == '2.0':
1518                     media_url = media_el.attrib.get('href')
1519                 if media_url is None:
1520                     media_url = media_el.attrib.get('url')
1521                 if not media_url:
1522                     continue
1523                 manifest_url = (
1524                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1525                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1526                 # If media_url is itself a f4m manifest do the recursive extraction
1527                 # since bitrates in parent manifest (this one) and media_url manifest
1528                 # may differ leading to inability to resolve the format by requested
1529                 # bitrate in f4m downloader
1530                 ext = determine_ext(manifest_url)
1531                 if ext == 'f4m':
1532                     f4m_formats = self._extract_f4m_formats(
1533                         manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1534                         transform_source=transform_source, fatal=fatal)
1535                     # Sometimes stream-level manifest contains single media entry that
1536                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1537                     # At the same time parent's media entry in set-level manifest may
1538                     # contain it. We will copy it from parent in such cases.
1539                     if len(f4m_formats) == 1:
1540                         f = f4m_formats[0]
1541                         f.update({
1542                             'tbr': f.get('tbr') or tbr,
1543                             'width': f.get('width') or width,
1544                             'height': f.get('height') or height,
1545                             'format_id': f.get('format_id') if not tbr else format_id,
1546                             'vcodec': vcodec,
1547                         })
1548                     formats.extend(f4m_formats)
1549                     continue
1550                 elif ext == 'm3u8':
1551                     formats.extend(self._extract_m3u8_formats(
1552                         manifest_url, video_id, 'mp4', preference=preference,
1553                         m3u8_id=m3u8_id, fatal=fatal))
1554                     continue
1555             formats.append({
1556                 'format_id': format_id,
1557                 'url': manifest_url,
1558                 'manifest_url': manifest_url,
1559                 'ext': 'flv' if bootstrap_info is not None else None,
1560                 'protocol': 'f4m',
1561                 'tbr': tbr,
1562                 'width': width,
1563                 'height': height,
1564                 'vcodec': vcodec,
1565                 'preference': preference,
1566             })
1567         return formats
1568
1569     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1570         return {
1571             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1572             'url': m3u8_url,
1573             'ext': ext,
1574             'protocol': 'm3u8',
1575             'preference': preference - 100 if preference else -100,
1576             'resolution': 'multiple',
1577             'format_note': 'Quality selection URL',
1578         }
1579
1580     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1581                               entry_protocol='m3u8', preference=None,
1582                               m3u8_id=None, note=None, errnote=None,
1583                               fatal=True, live=False):
1584         res = self._download_webpage_handle(
1585             m3u8_url, video_id,
1586             note=note or 'Downloading m3u8 information',
1587             errnote=errnote or 'Failed to download m3u8 information',
1588             fatal=fatal)
1589
1590         if res is False:
1591             return []
1592
1593         m3u8_doc, urlh = res
1594         m3u8_url = urlh.geturl()
1595
1596         return self._parse_m3u8_formats(
1597             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1598             preference=preference, m3u8_id=m3u8_id, live=live)
1599
1600     def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
1601                             entry_protocol='m3u8', preference=None,
1602                             m3u8_id=None, live=False):
1603         if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
1604             return []
1605
1606         if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc):  # Apple FairPlay
1607             return []
1608
1609         formats = []
1610
1611         format_url = lambda u: (
1612             u
1613             if re.match(r'^https?://', u)
1614             else compat_urlparse.urljoin(m3u8_url, u))
1615
1616         # References:
1617         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
1618         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
1619         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
1620
1621         # We should try extracting formats only from master playlists [1, 4.3.4],
1622         # i.e. playlists that describe available qualities. On the other hand
1623         # media playlists [1, 4.3.3] should be returned as is since they contain
1624         # just the media without qualities renditions.
1625         # Fortunately, master playlist can be easily distinguished from media
1626         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
1627         # master playlist tags MUST NOT appear in a media playist and vice versa.
1628         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
1629         # media playlist and MUST NOT appear in master playlist thus we can
1630         # clearly detect media playlist with this criterion.
1631
1632         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
1633             return [{
1634                 'url': m3u8_url,
1635                 'format_id': m3u8_id,
1636                 'ext': ext,
1637                 'protocol': entry_protocol,
1638                 'preference': preference,
1639             }]
1640
1641         groups = {}
1642         last_stream_inf = {}
1643
1644         def extract_media(x_media_line):
1645             media = parse_m3u8_attributes(x_media_line)
1646             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
1647             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
1648             if not (media_type and group_id and name):
1649                 return
1650             groups.setdefault(group_id, []).append(media)
1651             if media_type not in ('VIDEO', 'AUDIO'):
1652                 return
1653             media_url = media.get('URI')
1654             if media_url:
1655                 format_id = []
1656                 for v in (m3u8_id, group_id, name):
1657                     if v:
1658                         format_id.append(v)
1659                 f = {
1660                     'format_id': '-'.join(format_id),
1661                     'url': format_url(media_url),
1662                     'manifest_url': m3u8_url,
1663                     'language': media.get('LANGUAGE'),
1664                     'ext': ext,
1665                     'protocol': entry_protocol,
1666                     'preference': preference,
1667                 }
1668                 if media_type == 'AUDIO':
1669                     f['vcodec'] = 'none'
1670                 formats.append(f)
1671
1672         def build_stream_name():
1673             # Despite specification does not mention NAME attribute for
1674             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
1675             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
1676             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
1677             stream_name = last_stream_inf.get('NAME')
1678             if stream_name:
1679                 return stream_name
1680             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
1681             # from corresponding rendition group
1682             stream_group_id = last_stream_inf.get('VIDEO')
1683             if not stream_group_id:
1684                 return
1685             stream_group = groups.get(stream_group_id)
1686             if not stream_group:
1687                 return stream_group_id
1688             rendition = stream_group[0]
1689             return rendition.get('NAME') or stream_group_id
1690
1691         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
1692         # chance to detect video only formats when EXT-X-STREAM-INF tags
1693         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
1694         for line in m3u8_doc.splitlines():
1695             if line.startswith('#EXT-X-MEDIA:'):
1696                 extract_media(line)
1697
1698         for line in m3u8_doc.splitlines():
1699             if line.startswith('#EXT-X-STREAM-INF:'):
1700                 last_stream_inf = parse_m3u8_attributes(line)
1701             elif line.startswith('#') or not line.strip():
1702                 continue
1703             else:
1704                 tbr = float_or_none(
1705                     last_stream_inf.get('AVERAGE-BANDWIDTH') or
1706                     last_stream_inf.get('BANDWIDTH'), scale=1000)
1707                 format_id = []
1708                 if m3u8_id:
1709                     format_id.append(m3u8_id)
1710                 stream_name = build_stream_name()
1711                 # Bandwidth of live streams may differ over time thus making
1712                 # format_id unpredictable. So it's better to keep provided
1713                 # format_id intact.
1714                 if not live:
1715                     format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
1716                 manifest_url = format_url(line.strip())
1717                 f = {
1718                     'format_id': '-'.join(format_id),
1719                     'url': manifest_url,
1720                     'manifest_url': m3u8_url,
1721                     'tbr': tbr,
1722                     'ext': ext,
1723                     'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
1724                     'protocol': entry_protocol,
1725                     'preference': preference,
1726                 }
1727                 resolution = last_stream_inf.get('RESOLUTION')
1728                 if resolution:
1729                     mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
1730                     if mobj:
1731                         f['width'] = int(mobj.group('width'))
1732                         f['height'] = int(mobj.group('height'))
1733                 # Unified Streaming Platform
1734                 mobj = re.search(
1735                     r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
1736                 if mobj:
1737                     abr, vbr = mobj.groups()
1738                     abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
1739                     f.update({
1740                         'vbr': vbr,
1741                         'abr': abr,
1742                     })
1743                 codecs = parse_codecs(last_stream_inf.get('CODECS'))
1744                 f.update(codecs)
1745                 audio_group_id = last_stream_inf.get('AUDIO')
1746                 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
1747                 # references a rendition group MUST have a CODECS attribute.
1748                 # However, this is not always respected, for example, [2]
1749                 # contains EXT-X-STREAM-INF tag which references AUDIO
1750                 # rendition group but does not have CODECS and despite
1751                 # referencing an audio group it represents a complete
1752                 # (with audio and video) format. So, for such cases we will
1753                 # ignore references to rendition groups and treat them
1754                 # as complete formats.
1755                 if audio_group_id and codecs and f.get('vcodec') != 'none':
1756                     audio_group = groups.get(audio_group_id)
1757                     if audio_group and audio_group[0].get('URI'):
1758                         # TODO: update acodec for audio only formats with
1759                         # the same GROUP-ID
1760                         f['acodec'] = 'none'
1761                 formats.append(f)
1762                 last_stream_inf = {}
1763         return formats
1764
1765     @staticmethod
1766     def _xpath_ns(path, namespace=None):
1767         if not namespace:
1768             return path
1769         out = []
1770         for c in path.split('/'):
1771             if not c or c == '.':
1772                 out.append(c)
1773             else:
1774                 out.append('{%s}%s' % (namespace, c))
1775         return '/'.join(out)
1776
1777     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1778         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1779
1780         if smil is False:
1781             assert not fatal
1782             return []
1783
1784         namespace = self._parse_smil_namespace(smil)
1785
1786         return self._parse_smil_formats(
1787             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1788
1789     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1790         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1791         if smil is False:
1792             return {}
1793         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1794
1795     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1796         return self._download_xml(
1797             smil_url, video_id, 'Downloading SMIL file',
1798             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1799
1800     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1801         namespace = self._parse_smil_namespace(smil)
1802
1803         formats = self._parse_smil_formats(
1804             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1805         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1806
1807         video_id = os.path.splitext(url_basename(smil_url))[0]
1808         title = None
1809         description = None
1810         upload_date = None
1811         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1812             name = meta.attrib.get('name')
1813             content = meta.attrib.get('content')
1814             if not name or not content:
1815                 continue
1816             if not title and name == 'title':
1817                 title = content
1818             elif not description and name in ('description', 'abstract'):
1819                 description = content
1820             elif not upload_date and name == 'date':
1821                 upload_date = unified_strdate(content)
1822
1823         thumbnails = [{
1824             'id': image.get('type'),
1825             'url': image.get('src'),
1826             'width': int_or_none(image.get('width')),
1827             'height': int_or_none(image.get('height')),
1828         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1829
1830         return {
1831             'id': video_id,
1832             'title': title or video_id,
1833             'description': description,
1834             'upload_date': upload_date,
1835             'thumbnails': thumbnails,
1836             'formats': formats,
1837             'subtitles': subtitles,
1838         }
1839
1840     def _parse_smil_namespace(self, smil):
1841         return self._search_regex(
1842             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1843
1844     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1845         base = smil_url
1846         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1847             b = meta.get('base') or meta.get('httpBase')
1848             if b:
1849                 base = b
1850                 break
1851
1852         formats = []
1853         rtmp_count = 0
1854         http_count = 0
1855         m3u8_count = 0
1856
1857         srcs = []
1858         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1859         for medium in media:
1860             src = medium.get('src')
1861             if not src or src in srcs:
1862                 continue
1863             srcs.append(src)
1864
1865             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1866             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1867             width = int_or_none(medium.get('width'))
1868             height = int_or_none(medium.get('height'))
1869             proto = medium.get('proto')
1870             ext = medium.get('ext')
1871             src_ext = determine_ext(src)
1872             streamer = medium.get('streamer') or base
1873
1874             if proto == 'rtmp' or streamer.startswith('rtmp'):
1875                 rtmp_count += 1
1876                 formats.append({
1877                     'url': streamer,
1878                     'play_path': src,
1879                     'ext': 'flv',
1880                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1881                     'tbr': bitrate,
1882                     'filesize': filesize,
1883                     'width': width,
1884                     'height': height,
1885                 })
1886                 if transform_rtmp_url:
1887                     streamer, src = transform_rtmp_url(streamer, src)
1888                     formats[-1].update({
1889                         'url': streamer,
1890                         'play_path': src,
1891                     })
1892                 continue
1893
1894             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1895             src_url = src_url.strip()
1896
1897             if proto == 'm3u8' or src_ext == 'm3u8':
1898                 m3u8_formats = self._extract_m3u8_formats(
1899                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1900                 if len(m3u8_formats) == 1:
1901                     m3u8_count += 1
1902                     m3u8_formats[0].update({
1903                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1904                         'tbr': bitrate,
1905                         'width': width,
1906                         'height': height,
1907                     })
1908                 formats.extend(m3u8_formats)
1909             elif src_ext == 'f4m':
1910                 f4m_url = src_url
1911                 if not f4m_params:
1912                     f4m_params = {
1913                         'hdcore': '3.2.0',
1914                         'plugin': 'flowplayer-3.2.0.1',
1915                     }
1916                 f4m_url += '&' if '?' in f4m_url else '?'
1917                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1918                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1919             elif src_ext == 'mpd':
1920                 formats.extend(self._extract_mpd_formats(
1921                     src_url, video_id, mpd_id='dash', fatal=False))
1922             elif re.search(r'\.ism/[Mm]anifest', src_url):
1923                 formats.extend(self._extract_ism_formats(
1924                     src_url, video_id, ism_id='mss', fatal=False))
1925             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
1926                 http_count += 1
1927                 formats.append({
1928                     'url': src_url,
1929                     'ext': ext or src_ext or 'flv',
1930                     'format_id': 'http-%d' % (bitrate or http_count),
1931                     'tbr': bitrate,
1932                     'filesize': filesize,
1933                     'width': width,
1934                     'height': height,
1935                 })
1936
1937         return formats
1938
1939     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1940         urls = []
1941         subtitles = {}
1942         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1943             src = textstream.get('src')
1944             if not src or src in urls:
1945                 continue
1946             urls.append(src)
1947             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1948             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1949             subtitles.setdefault(lang, []).append({
1950                 'url': src,
1951                 'ext': ext,
1952             })
1953         return subtitles
1954
1955     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
1956         xspf = self._download_xml(
1957             xspf_url, playlist_id, 'Downloading xpsf playlist',
1958             'Unable to download xspf manifest', fatal=fatal)
1959         if xspf is False:
1960             return []
1961         return self._parse_xspf(
1962             xspf, playlist_id, xspf_url=xspf_url,
1963             xspf_base_url=base_url(xspf_url))
1964
1965     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
1966         NS_MAP = {
1967             'xspf': 'http://xspf.org/ns/0/',
1968             's1': 'http://static.streamone.nl/player/ns/0',
1969         }
1970
1971         entries = []
1972         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1973             title = xpath_text(
1974                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1975             description = xpath_text(
1976                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1977             thumbnail = xpath_text(
1978                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1979             duration = float_or_none(
1980                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1981
1982             formats = []
1983             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
1984                 format_url = urljoin(xspf_base_url, location.text)
1985                 if not format_url:
1986                     continue
1987                 formats.append({
1988                     'url': format_url,
1989                     'manifest_url': xspf_url,
1990                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1991                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1992                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1993                 })
1994             self._sort_formats(formats)
1995
1996             entries.append({
1997                 'id': playlist_id,
1998                 'title': title,
1999                 'description': description,
2000                 'thumbnail': thumbnail,
2001                 'duration': duration,
2002                 'formats': formats,
2003             })
2004         return entries
2005
2006     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
2007         res = self._download_xml_handle(
2008             mpd_url, video_id,
2009             note=note or 'Downloading MPD manifest',
2010             errnote=errnote or 'Failed to download MPD manifest',
2011             fatal=fatal)
2012         if res is False:
2013             return []
2014         mpd_doc, urlh = res
2015         mpd_base_url = base_url(urlh.geturl())
2016
2017         return self._parse_mpd_formats(
2018             mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url,
2019             formats_dict=formats_dict, mpd_url=mpd_url)
2020
2021     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
2022         """
2023         Parse formats from MPD manifest.
2024         References:
2025          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2026             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2027          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2028         """
2029         if mpd_doc.get('type') == 'dynamic':
2030             return []
2031
2032         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2033
2034         def _add_ns(path):
2035             return self._xpath_ns(path, namespace)
2036
2037         def is_drm_protected(element):
2038             return element.find(_add_ns('ContentProtection')) is not None
2039
2040         def extract_multisegment_info(element, ms_parent_info):
2041             ms_info = ms_parent_info.copy()
2042
2043             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2044             # common attributes and elements.  We will only extract relevant
2045             # for us.
2046             def extract_common(source):
2047                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2048                 if segment_timeline is not None:
2049                     s_e = segment_timeline.findall(_add_ns('S'))
2050                     if s_e:
2051                         ms_info['total_number'] = 0
2052                         ms_info['s'] = []
2053                         for s in s_e:
2054                             r = int(s.get('r', 0))
2055                             ms_info['total_number'] += 1 + r
2056                             ms_info['s'].append({
2057                                 't': int(s.get('t', 0)),
2058                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2059                                 'd': int(s.attrib['d']),
2060                                 'r': r,
2061                             })
2062                 start_number = source.get('startNumber')
2063                 if start_number:
2064                     ms_info['start_number'] = int(start_number)
2065                 timescale = source.get('timescale')
2066                 if timescale:
2067                     ms_info['timescale'] = int(timescale)
2068                 segment_duration = source.get('duration')
2069                 if segment_duration:
2070                     ms_info['segment_duration'] = float(segment_duration)
2071
2072             def extract_Initialization(source):
2073                 initialization = source.find(_add_ns('Initialization'))
2074                 if initialization is not None:
2075                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2076
2077             segment_list = element.find(_add_ns('SegmentList'))
2078             if segment_list is not None:
2079                 extract_common(segment_list)
2080                 extract_Initialization(segment_list)
2081                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2082                 if segment_urls_e:
2083                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2084             else:
2085                 segment_template = element.find(_add_ns('SegmentTemplate'))
2086                 if segment_template is not None:
2087                     extract_common(segment_template)
2088                     media = segment_template.get('media')
2089                     if media:
2090                         ms_info['media'] = media
2091                     initialization = segment_template.get('initialization')
2092                     if initialization:
2093                         ms_info['initialization'] = initialization
2094                     else:
2095                         extract_Initialization(segment_template)
2096             return ms_info
2097
2098         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2099         formats = []
2100         for period in mpd_doc.findall(_add_ns('Period')):
2101             period_duration = parse_duration(period.get('duration')) or mpd_duration
2102             period_ms_info = extract_multisegment_info(period, {
2103                 'start_number': 1,
2104                 'timescale': 1,
2105             })
2106             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2107                 if is_drm_protected(adaptation_set):
2108                     continue
2109                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2110                 for representation in adaptation_set.findall(_add_ns('Representation')):
2111                     if is_drm_protected(representation):
2112                         continue
2113                     representation_attrib = adaptation_set.attrib.copy()
2114                     representation_attrib.update(representation.attrib)
2115                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2116                     mime_type = representation_attrib['mimeType']
2117                     content_type = mime_type.split('/')[0]
2118                     if content_type == 'text':
2119                         # TODO implement WebVTT downloading
2120                         pass
2121                     elif content_type in ('video', 'audio'):
2122                         base_url = ''
2123                         for element in (representation, adaptation_set, period, mpd_doc):
2124                             base_url_e = element.find(_add_ns('BaseURL'))
2125                             if base_url_e is not None:
2126                                 base_url = base_url_e.text + base_url
2127                                 if re.match(r'^https?://', base_url):
2128                                     break
2129                         if mpd_base_url and not re.match(r'^https?://', base_url):
2130                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
2131                                 mpd_base_url += '/'
2132                             base_url = mpd_base_url + base_url
2133                         representation_id = representation_attrib.get('id')
2134                         lang = representation_attrib.get('lang')
2135                         url_el = representation.find(_add_ns('BaseURL'))
2136                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2137                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2138                         f = {
2139                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
2140                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
2141                             'url': mpd_url or base_url,
2142                             'manifest_url': mpd_url,
2143                             'ext': mimetype2ext(mime_type),
2144                             'width': int_or_none(representation_attrib.get('width')),
2145                             'height': int_or_none(representation_attrib.get('height')),
2146                             'tbr': float_or_none(bandwidth, 1000),
2147                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2148                             'fps': int_or_none(representation_attrib.get('frameRate')),
2149                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2150                             'format_note': 'DASH %s' % content_type,
2151                             'filesize': filesize,
2152                             'container': mimetype2ext(mime_type) + '_dash',
2153                         }
2154                         f.update(parse_codecs(representation_attrib.get('codecs')))
2155                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2156
2157                         def prepare_template(template_name, identifiers):
2158                             tmpl = representation_ms_info[template_name]
2159                             # First of, % characters outside $...$ templates
2160                             # must be escaped by doubling for proper processing
2161                             # by % operator string formatting used further (see
2162                             # https://github.com/ytdl-org/youtube-dl/issues/16867).
2163                             t = ''
2164                             in_template = False
2165                             for c in tmpl:
2166                                 t += c
2167                                 if c == '$':
2168                                     in_template = not in_template
2169                                 elif c == '%' and not in_template:
2170                                     t += c
2171                             # Next, $...$ templates are translated to their
2172                             # %(...) counterparts to be used with % operator
2173                             t = t.replace('$RepresentationID$', representation_id)
2174                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2175                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2176                             t.replace('$$', '$')
2177                             return t
2178
2179                         # @initialization is a regular template like @media one
2180                         # so it should be handled just the same way (see
2181                         # https://github.com/ytdl-org/youtube-dl/issues/11605)
2182                         if 'initialization' in representation_ms_info:
2183                             initialization_template = prepare_template(
2184                                 'initialization',
2185                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2186                                 # $Time$ shall not be included for @initialization thus
2187                                 # only $Bandwidth$ remains
2188                                 ('Bandwidth', ))
2189                             representation_ms_info['initialization_url'] = initialization_template % {
2190                                 'Bandwidth': bandwidth,
2191                             }
2192
2193                         def location_key(location):
2194                             return 'url' if re.match(r'^https?://', location) else 'path'
2195
2196                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2197
2198                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2199                             media_location_key = location_key(media_template)
2200
2201                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2202                             # can't be used at the same time
2203                             if '%(Number' in media_template and 's' not in representation_ms_info:
2204                                 segment_duration = None
2205                                 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2206                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2207                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2208                                 representation_ms_info['fragments'] = [{
2209                                     media_location_key: media_template % {
2210                                         'Number': segment_number,
2211                                         'Bandwidth': bandwidth,
2212                                     },
2213                                     'duration': segment_duration,
2214                                 } for segment_number in range(
2215                                     representation_ms_info['start_number'],
2216                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2217                             else:
2218                                 # $Number*$ or $Time$ in media template with S list available
2219                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2220                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2221                                 representation_ms_info['fragments'] = []
2222                                 segment_time = 0
2223                                 segment_d = None
2224                                 segment_number = representation_ms_info['start_number']
2225
2226                                 def add_segment_url():
2227                                     segment_url = media_template % {
2228                                         'Time': segment_time,
2229                                         'Bandwidth': bandwidth,
2230                                         'Number': segment_number,
2231                                     }
2232                                     representation_ms_info['fragments'].append({
2233                                         media_location_key: segment_url,
2234                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2235                                     })
2236
2237                                 for num, s in enumerate(representation_ms_info['s']):
2238                                     segment_time = s.get('t') or segment_time
2239                                     segment_d = s['d']
2240                                     add_segment_url()
2241                                     segment_number += 1
2242                                     for r in range(s.get('r', 0)):
2243                                         segment_time += segment_d
2244                                         add_segment_url()
2245                                         segment_number += 1
2246                                     segment_time += segment_d
2247                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2248                             # No media template
2249                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2250                             # or any YouTube dashsegments video
2251                             fragments = []
2252                             segment_index = 0
2253                             timescale = representation_ms_info['timescale']
2254                             for s in representation_ms_info['s']:
2255                                 duration = float_or_none(s['d'], timescale)
2256                                 for r in range(s.get('r', 0) + 1):
2257                                     segment_uri = representation_ms_info['segment_urls'][segment_index]
2258                                     fragments.append({
2259                                         location_key(segment_uri): segment_uri,
2260                                         'duration': duration,
2261                                     })
2262                                     segment_index += 1
2263                             representation_ms_info['fragments'] = fragments
2264                         elif 'segment_urls' in representation_ms_info:
2265                             # Segment URLs with no SegmentTimeline
2266                             # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2267                             # https://github.com/ytdl-org/youtube-dl/pull/14844
2268                             fragments = []
2269                             segment_duration = float_or_none(
2270                                 representation_ms_info['segment_duration'],
2271                                 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2272                             for segment_url in representation_ms_info['segment_urls']:
2273                                 fragment = {
2274                                     location_key(segment_url): segment_url,
2275                                 }
2276                                 if segment_duration:
2277                                     fragment['duration'] = segment_duration
2278                                 fragments.append(fragment)
2279                             representation_ms_info['fragments'] = fragments
2280                         # NB: MPD manifest may contain direct URLs to unfragmented media.
2281                         # No fragments key is present in this case.
2282                         if 'fragments' in representation_ms_info:
2283                             f.update({
2284                                 'fragment_base_url': base_url,
2285                                 'fragments': [],
2286                                 'protocol': 'http_dash_segments',
2287                             })
2288                             if 'initialization_url' in representation_ms_info:
2289                                 initialization_url = representation_ms_info['initialization_url']
2290                                 if not f.get('url'):
2291                                     f['url'] = initialization_url
2292                                 f['fragments'].append({location_key(initialization_url): initialization_url})
2293                             f['fragments'].extend(representation_ms_info['fragments'])
2294                         # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation
2295                         # is not necessarily unique within a Period thus formats with
2296                         # the same `format_id` are quite possible. There are numerous examples
2297                         # of such manifests (see https://github.com/ytdl-org/youtube-dl/issues/15111,
2298                         # https://github.com/ytdl-org/youtube-dl/issues/13919)
2299                         full_info = formats_dict.get(representation_id, {}).copy()
2300                         full_info.update(f)
2301                         formats.append(full_info)
2302                     else:
2303                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2304         return formats
2305
2306     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
2307         res = self._download_xml_handle(
2308             ism_url, video_id,
2309             note=note or 'Downloading ISM manifest',
2310             errnote=errnote or 'Failed to download ISM manifest',
2311             fatal=fatal)
2312         if res is False:
2313             return []
2314         ism_doc, urlh = res
2315
2316         return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
2317
2318     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
2319         """
2320         Parse formats from ISM manifest.
2321         References:
2322          1. [MS-SSTR]: Smooth Streaming Protocol,
2323             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2324         """
2325         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
2326             return []
2327
2328         duration = int(ism_doc.attrib['Duration'])
2329         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2330
2331         formats = []
2332         for stream in ism_doc.findall('StreamIndex'):
2333             stream_type = stream.get('Type')
2334             if stream_type not in ('video', 'audio'):
2335                 continue
2336             url_pattern = stream.attrib['Url']
2337             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2338             stream_name = stream.get('Name')
2339             for track in stream.findall('QualityLevel'):
2340                 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
2341                 # TODO: add support for WVC1 and WMAP
2342                 if fourcc not in ('H264', 'AVC1', 'AACL'):
2343                     self.report_warning('%s is not a supported codec' % fourcc)
2344                     continue
2345                 tbr = int(track.attrib['Bitrate']) // 1000
2346                 # [1] does not mention Width and Height attributes. However,
2347                 # they're often present while MaxWidth and MaxHeight are
2348                 # missing, so should be used as fallbacks
2349                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2350                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2351                 sampling_rate = int_or_none(track.get('SamplingRate'))
2352
2353                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2354                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2355
2356                 fragments = []
2357                 fragment_ctx = {
2358                     'time': 0,
2359                 }
2360                 stream_fragments = stream.findall('c')
2361                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2362                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2363                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2364                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2365                     if not fragment_ctx['duration']:
2366                         try:
2367                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2368                         except IndexError:
2369                             next_fragment_time = duration
2370                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2371                     for _ in range(fragment_repeat):
2372                         fragments.append({
2373                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2374                             'duration': fragment_ctx['duration'] / stream_timescale,
2375                         })
2376                         fragment_ctx['time'] += fragment_ctx['duration']
2377
2378                 format_id = []
2379                 if ism_id:
2380                     format_id.append(ism_id)
2381                 if stream_name:
2382                     format_id.append(stream_name)
2383                 format_id.append(compat_str(tbr))
2384
2385                 formats.append({
2386                     'format_id': '-'.join(format_id),
2387                     'url': ism_url,
2388                     'manifest_url': ism_url,
2389                     'ext': 'ismv' if stream_type == 'video' else 'isma',
2390                     'width': width,
2391                     'height': height,
2392                     'tbr': tbr,
2393                     'asr': sampling_rate,
2394                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
2395                     'acodec': 'none' if stream_type == 'video' else fourcc,
2396                     'protocol': 'ism',
2397                     'fragments': fragments,
2398                     '_download_params': {
2399                         'duration': duration,
2400                         'timescale': stream_timescale,
2401                         'width': width or 0,
2402                         'height': height or 0,
2403                         'fourcc': fourcc,
2404                         'codec_private_data': track.get('CodecPrivateData'),
2405                         'sampling_rate': sampling_rate,
2406                         'channels': int_or_none(track.get('Channels', 2)),
2407                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2408                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2409                     },
2410                 })
2411         return formats
2412
2413     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
2414         def absolute_url(item_url):
2415             return urljoin(base_url, item_url)
2416
2417         def parse_content_type(content_type):
2418             if not content_type:
2419                 return {}
2420             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2421             if ctr:
2422                 mimetype, codecs = ctr.groups()
2423                 f = parse_codecs(codecs)
2424                 f['ext'] = mimetype2ext(mimetype)
2425                 return f
2426             return {}
2427
2428         def _media_formats(src, cur_media_type, type_info={}):
2429             full_url = absolute_url(src)
2430             ext = type_info.get('ext') or determine_ext(full_url)
2431             if ext == 'm3u8':
2432                 is_plain_url = False
2433                 formats = self._extract_m3u8_formats(
2434                     full_url, video_id, ext='mp4',
2435                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2436                     preference=preference, fatal=False)
2437             elif ext == 'mpd':
2438                 is_plain_url = False
2439                 formats = self._extract_mpd_formats(
2440                     full_url, video_id, mpd_id=mpd_id, fatal=False)
2441             else:
2442                 is_plain_url = True
2443                 formats = [{
2444                     'url': full_url,
2445                     'vcodec': 'none' if cur_media_type == 'audio' else None,
2446                 }]
2447             return is_plain_url, formats
2448
2449         entries = []
2450         # amp-video and amp-audio are very similar to their HTML5 counterparts
2451         # so we wll include them right here (see
2452         # https://www.ampproject.org/docs/reference/components/amp-video)
2453         media_tags = [(media_tag, media_type, '')
2454                       for media_tag, media_type
2455                       in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
2456         media_tags.extend(re.findall(
2457             # We only allow video|audio followed by a whitespace or '>'.
2458             # Allowing more characters may end up in significant slow down (see
2459             # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
2460             # http://www.porntrex.com/maps/videositemap.xml).
2461             r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
2462         for media_tag, media_type, media_content in media_tags:
2463             media_info = {
2464                 'formats': [],
2465                 'subtitles': {},
2466             }
2467             media_attributes = extract_attributes(media_tag)
2468             src = media_attributes.get('src')
2469             if src:
2470                 _, formats = _media_formats(src, media_type)
2471                 media_info['formats'].extend(formats)
2472             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
2473             if media_content:
2474                 for source_tag in re.findall(r'<source[^>]+>', media_content):
2475                     source_attributes = extract_attributes(source_tag)
2476                     src = source_attributes.get('src')
2477                     if not src:
2478                         continue
2479                     f = parse_content_type(source_attributes.get('type'))
2480                     is_plain_url, formats = _media_formats(src, media_type, f)
2481                     if is_plain_url:
2482                         # res attribute is not standard but seen several times
2483                         # in the wild
2484                         f.update({
2485                             'height': int_or_none(source_attributes.get('res')),
2486                             'format_id': source_attributes.get('label'),
2487                         })
2488                         f.update(formats[0])
2489                         media_info['formats'].append(f)
2490                     else:
2491                         media_info['formats'].extend(formats)
2492                 for track_tag in re.findall(r'<track[^>]+>', media_content):
2493                     track_attributes = extract_attributes(track_tag)
2494                     kind = track_attributes.get('kind')
2495                     if not kind or kind in ('subtitles', 'captions'):
2496                         src = track_attributes.get('src')
2497                         if not src:
2498                             continue
2499                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
2500                         media_info['subtitles'].setdefault(lang, []).append({
2501                             'url': absolute_url(src),
2502                         })
2503             for f in media_info['formats']:
2504                 f.setdefault('http_headers', {})['Referer'] = base_url
2505             if media_info['formats'] or media_info['subtitles']:
2506                 entries.append(media_info)
2507         return entries
2508
2509     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2510         formats = []
2511         hdcore_sign = 'hdcore=3.7.0'
2512         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2513         hds_host = hosts.get('hds')
2514         if hds_host:
2515             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2516         if 'hdcore=' not in f4m_url:
2517             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2518         f4m_formats = self._extract_f4m_formats(
2519             f4m_url, video_id, f4m_id='hds', fatal=False)
2520         for entry in f4m_formats:
2521             entry.update({'extra_param_to_segment_url': hdcore_sign})
2522         formats.extend(f4m_formats)
2523         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2524         hls_host = hosts.get('hls')
2525         if hls_host:
2526             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2527         formats.extend(self._extract_m3u8_formats(
2528             m3u8_url, video_id, 'mp4', 'm3u8_native',
2529             m3u8_id='hls', fatal=False))
2530         return formats
2531
2532     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2533         query = compat_urlparse.urlparse(url).query
2534         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2535         mobj = re.search(
2536             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
2537         url_base = mobj.group('url')
2538         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
2539         formats = []
2540
2541         def manifest_url(manifest):
2542             m_url = '%s/%s' % (http_base_url, manifest)
2543             if query:
2544                 m_url += '?%s' % query
2545             return m_url
2546
2547         if 'm3u8' not in skip_protocols:
2548             formats.extend(self._extract_m3u8_formats(
2549                 manifest_url('playlist.m3u8'), video_id, 'mp4',
2550                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2551         if 'f4m' not in skip_protocols:
2552             formats.extend(self._extract_f4m_formats(
2553                 manifest_url('manifest.f4m'),
2554                 video_id, f4m_id='hds', fatal=False))
2555         if 'dash' not in skip_protocols:
2556             formats.extend(self._extract_mpd_formats(
2557                 manifest_url('manifest.mpd'),
2558                 video_id, mpd_id='dash', fatal=False))
2559         if re.search(r'(?:/smil:|\.smil)', url_base):
2560             if 'smil' not in skip_protocols:
2561                 rtmp_formats = self._extract_smil_formats(
2562                     manifest_url('jwplayer.smil'),
2563                     video_id, fatal=False)
2564                 for rtmp_format in rtmp_formats:
2565                     rtsp_format = rtmp_format.copy()
2566                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2567                     del rtsp_format['play_path']
2568                     del rtsp_format['ext']
2569                     rtsp_format.update({
2570                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2571                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2572                         'protocol': 'rtsp',
2573                     })
2574                     formats.extend([rtmp_format, rtsp_format])
2575         else:
2576             for protocol in ('rtmp', 'rtsp'):
2577                 if protocol not in skip_protocols:
2578                     formats.append({
2579                         'url': '%s:%s' % (protocol, url_base),
2580                         'format_id': protocol,
2581                         'protocol': protocol,
2582                     })
2583         return formats
2584
2585     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
2586         mobj = re.search(
2587             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
2588             webpage)
2589         if mobj:
2590             try:
2591                 jwplayer_data = self._parse_json(mobj.group('options'),
2592                                                  video_id=video_id,
2593                                                  transform_source=transform_source)
2594             except ExtractorError:
2595                 pass
2596             else:
2597                 if isinstance(jwplayer_data, dict):
2598                     return jwplayer_data
2599
2600     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2601         jwplayer_data = self._find_jwplayer_data(
2602             webpage, video_id, transform_source=js_to_json)
2603         return self._parse_jwplayer_data(
2604             jwplayer_data, video_id, *args, **kwargs)
2605
2606     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
2607                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2608         # JWPlayer backward compatibility: flattened playlists
2609         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
2610         if 'playlist' not in jwplayer_data:
2611             jwplayer_data = {'playlist': [jwplayer_data]}
2612
2613         entries = []
2614
2615         # JWPlayer backward compatibility: single playlist item
2616         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
2617         if not isinstance(jwplayer_data['playlist'], list):
2618             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
2619
2620         for video_data in jwplayer_data['playlist']:
2621             # JWPlayer backward compatibility: flattened sources
2622             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
2623             if 'sources' not in video_data:
2624                 video_data['sources'] = [video_data]
2625
2626             this_video_id = video_id or video_data['mediaid']
2627
2628             formats = self._parse_jwplayer_formats(
2629                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
2630                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
2631
2632             subtitles = {}
2633             tracks = video_data.get('tracks')
2634             if tracks and isinstance(tracks, list):
2635                 for track in tracks:
2636                     if not isinstance(track, dict):
2637                         continue
2638                     track_kind = track.get('kind')
2639                     if not track_kind or not isinstance(track_kind, compat_str):
2640                         continue
2641                     if track_kind.lower() not in ('captions', 'subtitles'):
2642                         continue
2643                     track_url = urljoin(base_url, track.get('file'))
2644                     if not track_url:
2645                         continue
2646                     subtitles.setdefault(track.get('label') or 'en', []).append({
2647                         'url': self._proto_relative_url(track_url)
2648                     })
2649
2650             entry = {
2651                 'id': this_video_id,
2652                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
2653                 'description': video_data.get('description'),
2654                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
2655                 'timestamp': int_or_none(video_data.get('pubdate')),
2656                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
2657                 'subtitles': subtitles,
2658             }
2659             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
2660             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
2661                 entry.update({
2662                     '_type': 'url_transparent',
2663                     'url': formats[0]['url'],
2664                 })
2665             else:
2666                 self._sort_formats(formats)
2667                 entry['formats'] = formats
2668             entries.append(entry)
2669         if len(entries) == 1:
2670             return entries[0]
2671         else:
2672             return self.playlist_result(entries)
2673
2674     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
2675                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2676         urls = []
2677         formats = []
2678         for source in jwplayer_sources_data:
2679             if not isinstance(source, dict):
2680                 continue
2681             source_url = urljoin(
2682                 base_url, self._proto_relative_url(source.get('file')))
2683             if not source_url or source_url in urls:
2684                 continue
2685             urls.append(source_url)
2686             source_type = source.get('type') or ''
2687             ext = mimetype2ext(source_type) or determine_ext(source_url)
2688             if source_type == 'hls' or ext == 'm3u8':
2689                 formats.extend(self._extract_m3u8_formats(
2690                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
2691                     m3u8_id=m3u8_id, fatal=False))
2692             elif source_type == 'dash' or ext == 'mpd':
2693                 formats.extend(self._extract_mpd_formats(
2694                     source_url, video_id, mpd_id=mpd_id, fatal=False))
2695             elif ext == 'smil':
2696                 formats.extend(self._extract_smil_formats(
2697                     source_url, video_id, fatal=False))
2698             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
2699             elif source_type.startswith('audio') or ext in (
2700                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
2701                 formats.append({
2702                     'url': source_url,
2703                     'vcodec': 'none',
2704                     'ext': ext,
2705                 })
2706             else:
2707                 height = int_or_none(source.get('height'))
2708                 if height is None:
2709                     # Often no height is provided but there is a label in
2710                     # format like "1080p", "720p SD", or 1080.
2711                     height = int_or_none(self._search_regex(
2712                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
2713                         'height', default=None))
2714                 a_format = {
2715                     'url': source_url,
2716                     'width': int_or_none(source.get('width')),
2717                     'height': height,
2718                     'tbr': int_or_none(source.get('bitrate')),
2719                     'ext': ext,
2720                 }
2721                 if source_url.startswith('rtmp'):
2722                     a_format['ext'] = 'flv'
2723                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
2724                     # of jwplayer.flash.swf
2725                     rtmp_url_parts = re.split(
2726                         r'((?:mp4|mp3|flv):)', source_url, 1)
2727                     if len(rtmp_url_parts) == 3:
2728                         rtmp_url, prefix, play_path = rtmp_url_parts
2729                         a_format.update({
2730                             'url': rtmp_url,
2731                             'play_path': prefix + play_path,
2732                         })
2733                     if rtmp_params:
2734                         a_format.update(rtmp_params)
2735                 formats.append(a_format)
2736         return formats
2737
2738     def _live_title(self, name):
2739         """ Generate the title for a live video """
2740         now = datetime.datetime.now()
2741         now_str = now.strftime('%Y-%m-%d %H:%M')
2742         return name + ' ' + now_str
2743
2744     def _int(self, v, name, fatal=False, **kwargs):
2745         res = int_or_none(v, **kwargs)
2746         if 'get_attr' in kwargs:
2747             print(getattr(v, kwargs['get_attr']))
2748         if res is None:
2749             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2750             if fatal:
2751                 raise ExtractorError(msg)
2752             else:
2753                 self._downloader.report_warning(msg)
2754         return res
2755
2756     def _float(self, v, name, fatal=False, **kwargs):
2757         res = float_or_none(v, **kwargs)
2758         if res is None:
2759             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2760             if fatal:
2761                 raise ExtractorError(msg)
2762             else:
2763                 self._downloader.report_warning(msg)
2764         return res
2765
2766     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
2767                     path='/', secure=False, discard=False, rest={}, **kwargs):
2768         cookie = compat_cookiejar.Cookie(
2769             0, name, value, port, port is not None, domain, True,
2770             domain.startswith('.'), path, True, secure, expire_time,
2771             discard, None, None, rest)
2772         self._downloader.cookiejar.set_cookie(cookie)
2773
2774     def _get_cookies(self, url):
2775         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2776         req = sanitized_Request(url)
2777         self._downloader.cookiejar.add_cookie_header(req)
2778         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
2779
2780     def get_testcases(self, include_onlymatching=False):
2781         t = getattr(self, '_TEST', None)
2782         if t:
2783             assert not hasattr(self, '_TESTS'), \
2784                 '%s has _TEST and _TESTS' % type(self).__name__
2785             tests = [t]
2786         else:
2787             tests = getattr(self, '_TESTS', [])
2788         for t in tests:
2789             if not include_onlymatching and t.get('only_matching', False):
2790                 continue
2791             t['name'] = type(self).__name__[:-len('IE')]
2792             yield t
2793
2794     def is_suitable(self, age_limit):
2795         """ Test whether the extractor is generally suitable for the given
2796         age limit (i.e. pornographic sites are not, all others usually are) """
2797
2798         any_restricted = False
2799         for tc in self.get_testcases(include_onlymatching=False):
2800             if tc.get('playlist', []):
2801                 tc = tc['playlist'][0]
2802             is_restricted = age_restricted(
2803                 tc.get('info_dict', {}).get('age_limit'), age_limit)
2804             if not is_restricted:
2805                 return True
2806             any_restricted = any_restricted or is_restricted
2807         return not any_restricted
2808
2809     def extract_subtitles(self, *args, **kwargs):
2810         if (self._downloader.params.get('writesubtitles', False) or
2811                 self._downloader.params.get('listsubtitles')):
2812             return self._get_subtitles(*args, **kwargs)
2813         return {}
2814
2815     def _get_subtitles(self, *args, **kwargs):
2816         raise NotImplementedError('This method must be implemented by subclasses')
2817
2818     @staticmethod
2819     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2820         """ Merge subtitle items for one language. Items with duplicated URLs
2821         will be dropped. """
2822         list1_urls = set([item['url'] for item in subtitle_list1])
2823         ret = list(subtitle_list1)
2824         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2825         return ret
2826
2827     @classmethod
2828     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2829         """ Merge two subtitle dictionaries, language by language. """
2830         ret = dict(subtitle_dict1)
2831         for lang in subtitle_dict2:
2832             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2833         return ret
2834
2835     def extract_automatic_captions(self, *args, **kwargs):
2836         if (self._downloader.params.get('writeautomaticsub', False) or
2837                 self._downloader.params.get('listsubtitles')):
2838             return self._get_automatic_captions(*args, **kwargs)
2839         return {}
2840
2841     def _get_automatic_captions(self, *args, **kwargs):
2842         raise NotImplementedError('This method must be implemented by subclasses')
2843
2844     def mark_watched(self, *args, **kwargs):
2845         if (self._downloader.params.get('mark_watched', False) and
2846                 (self._get_login_info()[0] is not None or
2847                     self._downloader.params.get('cookiefile') is not None)):
2848             self._mark_watched(*args, **kwargs)
2849
2850     def _mark_watched(self, *args, **kwargs):
2851         raise NotImplementedError('This method must be implemented by subclasses')
2852
2853     def geo_verification_headers(self):
2854         headers = {}
2855         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2856         if geo_verification_proxy:
2857             headers['Ytdl-request-proxy'] = geo_verification_proxy
2858         return headers
2859
2860     def _generic_id(self, url):
2861         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2862
2863     def _generic_title(self, url):
2864         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
2865
2866
2867 class SearchInfoExtractor(InfoExtractor):
2868     """
2869     Base class for paged search queries extractors.
2870     They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
2871     Instances should define _SEARCH_KEY and _MAX_RESULTS.
2872     """
2873
2874     @classmethod
2875     def _make_valid_url(cls):
2876         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
2877
2878     @classmethod
2879     def suitable(cls, url):
2880         return re.match(cls._make_valid_url(), url) is not None
2881
2882     def _real_extract(self, query):
2883         mobj = re.match(self._make_valid_url(), query)
2884         if mobj is None:
2885             raise ExtractorError('Invalid search query "%s"' % query)
2886
2887         prefix = mobj.group('prefix')
2888         query = mobj.group('query')
2889         if prefix == '':
2890             return self._get_n_results(query, 1)
2891         elif prefix == 'all':
2892             return self._get_n_results(query, self._MAX_RESULTS)
2893         else:
2894             n = int(prefix)
2895             if n <= 0:
2896                 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
2897             elif n > self._MAX_RESULTS:
2898                 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
2899                 n = self._MAX_RESULTS
2900             return self._get_n_results(query, n)
2901
2902     def _get_n_results(self, query, n):
2903         """Get a specified number of results for a query"""
2904         raise NotImplementedError('This method must be implemented by subclasses')
2905
2906     @property
2907     def SEARCH_KEY(self):
2908         return self._SEARCH_KEY