]> jfr.im git - yt-dlp.git/blob - youtube_dl/extractor/common.py
Merge branch 'vlive' of https://github.com/ping/youtube-dl into ping-vlive
[yt-dlp.git] / youtube_dl / extractor / common.py
1 from __future__ import unicode_literals
2
3 import base64
4 import datetime
5 import hashlib
6 import json
7 import netrc
8 import os
9 import re
10 import socket
11 import sys
12 import time
13 import xml.etree.ElementTree
14
15 from ..compat import (
16 compat_cookiejar,
17 compat_cookies,
18 compat_getpass,
19 compat_HTTPError,
20 compat_http_client,
21 compat_urllib_error,
22 compat_urllib_parse,
23 compat_urllib_parse_urlparse,
24 compat_urllib_request,
25 compat_urlparse,
26 compat_str,
27 )
28 from ..utils import (
29 NO_DEFAULT,
30 age_restricted,
31 bug_reports_message,
32 clean_html,
33 compiled_regex_type,
34 determine_ext,
35 ExtractorError,
36 fix_xml_ampersands,
37 float_or_none,
38 int_or_none,
39 RegexNotFoundError,
40 sanitize_filename,
41 unescapeHTML,
42 url_basename,
43 xpath_text,
44 xpath_with_ns,
45 )
46
47
48 class InfoExtractor(object):
49 """Information Extractor class.
50
51 Information extractors are the classes that, given a URL, extract
52 information about the video (or videos) the URL refers to. This
53 information includes the real video URL, the video title, author and
54 others. The information is stored in a dictionary which is then
55 passed to the YoutubeDL. The YoutubeDL processes this
56 information possibly downloading the video to the file system, among
57 other possible outcomes.
58
59 The type field determines the type of the result.
60 By far the most common value (and the default if _type is missing) is
61 "video", which indicates a single video.
62
63 For a video, the dictionaries must include the following fields:
64
65 id: Video identifier.
66 title: Video title, unescaped.
67
68 Additionally, it must contain either a formats entry or a url one:
69
70 formats: A list of dictionaries for each format available, ordered
71 from worst to best quality.
72
73 Potential fields:
74 * url Mandatory. The URL of the video file
75 * ext Will be calculated from URL if missing
76 * format A human-readable description of the format
77 ("mp4 container with h264/opus").
78 Calculated from the format_id, width, height.
79 and format_note fields if missing.
80 * format_id A short description of the format
81 ("mp4_h264_opus" or "19").
82 Technically optional, but strongly recommended.
83 * format_note Additional info about the format
84 ("3D" or "DASH video")
85 * width Width of the video, if known
86 * height Height of the video, if known
87 * resolution Textual description of width and height
88 * tbr Average bitrate of audio and video in KBit/s
89 * abr Average audio bitrate in KBit/s
90 * acodec Name of the audio codec in use
91 * asr Audio sampling rate in Hertz
92 * vbr Average video bitrate in KBit/s
93 * fps Frame rate
94 * vcodec Name of the video codec in use
95 * container Name of the container format
96 * filesize The number of bytes, if known in advance
97 * filesize_approx An estimate for the number of bytes
98 * player_url SWF Player URL (used for rtmpdump).
99 * protocol The protocol that will be used for the actual
100 download, lower-case.
101 "http", "https", "rtsp", "rtmp", "rtmpe",
102 "m3u8", or "m3u8_native".
103 * preference Order number of this format. If this field is
104 present and not None, the formats get sorted
105 by this field, regardless of all other values.
106 -1 for default (order by other properties),
107 -2 or smaller for less than default.
108 < -1000 to hide the format (if there is
109 another one which is strictly better)
110 * language_preference Is this in the correct requested
111 language?
112 10 if it's what the URL is about,
113 -1 for default (don't know),
114 -10 otherwise, other values reserved for now.
115 * quality Order number of the video quality of this
116 format, irrespective of the file format.
117 -1 for default (order by other properties),
118 -2 or smaller for less than default.
119 * source_preference Order number for this video source
120 (quality takes higher priority)
121 -1 for default (order by other properties),
122 -2 or smaller for less than default.
123 * http_headers A dictionary of additional HTTP headers
124 to add to the request.
125 * stretched_ratio If given and not 1, indicates that the
126 video's pixels are not square.
127 width : height ratio as float.
128 * no_resume The server does not support resuming the
129 (HTTP or RTMP) download. Boolean.
130
131 url: Final video URL.
132 ext: Video filename extension.
133 format: The video format, defaults to ext (used for --get-format)
134 player_url: SWF Player URL (used for rtmpdump).
135
136 The following fields are optional:
137
138 alt_title: A secondary title of the video.
139 display_id An alternative identifier for the video, not necessarily
140 unique, but available before title. Typically, id is
141 something like "4234987", title "Dancing naked mole rats",
142 and display_id "dancing-naked-mole-rats"
143 thumbnails: A list of dictionaries, with the following entries:
144 * "id" (optional, string) - Thumbnail format ID
145 * "url"
146 * "preference" (optional, int) - quality of the image
147 * "width" (optional, int)
148 * "height" (optional, int)
149 * "resolution" (optional, string "{width}x{height}",
150 deprecated)
151 thumbnail: Full URL to a video thumbnail image.
152 description: Full video description.
153 uploader: Full name of the video uploader.
154 creator: The main artist who created the video.
155 timestamp: UNIX timestamp of the moment the video became available.
156 upload_date: Video upload date (YYYYMMDD).
157 If not explicitly set, calculated from timestamp.
158 uploader_id: Nickname or id of the video uploader.
159 location: Physical location where the video was filmed.
160 subtitles: The available subtitles as a dictionary in the format
161 {language: subformats}. "subformats" is a list sorted from
162 lower to higher preference, each element is a dictionary
163 with the "ext" entry and one of:
164 * "data": The subtitles file contents
165 * "url": A URL pointing to the subtitles file
166 automatic_captions: Like 'subtitles', used by the YoutubeIE for
167 automatically generated captions
168 duration: Length of the video in seconds, as an integer.
169 view_count: How many users have watched the video on the platform.
170 like_count: Number of positive ratings of the video
171 dislike_count: Number of negative ratings of the video
172 average_rating: Average rating given by users, the scale used depends on the webpage
173 comment_count: Number of comments on the video
174 comments: A list of comments, each with one or more of the following
175 properties (all but one of text or html optional):
176 * "author" - human-readable name of the comment author
177 * "author_id" - user ID of the comment author
178 * "id" - Comment ID
179 * "html" - Comment as HTML
180 * "text" - Plain text of the comment
181 * "timestamp" - UNIX timestamp of comment
182 * "parent" - ID of the comment this one is replying to.
183 Set to "root" to indicate that this is a
184 comment to the original video.
185 age_limit: Age restriction for the video, as an integer (years)
186 webpage_url: The URL to the video webpage, if given to youtube-dl it
187 should allow you to get the same result again. (It will be set
188 by YoutubeDL if it's missing)
189 categories: A list of categories that the video falls in, for example
190 ["Sports", "Berlin"]
191 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
192 is_live: True, False, or None (=unknown). Whether this video is a
193 live stream that goes on instead of a fixed-length video.
194 start_time: Time in seconds where the reproduction should start, as
195 specified in the URL.
196 end_time: Time in seconds where the reproduction should end, as
197 specified in the URL.
198
199 Unless mentioned otherwise, the fields should be Unicode strings.
200
201 Unless mentioned otherwise, None is equivalent to absence of information.
202
203
204 _type "playlist" indicates multiple videos.
205 There must be a key "entries", which is a list, an iterable, or a PagedList
206 object, each element of which is a valid dictionary by this specification.
207
208 Additionally, playlists can have "title", "description" and "id" attributes
209 with the same semantics as videos (see above).
210
211
212 _type "multi_video" indicates that there are multiple videos that
213 form a single show, for example, multiple acts of an opera or TV episode.
214 It must have an entries key like a playlist and contain all the keys
215 required for a video at the same time.
216
217
218 _type "url" indicates that the video must be extracted from another
219 location, possibly by a different extractor. Its only required key is:
220 "url" - the next URL to extract.
221 The key "ie_key" can be set to the class name (minus the trailing "IE",
222 e.g. "Youtube") if the extractor class is known in advance.
223 Additionally, the dictionary may have any properties of the resolved entity
224 known in advance, for example "title" if the title of the referred video is
225 known ahead of time.
226
227
228 _type "url_transparent" entities have the same specification as "url", but
229 indicate that the given additional information is more precise than the one
230 associated with the resolved URL.
231 This is useful when a site employs a video service that hosts the video and
232 its technical metadata, but that video service does not embed a useful
233 title, description etc.
234
235
236 Subclasses of this one should re-define the _real_initialize() and
237 _real_extract() methods and define a _VALID_URL regexp.
238 Probably, they should also be added to the list of extractors.
239
240 Finally, the _WORKING attribute should be set to False for broken IEs
241 in order to warn the users and skip the tests.
242 """
243
    # Whether initialize() has already run for this instance.
    _ready = False
    # The owning YoutubeDL instance (set via set_downloader()); may be None.
    _downloader = None
    # Set to False in subclasses to mark a broken extractor
    # (warns users and skips the tests).
    _WORKING = True
247
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # Lazy setup: the real work happens in initialize()/_real_initialize().
        self._ready = False
        self.set_downloader(downloader)
252
253 @classmethod
254 def suitable(cls, url):
255 """Receives a URL and returns True if suitable for this IE."""
256
257 # This does not use has/getattr intentionally - we want to know whether
258 # we have cached the regexp for *this* class, whereas getattr would also
259 # match the superclass
260 if '_VALID_URL_RE' not in cls.__dict__:
261 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
262 return cls._VALID_URL_RE.match(url) is not None
263
264 @classmethod
265 def _match_id(cls, url):
266 if '_VALID_URL_RE' not in cls.__dict__:
267 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
268 m = cls._VALID_URL_RE.match(url)
269 assert m
270 return m.group('id')
271
    @classmethod
    def working(cls):
        """Getter method for _WORKING (False marks a known-broken extractor)."""
        return cls._WORKING
276
277 def initialize(self):
278 """Initializes an instance (authentication, etc)."""
279 if not self._ready:
280 self._real_initialize()
281 self._ready = True
282
283 def extract(self, url):
284 """Extracts URL information and returns it in list of dicts."""
285 try:
286 self.initialize()
287 return self._real_extract(url)
288 except ExtractorError:
289 raise
290 except compat_http_client.IncompleteRead as e:
291 raise ExtractorError('A network error has occured.', cause=e, expected=True)
292 except (KeyError, StopIteration) as e:
293 raise ExtractorError('An extractor error has occured.', cause=e)
294
    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        # downloader is the YoutubeDL instance used for all output and network
        # access; may be None (several helpers guard against that case).
        self._downloader = downloader
298
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # Default: no setup (e.g. login) needed.
        pass
302
    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # Default: extracts nothing.
        pass
306
    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        # Extractor class names end in "IE" by convention; strip that suffix
        # (e.g. "YoutubeIE" -> "Youtube").
        return cls.__name__[:-2]
311
    @property
    def IE_NAME(self):
        # Human-readable extractor name; same derivation as ie_key(), but
        # available on instances for log prefixes.
        return type(self).__name__[:-2]
315
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the response handle.

        On network errors: raises ExtractorError when fatal, otherwise warns
        and returns False; errnote=False suppresses even the warning.
        """
        if note is None:
            # Default progress message.
            self.report_download_webpage(video_id)
        elif note is not False:
            # note=False silences the status line entirely.
            if video_id is None:
                self.to_screen('%s' % (note,))
            else:
                self.to_screen('%s: %s' % (video_id, note))
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'
            errmsg = '%s: %s' % (errnote, compat_str(err))
            if fatal:
                # Pass the original traceback so the network error is debuggable.
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self._downloader.report_warning(errmsg)
                return False
338
    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
        """ Returns a tuple (page content as string, URL handle),
        or False on non-fatal download failure. """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
        if urlh is False:
            # _request_webpage only returns False when fatal is disabled.
            assert not fatal
            return False
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
        return (content, urlh)
351
352 @staticmethod
353 def _guess_encoding_from_content(content_type, webpage_bytes):
354 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
355 if m:
356 encoding = m.group(1)
357 else:
358 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
359 webpage_bytes[:1024])
360 if m:
361 encoding = m.group(1).decode('ascii')
362 elif webpage_bytes.startswith(b'\xff\xfe'):
363 encoding = 'utf-16'
364 else:
365 encoding = 'utf-8'
366
367 return encoding
368
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        """Read and decode the body of an open response handle.

        Honours the dump_intermediate_pages/write_pages options, guesses the
        text encoding when none is given, and raises ExtractorError for
        known filtering/censorship block pages.
        """
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            # Caller-supplied bytes to prepend (e.g. stripped XML prologue).
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen('Dumping request to ' + url)
            # base64 so binary bodies survive the terminal.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            basen = '%s_%s' % (video_id, url)
            if len(basen) > 240:
                # Keep the filename short; disambiguate with an MD5 of the full name.
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if os.name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # Unknown codec name: fall back to UTF-8 rather than crash.
            content = webpage_bytes.decode('utf-8', 'replace')

        # Detect the Websense filtering block page.
        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in content[:512]):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        # Detect the Indian censorship block page.
        if '<title>The URL you requested has been blocked</title>' in content[:512]:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)

        return content
431
    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
        """ Returns the data of the page as a string.

        Retries up to `tries` times on truncated (IncompleteRead) responses,
        sleeping `timeout` seconds between attempts.
        """
        success = False
        try_count = 0
        while success is False:
            try:
                res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
                success = True
            except compat_http_client.IncompleteRead as e:
                try_count += 1
                if try_count >= tries:
                    # Out of retries: propagate the truncation error.
                    raise e
                self._sleep(timeout, video_id)
        if res is False:
            # Non-fatal download failure from the handle helper.
            return res
        else:
            content, _ = res
            return content
450
    def _download_xml(self, url_or_request, video_id,
                      note='Downloading XML', errnote='Unable to download XML',
                      transform_source=None, fatal=True, encoding=None):
        """Return the xml as an xml.etree.ElementTree.Element
        (or False on non-fatal download failure)."""
        xml_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
        if xml_string is False:
            return xml_string
        if transform_source:
            # Pre-parse cleanup hook, e.g. fix_xml_ampersands for broken feeds.
            xml_string = transform_source(xml_string)
        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
462
    def _download_json(self, url_or_request, video_id,
                       note='Downloading JSON metadata',
                       errnote='Unable to download JSON metadata',
                       transform_source=None,
                       fatal=True, encoding=None):
        """Download and parse a JSON document; returns None on a
        non-fatal download failure."""
        json_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding)
        if (not fatal) and json_string is False:
            return None
        return self._parse_json(
            json_string, video_id, transform_source=transform_source, fatal=fatal)
475
476 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
477 if transform_source:
478 json_string = transform_source(json_string)
479 try:
480 return json.loads(json_string)
481 except ValueError as ve:
482 errmsg = '%s: Failed to parse JSON ' % video_id
483 if fatal:
484 raise ExtractorError(errmsg, cause=ve)
485 else:
486 self.report_warning(errmsg + str(ve))
487
    def report_warning(self, msg, video_id=None):
        # Prefix warnings with the extractor name and, when known, the video id.
        idstr = '' if video_id is None else '%s: ' % video_id
        self._downloader.report_warning(
            '[%s] %s%s' % (self.IE_NAME, idstr, msg))
492
    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
496
    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen('%s: Extracting information' % id_or_name)
500
    def report_download_webpage(self, video_id):
        """Report webpage download (default note of _request_webpage)."""
        self.to_screen('%s: Downloading webpage' % video_id)
504
    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')
508
    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')
512
    @staticmethod
    def raise_login_required(msg='This video is only available for registered users'):
        # expected=True: missing credentials are a user-actionable condition,
        # not an extractor bug.
        raise ExtractorError(
            '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
            expected=True)
518
519 # Methods for following #608
520 @staticmethod
521 def url_result(url, ie=None, video_id=None, video_title=None):
522 """Returns a URL that points to a page that should be processed"""
523 # TODO: ie should be the class used for getting the info
524 video_info = {'_type': 'url',
525 'url': url,
526 'ie_key': ie}
527 if video_id is not None:
528 video_info['id'] = video_id
529 if video_title is not None:
530 video_info['title'] = video_title
531 return video_info
532
533 @staticmethod
534 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
535 """Returns a playlist"""
536 video_info = {'_type': 'playlist',
537 'entries': entries}
538 if playlist_id:
539 video_info['id'] = playlist_id
540 if playlist_title:
541 video_info['title'] = playlist_title
542 if playlist_description:
543 video_info['description'] = playlist_description
544 return video_info
545
    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            # Try each pattern in turn, stopping at the first match.
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        # Colour the field name blue when stderr is a colour-capable tty.
        if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
            _name = '\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            if group is None:
                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
            else:
                return mobj.group(group)
        elif default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract %s' % _name)
        else:
            self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
            return None
579
    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
        if res:
            return clean_html(res).strip()
        else:
            # None or empty match — return as-is (may be the caller's default).
            return res
589
    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # Best effort: a missing/broken .netrc should not abort extraction.
                self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))

        return (username, password)
619
    def _get_tfa_info(self, note='two-factor verification code'):
        """
        Get the two-factor authentication info
        TODO - asking the user will be required for sms/phone verify
        currently just uses the command line option
        If there's no info available, return None
        """
        if self._downloader is None:
            return None
        downloader_params = self._downloader.params

        if downloader_params.get('twofactor', None) is not None:
            return downloader_params['twofactor']

        # No --twofactor given: prompt interactively.
        return compat_getpass('Type %s and press [Return]: ' % note)
635
636 # Helper functions for extracting OpenGraph info
637 @staticmethod
638 def _og_regexes(prop):
639 content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
640 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
641 template = r'<meta[^>]+?%s[^>]+?%s'
642 return [
643 template % (property_re, content_re),
644 template % (content_re, property_re),
645 ]
646
    @staticmethod
    def _meta_regex(prop):
        # Case-insensitive, dotall, verbose pattern matching a <meta> tag whose
        # identifying attribute (itemprop/name/property/id/http-equiv) equals
        # prop; the value ends up in the named group 'content'.
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
652
    def _og_search_property(self, prop, html, name=None, **kargs):
        """Search html for OpenGraph property prop and return its unescaped
        content value (None when a default of None applied)."""
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)
660
    def _og_search_thumbnail(self, html, **kargs):
        # Non-fatal: a thumbnail is optional metadata.
        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
663
    def _og_search_description(self, html, **kargs):
        # Non-fatal: a description is optional metadata.
        return self._og_search_property('description', html, fatal=False, **kargs)
666
    def _og_search_title(self, html, **kargs):
        # Fatal by default: the title is a mandatory field.
        return self._og_search_property('title', html, **kargs)
669
    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        # og:video and og:video:url, preferring og:video:secure_url when
        # secure is requested.
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)
675
    def _og_search_url(self, html, **kargs):
        # og:url — the canonical URL of the page.
        return self._og_search_property('url', html, **kargs)
678
    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        """Extract the content attribute of the <meta> tag identified by name."""
        if display_name is None:
            display_name = name
        return self._html_search_regex(
            self._meta_regex(name),
            html, display_name, fatal=fatal, group='content', **kwargs)
685
    def _dc_search_uploader(self, html):
        # Dublin Core creator metadata used as the uploader name.
        return self._html_search_meta('dc.creator', html, 'uploader')
688
689 def _rta_search(self, html):
690 # See http://www.rtalabel.org/index.php?content=howtofaq#single
691 if re.search(r'(?ix)<meta\s+name="rating"\s+'
692 r' content="RTA-5042-1996-1400-1577-RTA"',
693 html):
694 return 18
695 return 0
696
    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        # Map the textual rating to a minimum age; unknown values yield None.
        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower(), None)
712
    def _family_friendly_search(self, html):
        # See http://schema.org/VideoObject
        family_friendly = self._html_search_meta('isFamilyFriendly', html)

        if not family_friendly:
            return None

        # isFamilyFriendly true/1 -> no age limit; false/0 -> adults only.
        RATING_TABLE = {
            '1': 0,
            'true': 0,
            '0': 18,
            'false': 18,
        }
        return RATING_TABLE.get(family_friendly.lower(), None)
727
    def _twitter_search_player(self, html):
        # Twitter Card player URL (the twitter:player <meta> tag).
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')
731
732 @staticmethod
733 def _hidden_inputs(html):
734 hidden_inputs = {}
735 for input in re.findall(r'<input([^>]+)>', html):
736 if not re.search(r'type=(["\'])hidden\1', input):
737 continue
738 name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
739 if not name:
740 continue
741 value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
742 if not value:
743 continue
744 hidden_inputs[name.group('value')] = value.group('value')
745 return hidden_inputs
746
    def _form_hidden_inputs(self, form_id, html):
        """Return the hidden inputs of the <form> with the given id attribute."""
        form = self._search_regex(
            r'(?s)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
            html, '%s form' % form_id, group='form')
        return self._hidden_inputs(form)
752
    def _sort_formats(self, formats, field_preference=None):
        """Sort formats in place from worst to best quality.

        When field_preference (list/tuple of field names) is given it fully
        determines the order; otherwise a built-in heuristic key is used.
        """
        if not formats:
            raise ExtractorError('No video formats found')

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            if isinstance(field_preference, (list, tuple)):
                # Caller-specified ordering overrides all built-in heuristics;
                # missing fields count as -1.
                return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                proto = f.get('protocol')
                if proto is None:
                    proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

                # Plain HTTP(S) is slightly preferred over other protocols.
                preference = 0 if proto in ['http', 'https'] else -0.1
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            if f.get('vcodec') == 'none':  # audio only
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # Missing numeric fields sort as -1 (i.e. worst).
            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
815
    def _check_formats(self, formats, video_id):
        """Drop formats whose URLs fail a quick availability probe
        (modifies the formats list in place)."""
        if formats:
            formats[:] = filter(
                lambda f: self._is_valid_url(
                    f['url'], video_id,
                    item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
                formats)
823
    def _is_valid_url(self, url, video_id, item='video'):
        """Probe url with a request; returns False only on HTTP errors."""
        url = self._proto_relative_url(url, scheme='http:')
        # For now assume non HTTP(S) URLs always valid
        if not (url.startswith('http://') or url.startswith('https://')):
            return True
        try:
            self._request_webpage(url, video_id, 'Checking %s URL' % item)
            return True
        except ExtractorError as e:
            if isinstance(e.cause, compat_HTTPError):
                self.to_screen(
                    '%s: %s URL is invalid, skipping' % (video_id, item))
                return False
            # Non-HTTP failures are unexpected here: propagate them.
            raise
838
839 def http_scheme(self):
840 """ Either "http:" or "https:", depending on the user's preferences """
841 return (
842 'http:'
843 if self._downloader.params.get('prefer_insecure', False)
844 else 'https:')
845
846 def _proto_relative_url(self, url, scheme=None):
847 if url is None:
848 return url
849 if url.startswith('//'):
850 if scheme is None:
851 scheme = self.http_scheme()
852 return scheme + url
853 else:
854 return url
855
    def _sleep(self, timeout, video_id, msg_template=None):
        """Sleep for timeout seconds, telling the user why (used e.g. for
        download retries)."""
        if msg_template is None:
            msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
        msg = msg_template % {'video_id': video_id, 'timeout': timeout}
        self.to_screen(msg)
        time.sleep(timeout)
862
    def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
                             transform_source=lambda s: fix_xml_ampersands(s).strip()):
        """Parse an Adobe HDS (f4m) manifest into a sorted list of format dicts."""
        manifest = self._download_xml(
            manifest_url, video_id, 'Downloading f4m manifest',
            'Unable to download f4m manifest',
            # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
            # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
            transform_source=transform_source)

        formats = []
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        if not media_nodes:
            # Fall back to the 2.0 namespace when no 1.0 media nodes exist.
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        for i, media_el in enumerate(media_nodes):
            if manifest_version == '2.0':
                # v2 manifests reference per-bitrate URLs, possibly relative.
                media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
                if not media_url:
                    continue
                manifest_url = (
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                if determine_ext(manifest_url) == 'f4m':
                    formats.extend(self._extract_f4m_formats(manifest_url, video_id, preference, f4m_id))
                    continue
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            formats.append({
                # Use the bitrate as the id suffix when known, else the index.
                'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
                'url': manifest_url,
                'ext': 'flv',
                'tbr': tbr,
                'width': int_or_none(media_el.attrib.get('width')),
                'height': int_or_none(media_el.attrib.get('height')),
                'preference': preference,
            })
        self._sort_formats(formats)

        return formats
906
    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None,
                              m3u8_id=None, note=None, errnote=None,
                              fatal=True):
        """Download an HLS (m3u8) master playlist and return a sorted list of
        format dicts.

        A synthetic 'meta' entry for the playlist URL itself is always
        included (with lowered preference) so quality selection can be
        deferred to the downloader.  Returns False when the download fails
        and fatal is False.
        """

        # NOTE(review): 'preference - 1 if preference else -1' treats a
        # preference of 0 the same as None — confirm that is intended.
        formats = [{
            'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
            'url': m3u8_url,
            'ext': ext,
            'protocol': 'm3u8',
            'preference': preference - 1 if preference else -1,
            'resolution': 'multiple',
            'format_note': 'Quality selection URL',
        }]

        # Resolve variant URIs relative to the playlist URL.
        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        m3u8_doc = self._download_webpage(
            m3u8_url, video_id,
            note=note or 'Downloading m3u8 information',
            errnote=errnote or 'Failed to download m3u8 information',
            fatal=fatal)
        if m3u8_doc is False:
            # Non-fatal download failure: propagate the False sentinel.
            return m3u8_doc
        last_info = None
        last_media = None
        # KEY=VALUE attribute pairs; values may be quoted (e.g. CODECS="...").
        kv_rex = re.compile(
            r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                # Attributes of the variant stream named on the next URI line.
                last_info = {}
                for m in kv_rex.finditer(line):
                    v = m.group('val')
                    if v.startswith('"'):
                        v = v[1:-1]
                    last_info[m.group('key')] = v
            elif line.startswith('#EXT-X-MEDIA:'):
                last_media = {}
                for m in kv_rex.finditer(line):
                    v = m.group('val')
                    if v.startswith('"'):
                        v = v[1:-1]
                    last_media[m.group('key')] = v
            elif line.startswith('#') or not line.strip():
                # Other tags and blank lines carry no format information.
                continue
            else:
                # A URI line: pair it with the preceding #EXT-X-STREAM-INF.
                if last_info is None:
                    formats.append({'url': format_url(line)})
                    continue
                tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                # Prefer the rendition NAME (unless it's a subtitles track),
                # then the bitrate, then a running index.
                last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
                format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
                f = {
                    'format_id': '-'.join(format_id),
                    'url': format_url(line.strip()),
                    'tbr': tbr,
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                codecs = last_info.get('CODECS')
                if codecs:
                    # TODO: looks like video codec is not always necessarily goes first
                    va_codecs = codecs.split(',')
                    if va_codecs[0]:
                        f['vcodec'] = va_codecs[0].partition('.')[0]
                    if len(va_codecs) > 1 and va_codecs[1]:
                        f['acodec'] = va_codecs[1].partition('.')[0]
                resolution = last_info.get('RESOLUTION')
                if resolution:
                    width_str, height_str = resolution.split('x')
                    f['width'] = int(width_str)
                    f['height'] = int(height_str)
                # Attach the #EXT-X-MEDIA attributes once, to the next format.
                if last_media is not None:
                    f['m3u8_media'] = last_media
                    last_media = None
                formats.append(f)
                last_info = {}
        self._sort_formats(formats)
        return formats
993
994 @staticmethod
995 def _xpath_ns(path, namespace=None):
996 if not namespace:
997 return path
998 out = []
999 for c in path.split('/'):
1000 if not c or c == '.':
1001 out.append(c)
1002 else:
1003 out.append('{%s}%s' % (namespace, c))
1004 return '/'.join(out)
1005
1006 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
1007 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1008
1009 if smil is False:
1010 assert not fatal
1011 return []
1012
1013 namespace = self._parse_smil_namespace(smil)
1014
1015 return self._parse_smil_formats(
1016 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1017
1018 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1019 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1020 if smil is False:
1021 return {}
1022 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1023
    def _download_smil(self, smil_url, video_id, fatal=True):
        # Fetch and XML-parse the SMIL document.  Callers treat a False
        # return value as a failed non-fatal download.
        return self._download_xml(
            smil_url, video_id, 'Downloading SMIL file',
            'Unable to download SMIL file', fatal=fatal)
1028
1029 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1030 namespace = self._parse_smil_namespace(smil)
1031
1032 formats = self._parse_smil_formats(
1033 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1034 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1035
1036 video_id = os.path.splitext(url_basename(smil_url))[0]
1037 title = None
1038 description = None
1039 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1040 name = meta.attrib.get('name')
1041 content = meta.attrib.get('content')
1042 if not name or not content:
1043 continue
1044 if not title and name == 'title':
1045 title = content
1046 elif not description and name in ('description', 'abstract'):
1047 description = content
1048
1049 return {
1050 'id': video_id,
1051 'title': title or video_id,
1052 'description': description,
1053 'formats': formats,
1054 'subtitles': subtitles,
1055 }
1056
    def _parse_smil_namespace(self, smil):
        # Extract the XML namespace URI from the root tag (e.g.
        # '{http://www.w3.org/ns/SMIL}smil' -> 'http://www.w3.org/ns/SMIL');
        # None when the document is unnamespaced.
        return self._search_regex(
            r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1060
    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        """Collect format dicts from the <video> nodes of a parsed SMIL doc.

        Dispatches on the transport: RTMP streams; m3u8 (HLS) and f4m (HDS)
        sub-manifests, which are extracted recursively; and plain HTTP
        downloads.  transform_rtmp_url, when given, rewrites the
        (streamer, play_path) pair of RTMP formats.
        """
        # Base URL for relative sources: a <meta base=.../httpBase=...>
        # entry if present, otherwise the SMIL URL itself.
        base = smil_url
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
            if b:
                base = b
                break

        formats = []
        rtmp_count = 0
        http_count = 0

        videos = smil.findall(self._xpath_ns('.//video', namespace))
        for video in videos:
            src = video.get('src')
            if not src:
                continue

            bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
            filesize = int_or_none(video.get('size') or video.get('fileSize'))
            width = int_or_none(video.get('width'))
            height = int_or_none(video.get('height'))
            proto = video.get('proto')
            ext = video.get('ext')
            src_ext = determine_ext(src)
            streamer = video.get('streamer') or base

            # RTMP: the streamer is the connection URL, src the play path.
            if proto == 'rtmp' or streamer.startswith('rtmp'):
                rtmp_count += 1
                formats.append({
                    'url': streamer,
                    'play_path': src,
                    'ext': 'flv',
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                if transform_rtmp_url:
                    streamer, src = transform_rtmp_url(streamer, src)
                    formats[-1].update({
                        'url': streamer,
                        'play_path': src,
                    })
                continue

            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)

            # HLS sub-manifest: delegate to the m3u8 extractor.
            if proto == 'm3u8' or src_ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls'))
                continue

            # HDS sub-manifest: append the (default Flowplayer) hdcore query
            # parameters and delegate to the f4m extractor.
            if src_ext == 'f4m':
                f4m_url = src_url
                if not f4m_params:
                    f4m_params = {
                        'hdcore': '3.2.0',
                        'plugin': 'flowplayer-3.2.0.1',
                    }
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse.urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds'))
                continue

            # Plain progressive HTTP download.
            if src_url.startswith('http'):
                http_count += 1
                formats.append({
                    'url': src_url,
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                continue

        self._sort_formats(formats)

        return formats
1143
1144 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1145 subtitles = {}
1146 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1147 src = textstream.get('src')
1148 if not src:
1149 continue
1150 ext = textstream.get('ext') or determine_ext(src)
1151 if not ext:
1152 type_ = textstream.get('type')
1153 SUBTITLES_TYPES = {
1154 'text/vtt': 'vtt',
1155 'text/srt': 'srt',
1156 'application/smptett+xml': 'tt',
1157 }
1158 if type_ in SUBTITLES_TYPES:
1159 ext = SUBTITLES_TYPES[type_]
1160 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1161 subtitles.setdefault(lang, []).append({
1162 'url': src,
1163 'ext': ext,
1164 })
1165 return subtitles
1166
1167 def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1168 xspf = self._download_xml(
1169 playlist_url, playlist_id, 'Downloading xpsf playlist',
1170 'Unable to download xspf manifest', fatal=fatal)
1171 if xspf is False:
1172 return []
1173 return self._parse_xspf(xspf, playlist_id)
1174
1175 def _parse_xspf(self, playlist, playlist_id):
1176 NS_MAP = {
1177 'xspf': 'http://xspf.org/ns/0/',
1178 's1': 'http://static.streamone.nl/player/ns/0',
1179 }
1180
1181 entries = []
1182 for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1183 title = xpath_text(
1184 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1185 description = xpath_text(
1186 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1187 thumbnail = xpath_text(
1188 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1189 duration = float_or_none(
1190 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1191
1192 formats = [{
1193 'url': location.text,
1194 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1195 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1196 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1197 } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1198 self._sort_formats(formats)
1199
1200 entries.append({
1201 'id': playlist_id,
1202 'title': title,
1203 'description': description,
1204 'thumbnail': thumbnail,
1205 'duration': duration,
1206 'formats': formats,
1207 })
1208 return entries
1209
1210 def _live_title(self, name):
1211 """ Generate the title for a live video """
1212 now = datetime.datetime.now()
1213 now_str = now.strftime("%Y-%m-%d %H:%M")
1214 return name + ' ' + now_str
1215
1216 def _int(self, v, name, fatal=False, **kwargs):
1217 res = int_or_none(v, **kwargs)
1218 if 'get_attr' in kwargs:
1219 print(getattr(v, kwargs['get_attr']))
1220 if res is None:
1221 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1222 if fatal:
1223 raise ExtractorError(msg)
1224 else:
1225 self._downloader.report_warning(msg)
1226 return res
1227
1228 def _float(self, v, name, fatal=False, **kwargs):
1229 res = float_or_none(v, **kwargs)
1230 if res is None:
1231 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1232 if fatal:
1233 raise ExtractorError(msg)
1234 else:
1235 self._downloader.report_warning(msg)
1236 return res
1237
    def _set_cookie(self, domain, name, value, expire_time=None):
        # Install a cookie for *domain* into the downloader's cookiejar;
        # with expire_time unset it behaves as a session cookie.  The
        # positional Cookie() arguments are: version, name, value, port,
        # port_specified, domain, domain_specified, domain_initial_dot,
        # path, path_specified, secure, expires, discard, comment,
        # comment_url, rest (per the http.cookiejar.Cookie constructor).
        cookie = compat_cookiejar.Cookie(
            0, name, value, None, None, domain, None,
            None, '/', True, False, expire_time, '', None, None, None)
        self._downloader.cookiejar.set_cookie(cookie)
1243
    def _get_cookies(self, url):
        """ Return a compat_cookies.SimpleCookie with the cookies for the url """
        # Build a throwaway request and let the cookiejar compute the Cookie
        # header it would send for this URL, then parse that header back
        # into a SimpleCookie.
        req = compat_urllib_request.Request(url)
        self._downloader.cookiejar.add_cookie_header(req)
        return compat_cookies.SimpleCookie(req.get_header('Cookie'))
1249
1250 def get_testcases(self, include_onlymatching=False):
1251 t = getattr(self, '_TEST', None)
1252 if t:
1253 assert not hasattr(self, '_TESTS'), \
1254 '%s has _TEST and _TESTS' % type(self).__name__
1255 tests = [t]
1256 else:
1257 tests = getattr(self, '_TESTS', [])
1258 for t in tests:
1259 if not include_onlymatching and t.get('only_matching', False):
1260 continue
1261 t['name'] = type(self).__name__[:-len('IE')]
1262 yield t
1263
1264 def is_suitable(self, age_limit):
1265 """ Test whether the extractor is generally suitable for the given
1266 age limit (i.e. pornographic sites are not, all others usually are) """
1267
1268 any_restricted = False
1269 for tc in self.get_testcases(include_onlymatching=False):
1270 if 'playlist' in tc:
1271 tc = tc['playlist'][0]
1272 is_restricted = age_restricted(
1273 tc.get('info_dict', {}).get('age_limit'), age_limit)
1274 if not is_restricted:
1275 return True
1276 any_restricted = any_restricted or is_restricted
1277 return not any_restricted
1278
1279 def extract_subtitles(self, *args, **kwargs):
1280 if (self._downloader.params.get('writesubtitles', False) or
1281 self._downloader.params.get('listsubtitles')):
1282 return self._get_subtitles(*args, **kwargs)
1283 return {}
1284
    def _get_subtitles(self, *args, **kwargs):
        # Subclass hook performing the actual subtitle extraction; called by
        # extract_subtitles() only when the user requested subtitles.
        raise NotImplementedError("This method must be implemented by subclasses")
1287
1288 @staticmethod
1289 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1290 """ Merge subtitle items for one language. Items with duplicated URLs
1291 will be dropped. """
1292 list1_urls = set([item['url'] for item in subtitle_list1])
1293 ret = list(subtitle_list1)
1294 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
1295 return ret
1296
1297 @classmethod
1298 def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
1299 """ Merge two subtitle dictionaries, language by language. """
1300 ret = dict(subtitle_dict1)
1301 for lang in subtitle_dict2:
1302 ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
1303 return ret
1304
1305 def extract_automatic_captions(self, *args, **kwargs):
1306 if (self._downloader.params.get('writeautomaticsub', False) or
1307 self._downloader.params.get('listsubtitles')):
1308 return self._get_automatic_captions(*args, **kwargs)
1309 return {}
1310
    def _get_automatic_captions(self, *args, **kwargs):
        # Subclass hook performing the actual automatic-caption extraction;
        # called by extract_automatic_captions() only when requested.
        raise NotImplementedError("This method must be implemented by subclasses")
1313
1314
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # '<key>:<query>', '<key><n>:<query>' or '<key>all:<query>'
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        # Empty prefix: first hit only; 'all': the extractor's maximum;
        # otherwise an explicit result count, clamped to _MAX_RESULTS.
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY