youtube_dl/extractor/common.py

   1 import base64
   2 import os
   3 import re
   4 import socket
   5 import sys
   6 import netrc
   7
   8 from ..utils import (
   9     compat_http_client,
  10     compat_urllib_error,
  11     compat_urllib_request,
  12     compat_str,
  13
  14     clean_html,
  15     compiled_regex_type,
  16     ExtractorError,
  17     RegexNotFoundError,
  18     sanitize_filename,
  19     unescapeHTML,
  20 )
  21
  22 class InfoExtractor(object):
  23     """Information Extractor class.
  24
  25     Information extractors are the classes that, given a URL, extract
  26     information about the video (or videos) the URL refers to. This
  27     information includes the real video URL, the video title, author and
  28     others. The information is stored in a dictionary which is then
  29     passed to the FileDownloader. The FileDownloader processes this
  30     information possibly downloading the video to the file system, among
  31     other possible outcomes.
  32
  33     The dictionaries must include the following fields:
  34
  35     id:             Video identifier.
  36     url:            Final video URL.
  37     title:          Video title, unescaped.
  38     ext:            Video filename extension.
  39
  40     Instead of url and ext, formats can also specified.
  41
  42     The following fields are optional:
  43
  44     format:         The video format, defaults to ext (used for --get-format)
  45     thumbnails:     A list of dictionaries (with the entries "resolution" and
  46                     "url") for the varying thumbnails
  47     thumbnail:      Full URL to a video thumbnail image.
  48     description:    One-line video description.
  49     uploader:       Full name of the video uploader.
  50     upload_date:    Video upload date (YYYYMMDD).
  51     uploader_id:    Nickname or id of the video uploader.
  52     location:       Physical location of the video.
  53     player_url:     SWF Player URL (used for rtmpdump).
  54     subtitles:      The subtitle file contents as a dictionary in the format
  55                     {language: subtitles}.
  56     view_count:     How many users have watched the video on the platform.
  57     urlhandle:      [internal] The urlHandle to be used to download the file,
  58                     like returned by urllib.request.urlopen
  59     age_limit:      Age restriction for the video, as an integer (years)
  60     formats:        A list of dictionaries for each format available, it must
  61                     be ordered from worst to best quality. Potential fields:
  62                     * url       Mandatory. The URL of the video file
  63                     * ext       Will be calculated from url if missing
  64                     * format    A human-readable description of the format
  65                                 ("mp4 container with h264/opus").
  66                                 Calculated from the format_id, width, height.
  67                                 and format_note fields if missing.
  68                     * format_id A short description of the format
  69                                 ("mp4_h264_opus" or "19")
  70                     * format_note Additional info about the format
  71                                 ("3D" or "DASH video")
  72                     * width     Width of the video, if known
  73                     * height    Height of the video, if known
  74     webpage_url:    The url to the video webpage, if given to youtube-dl it
  75                     should allow to get the same result again. (It will be set
  76                     by YoutubeDL if it's missing)
  77
  78     Unless mentioned otherwise, the fields should be Unicode strings.
  79
  80     Subclasses of this one should re-define the _real_initialize() and
  81     _real_extract() methods and define a _VALID_URL regexp.
  82     Probably, they should also be added to the list of extractors.
  83
  84     _real_extract() must return a *list* of information dictionaries as
  85     described above.
  86
  87     Finally, the _WORKING attribute should be set to False for broken IEs
  88     in order to warn the users and skip the tests.
  89     """
  90
  91     _ready = False
  92     _downloader = None
  93     _WORKING = True
  94
  95     def __init__(self, downloader=None):
  96         """Constructor. Receives an optional downloader."""
  97         self._ready = False
  98         self.set_downloader(downloader)
  99
 100     @classmethod
 101     def suitable(cls, url):
 102         """Receives a URL and returns True if suitable for this IE."""
 103
 104         # This does not use has/getattr intentionally - we want to know whether
 105         # we have cached the regexp for *this* class, whereas getattr would also
 106         # match the superclass
 107         if '_VALID_URL_RE' not in cls.__dict__:
 108             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 109         return cls._VALID_URL_RE.match(url) is not None
 110
 111     @classmethod
 112     def working(cls):
 113         """Getter method for _WORKING."""
 114         return cls._WORKING
 115
 116     def initialize(self):
 117         """Initializes an instance (authentication, etc)."""
 118         if not self._ready:
 119             self._real_initialize()
 120             self._ready = True
 121
 122     def extract(self, url):
 123         """Extracts URL information and returns it in list of dicts."""
 124         self.initialize()
 125         return self._real_extract(url)
 126
 127     def set_downloader(self, downloader):
 128         """Sets the downloader for this IE."""
 129         self._downloader = downloader
 130
 131     def _real_initialize(self):
 132         """Real initialization process. Redefine in subclasses."""
 133         pass
 134
 135     def _real_extract(self, url):
 136         """Real extraction process. Redefine in subclasses."""
 137         pass
 138
 139     @classmethod
 140     def ie_key(cls):
 141         """A string for getting the InfoExtractor with get_info_extractor"""
 142         return cls.__name__[:-2]
 143
 144     @property
 145     def IE_NAME(self):
 146         return type(self).__name__[:-2]
 147
 148     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
 149         """ Returns the response handle """
 150         if note is None:
 151             self.report_download_webpage(video_id)
 152         elif note is not False:
 153             self.to_screen(u'%s: %s' % (video_id, note))
 154         try:
 155             return compat_urllib_request.urlopen(url_or_request)
 156         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 157             if errnote is None:
 158                 errnote = u'Unable to download webpage'
 159             raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err)
 160
 161     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
 162         """ Returns a tuple (page content as string, URL handle) """
 163
 164         # Strip hashes from the URL (#1038)
 165         if isinstance(url_or_request, (compat_str, str)):
 166             url_or_request = url_or_request.partition('#')[0]
 167
 168         urlh = self._request_webpage(url_or_request, video_id, note, errnote)
 169         content_type = urlh.headers.get('Content-Type', '')
 170         webpage_bytes = urlh.read()
 171         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 172         if m:
 173             encoding = m.group(1)
 174         else:
 175             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 176                           webpage_bytes[:1024])
 177             if m:
 178                 encoding = m.group(1).decode('ascii')
 179             else:
 180                 encoding = 'utf-8'
 181         if self._downloader.params.get('dump_intermediate_pages', False):
 182             try:
 183                 url = url_or_request.get_full_url()
 184             except AttributeError:
 185                 url = url_or_request
 186             self.to_screen(u'Dumping request to ' + url)
 187             dump = base64.b64encode(webpage_bytes).decode('ascii')
 188             self._downloader.to_screen(dump)
 189         if self._downloader.params.get('write_pages', False):
 190             try:
 191                 url = url_or_request.get_full_url()
 192             except AttributeError:
 193                 url = url_or_request
 194             raw_filename = ('%s_%s.dump' % (video_id, url))
 195             filename = sanitize_filename(raw_filename, restricted=True)
 196             self.to_screen(u'Saving request to ' + filename)
 197             with open(filename, 'wb') as outf:
 198                 outf.write(webpage_bytes)
 199
 200         content = webpage_bytes.decode(encoding, 'replace')
 201         return (content, urlh)
 202
 203     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
 204         """ Returns the data of the page as a string """
 205         return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
 206
 207     def to_screen(self, msg):
 208         """Print msg to screen, prefixing it with '[ie_name]'"""
 209         self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
 210
 211     def report_extraction(self, id_or_name):
 212         """Report information extraction."""
 213         self.to_screen(u'%s: Extracting information' % id_or_name)
 214
 215     def report_download_webpage(self, video_id):
 216         """Report webpage download."""
 217         self.to_screen(u'%s: Downloading webpage' % video_id)
 218
 219     def report_age_confirmation(self):
 220         """Report attempt to confirm age."""
 221         self.to_screen(u'Confirming age')
 222
 223     def report_login(self):
 224         """Report attempt to log in."""
 225         self.to_screen(u'Logging in')
 226
 227     #Methods for following #608
 228     def url_result(self, url, ie=None):
 229         """Returns a url that points to a page that should be processed"""
 230         #TODO: ie should be the class used for getting the info
 231         video_info = {'_type': 'url',
 232                       'url': url,
 233                       'ie_key': ie}
 234         return video_info
 235     def playlist_result(self, entries, playlist_id=None, playlist_title=None):
 236         """Returns a playlist"""
 237         video_info = {'_type': 'playlist',
 238                       'entries': entries}
 239         if playlist_id:
 240             video_info['id'] = playlist_id
 241         if playlist_title:
 242             video_info['title'] = playlist_title
 243         return video_info
 244
 245     def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
 246         """
 247         Perform a regex search on the given string, using a single or a list of
 248         patterns returning the first matching group.
 249         In case of failure return a default value or raise a WARNING or a
 250         RegexNotFoundError, depending on fatal, specifying the field name.
 251         """
 252         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 253             mobj = re.search(pattern, string, flags)
 254         else:
 255             for p in pattern:
 256                 mobj = re.search(p, string, flags)
 257                 if mobj: break
 258
 259         if sys.stderr.isatty() and os.name != 'nt':
 260             _name = u'\033[0;34m%s\033[0m' % name
 261         else:
 262             _name = name
 263
 264         if mobj:
 265             # return the first matching group
 266             return next(g for g in mobj.groups() if g is not None)
 267         elif default is not None:
 268             return default
 269         elif fatal:
 270             raise RegexNotFoundError(u'Unable to extract %s' % _name)
 271         else:
 272             self._downloader.report_warning(u'unable to extract %s; '
 273                 u'please report this issue on http://yt-dl.org/bug' % _name)
 274             return None
 275
 276     def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
 277         """
 278         Like _search_regex, but strips HTML tags and unescapes entities.
 279         """
 280         res = self._search_regex(pattern, string, name, default, fatal, flags)
 281         if res:
 282             return clean_html(res).strip()
 283         else:
 284             return res
 285
 286     def _get_login_info(self):
 287         """
 288         Get the the login info as (username, password)
 289         It will look in the netrc file using the _NETRC_MACHINE value
 290         If there's no info available, return (None, None)
 291         """
 292         if self._downloader is None:
 293             return (None, None)
 294
 295         username = None
 296         password = None
 297         downloader_params = self._downloader.params
 298
 299         # Attempt to use provided username and password or .netrc data
 300         if downloader_params.get('username', None) is not None:
 301             username = downloader_params['username']
 302             password = downloader_params['password']
 303         elif downloader_params.get('usenetrc', False):
 304             try:
 305                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 306                 if info is not None:
 307                     username = info[0]
 308                     password = info[2]
 309                 else:
 310                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 311             except (IOError, netrc.NetrcParseError) as err:
 312                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
 313
 314         return (username, password)
 315
 316     # Helper functions for extracting OpenGraph info
 317     @staticmethod
 318     def _og_regex(prop):
 319         return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop)
 320
 321     def _og_search_property(self, prop, html, name=None, **kargs):
 322         if name is None:
 323             name = 'OpenGraph %s' % prop
 324         escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs)
 325         return unescapeHTML(escaped)
 326
 327     def _og_search_thumbnail(self, html, **kargs):
 328         return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
 329
 330     def _og_search_description(self, html, **kargs):
 331         return self._og_search_property('description', html, fatal=False, **kargs)
 332
 333     def _og_search_title(self, html, **kargs):
 334         return self._og_search_property('title', html, **kargs)
 335
 336     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 337         regexes = [self._og_regex('video')]
 338         if secure: regexes.insert(0, self._og_regex('video:secure_url'))
 339         return self._html_search_regex(regexes, html, name, **kargs)
 340
 341     def _rta_search(self, html):
 342         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 343         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 344                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 345                      html):
 346             return 18
 347         return 0
 348
 349
 350 class SearchInfoExtractor(InfoExtractor):
 351     """
 352     Base class for paged search queries extractors.
 353     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 354     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 355     """
 356
 357     @classmethod
 358     def _make_valid_url(cls):
 359         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 360
 361     @classmethod
 362     def suitable(cls, url):
 363         return re.match(cls._make_valid_url(), url) is not None
 364
 365     def _real_extract(self, query):
 366         mobj = re.match(self._make_valid_url(), query)
 367         if mobj is None:
 368             raise ExtractorError(u'Invalid search query "%s"' % query)
 369
 370         prefix = mobj.group('prefix')
 371         query = mobj.group('query')
 372         if prefix == '':
 373             return self._get_n_results(query, 1)
 374         elif prefix == 'all':
 375             return self._get_n_results(query, self._MAX_RESULTS)
 376         else:
 377             n = int(prefix)
 378             if n <= 0:
 379                 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
 380             elif n > self._MAX_RESULTS:
 381                 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 382                 n = self._MAX_RESULTS
 383             return self._get_n_results(query, n)
 384
 385     def _get_n_results(self, query, n):
 386         """Get a specified number of results for a query"""
 387         raise NotImplementedError("This method must be implemented by subclasses")
 388
 389     @property
 390     def SEARCH_KEY(self):
 391         return self._SEARCH_KEY