youtube_dl/extractor/common.py
import base64
import os
import re
import socket
import sys
import netrc

from ..utils import (
    compat_http_client,
    compat_urllib_error,
    compat_urllib_request,
    compat_str,

    clean_html,
    compiled_regex_type,
    ExtractorError,
    unescapeHTML,
)

class InfoExtractor(object):
21 """Information Extractor class.
22
23 Information extractors are the classes that, given a URL, extract
24 information about the video (or videos) the URL refers to. This
25 information includes the real video URL, the video title, author and
26 others. The information is stored in a dictionary which is then
27 passed to the FileDownloader. The FileDownloader processes this
28 information possibly downloading the video to the file system, among
29 other possible outcomes.
30
31 The dictionaries must include the following fields:
32
33 id: Video identifier.
34 url: Final video URL.
35 title: Video title, unescaped.
36 ext: Video filename extension.
37
38 Instead of url and ext, formats can also specified.
39
40 The following fields are optional:
41
42 format: The video format, defaults to ext (used for --get-format)
43 thumbnails: A list of dictionaries (with the entries "resolution" and
44 "url") for the varying thumbnails
45 thumbnail: Full URL to a video thumbnail image.
46 description: One-line video description.
47 uploader: Full name of the video uploader.
48 upload_date: Video upload date (YYYYMMDD).
49 uploader_id: Nickname or id of the video uploader.
50 location: Physical location of the video.
51 player_url: SWF Player URL (used for rtmpdump).
52 subtitles: The subtitle file contents as a dictionary in the format
53 {language: subtitles}.
54 view_count: How many users have watched the video on the platform.
55 urlhandle: [internal] The urlHandle to be used to download the file,
56 like returned by urllib.request.urlopen
57 age_limit: Age restriction for the video, as an integer (years)
58 formats: A list of dictionaries for each format available, it must
59 be ordered from worst to best quality. Potential fields:
60 * url Mandatory. The URL of the video file
61 * ext Will be calculated from url if missing
62 * format A human-readable description of the format
63 ("mp4 container with h264/opus").
64 Calculated from width and height if missing.
65 * format_id A short description of the format
66 ("mp4_h264_opus" or "19")
67 * width Width of the video, if known
68 * height Height of the video, if known
69
70 Unless mentioned otherwise, the fields should be Unicode strings.
71
72 Subclasses of this one should re-define the _real_initialize() and
73 _real_extract() methods and define a _VALID_URL regexp.
74 Probably, they should also be added to the list of extractors.
75
76 _real_extract() must return a *list* of information dictionaries as
77 described above.
78
79 Finally, the _WORKING attribute should be set to False for broken IEs
80 in order to warn the users and skip the tests.
81 """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err)

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            else:
                encoding = 'utf-8'
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    # Methods for following #608
    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single pattern or a
        list of patterns, returning the first matching group.
        In case of failure return a default value or raise a WARNING or an
        ExtractorError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        if sys.stderr.isatty() and os.name != 'nt':
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not None:
            return default
        elif fatal:
            raise ExtractorError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on http://yt-dl.org/bug' % _name)
            return None

    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        return (username, password)
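
    # For reference: with the --netrc option (the 'usenetrc' param above), the
    # lookup uses the standard ~/.netrc format, matching the extractor's
    # _NETRC_MACHINE value as the machine name. A hypothetical entry:
    #
    #     machine example login myuser password mypass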

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regex(prop):
        return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs)
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', **kargs):
        return self._html_search_regex([self._og_regex('video:secure_url'),
                                        self._og_regex('video')],
                                       html, name, **kargs)

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

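# --- Illustration (not part of the original module) ---------------------------
# A minimal sketch of a concrete extractor built on InfoExtractor, following
# the class docstring above: define _VALID_URL, implement _real_extract(), and
# return a list of info dictionaries with the mandatory id/url/title/ext
# fields. ExampleIE, example.com and the "file" regex are hypothetical; real
# extractors live in their own modules and are added to the extractors list.
class ExampleIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Fetch the page; progress is reported under the extractor's IE_NAME.
        webpage = self._download_webpage(url, video_id)

        # The OpenGraph helpers and _search_regex cover the common cases.
        title = self._og_search_title(webpage)
        video_url = self._search_regex(
            r'"file"\s*:\s*"([^"]+)"', webpage, u'video URL')

        return [{
            'id':        video_id,
            'url':       video_url,
            'title':     title,
            'ext':       u'mp4',
            'thumbnail': self._og_search_thumbnail(webpage),
            'age_limit': self._rta_search(webpage),
        }]
# -------------------------------------------------------------------------------
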
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search query extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY
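

# --- Illustration (not part of the original module) ---------------------------
# A minimal sketch of a SearchInfoExtractor subclass, assuming a hypothetical
# 'examplesearch' key. Per the class docstring, queries then take the form
# "examplesearch:kittens", "examplesearch5:kittens" or "examplesearchall:kittens";
# _real_extract() parses the prefix and delegates to _get_n_results().
class ExampleSearchIE(SearchInfoExtractor):
    _SEARCH_KEY = 'examplesearch'
    _MAX_RESULTS = 50

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query (sketch only)."""
        # A real subclass would query the site's search API here; this
        # placeholder simply wraps made-up result URLs with url_result() so
        # that the FileDownloader hands them back to the matching extractor,
        # and bundles them into a playlist via playlist_result().
        result_urls = ['http://example.com/watch/%d' % i for i in range(n)]
        entries = [self.url_result(u, ie='Example') for u in result_urls]
        return self.playlist_result(entries, playlist_title=query)
# -------------------------------------------------------------------------------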