]> jfr.im git - yt-dlp.git/blob - youtube_dl/extractor/common.py
Move the opener to the YoutubeDL object.
[yt-dlp.git] / youtube_dl / extractor / common.py
1 import base64
2 import os
3 import re
4 import socket
5 import sys
6 import netrc
7
8 from ..utils import (
9 compat_http_client,
10 compat_urllib_error,
11 compat_str,
12
13 clean_html,
14 compiled_regex_type,
15 ExtractorError,
16 RegexNotFoundError,
17 sanitize_filename,
18 unescapeHTML,
19 )
20
21
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Instead of url and ext, formats can also be specified.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    view_count:     How many users have watched the video on the platform.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen
    age_limit:      Age restriction for the video, as an integer (years)
    formats:        A list of dictionaries for each format available, it must
                    be ordered from worst to best quality. Potential fields:
                    * url         Mandatory. The URL of the video file
                    * ext         Will be calculated from url if missing
                    * format      A human-readable description of the format
                                  ("mp4 container with h264/opus").
                                  Calculated from the format_id, width, height.
                                  and format_note fields if missing.
                    * format_id   A short description of the format
                                  ("mp4_h264_opus" or "19")
                    * format_note Additional info about the format
                                  ("3D" or "DASH video")
                    * width       Width of the video, if known
                    * height      Height of the video, if known
                    * abr         Average audio bitrate in KBit/s
                    * acodec      Name of the audio codec in use
                    * vbr         Average video bitrate in KBit/s
                    * vcodec      Name of the video codec in use
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Per-instance flag: True once _real_initialize() has run (see initialize()).
    _ready = False
    # The downloader (YoutubeDL/FileDownloader) instance serving this IE;
    # set via set_downloader().
    _downloader = None
    # Class-level flag; subclasses set it to False to mark a broken extractor.
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Idempotent: _real_initialize() runs at most once per instance.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        # Class names conventionally end in "IE"; strip that suffix.
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        """Human-readable extractor name: the class name without the "IE" suffix."""
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # note=None -> default "Downloading webpage" message;
        # note=False -> print nothing; any other value is shown verbatim.
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # Second positional argument carries the current traceback;
            # the original network error is preserved as the cause.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err)

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        # Encoding detection, in priority order:
        #   1. charset= parameter of the Content-Type header,
        #   2. <meta ... charset=...> within the first 1024 bytes of the body,
        #   3. utf-8 fallback.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            else:
                encoding = 'utf-8'
        if self._downloader.params.get('dump_intermediate_pages', False):
            # url_or_request may be a Request object or a plain string.
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            # base64 keeps arbitrary (possibly binary) page bytes printable.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            raw_filename = ('%s_%s.dump' % (video_id, url))
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen(u'Saving request to ' + filename)
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        # 'replace' avoids raising on bytes invalid in the detected encoding.
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    #Methods for following #608
    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info
    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        # id/title are only attached when provided (truthy).
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            # NOTE(review): if pattern is an *empty* sequence, mobj is never
            # assigned and the check below raises UnboundLocalError — confirm
            # callers never pass an empty list.
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj: break

        # Colorize the field name (ANSI blue) only on a tty and not on Windows.
        if sys.stderr.isatty() and os.name != 'nt':
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not None:
            return default
        elif fatal:
            raise RegexNotFoundError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on http://yt-dl.org/bug' % _name)
            return None

    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # Best-effort: a bad/missing .netrc only warns, never aborts.
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        return (username, password)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        """Return regexes matching <meta property="og:PROP" content=...> with
        the property/content attributes in either order."""
        # NOTE(review): the double-quoted branch uses [^>]+? so a content value
        # containing '>' is truncated — confirm this is acceptable.
        content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
        property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    def _og_search_property(self, prop, html, name=None, **kargs):
        """Search html for the OpenGraph property prop and return its
        HTML-unescaped content, or None when absent (with fatal=False)."""
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        """Return the og:image URL, or None (non-fatal)."""
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        """Return the og:description content, or None (non-fatal)."""
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        """Return the og:title content; fatal by default."""
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        """Return the og:video URL, preferring og:video:secure_url when
        secure=True."""
        regexes = self._og_regexes('video')
        if secure: regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _html_search_meta(self, name, html, display_name=None):
        """Return the content of a <meta name=...> / <meta property=...> tag,
        or None (non-fatal)."""
        if display_name is None:
            display_name = name
        return self._html_search_regex(
            r'''(?ix)<meta(?=[^>]+(?:name|property)=["\']%s["\'])
                    [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
            html, display_name, fatal=False)

    def _dc_search_uploader(self, html):
        """Return the Dublin Core creator (dc.creator meta tag), or None."""
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        """Return 18 if the page carries the RTA adult-content label, else 0."""
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        """Map the "rating" meta tag to an age limit in years, or None."""
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower(), None)
387
388
389
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Build the regex accepted by this search IE: the search key, an
        optional count prefix ('' / positive integer / 'all'), then the query."""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """A URL is suitable iff it matches the search-query pattern."""
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Parse the count prefix out of the search query and dispatch to
        _get_n_results with the requested (clamped) number of results."""
        match = re.match(self._make_valid_url(), query)
        if match is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        count_spec = match.group('prefix')
        terms = match.group('query')

        # Bare search key -> exactly one result.
        if count_spec == '':
            return self._get_n_results(terms, 1)
        # 'all' -> as many results as this IE supports.
        if count_spec == 'all':
            return self._get_n_results(terms, self._MAX_RESULTS)

        # Otherwise the prefix is a (decimal) result count.
        requested = int(count_spec)
        if requested <= 0:
            raise ExtractorError(u'invalid download number %s for query "%s"' % (requested, terms))
        if requested > self._MAX_RESULTS:
            # Clamp to the maximum, but tell the user we did so.
            self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, requested))
            requested = self._MAX_RESULTS
        return self._get_n_results(terms, requested)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        """Public accessor for the class's _SEARCH_KEY."""
        return self._SEARCH_KEY