]> jfr.im git - yt-dlp.git/blob - youtube_dl/extractor/common.py
69cdcdc1b5b75d1cac5733b34565f087c9dcddec
[yt-dlp.git] / youtube_dl / extractor / common.py
1 import base64
2 import os
3 import re
4 import socket
5 import sys
6 import netrc
7
8 from ..utils import (
9 compat_http_client,
10 compat_urllib_error,
11 compat_urllib_request,
12 compat_str,
13
14 clean_html,
15 compiled_regex_type,
16 ExtractorError,
17 unescapeHTML,
18 )
19
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Instead of url and ext, formats can also be specified.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    view_count:     How many users have watched the video on the platform.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen
    formats:        A list of dictionaries for each format available, it must
                    be ordered from worst to best quality. Potential fields:
                    * url       Mandatory. The URL of the video file
                    * ext       Will be calculated from url if missing
                    * format    A human-readable description of the format
                                ("mp4 container with h264/opus").
                                Calculated from width and height if missing.
                    * format_id A short description of the format
                                ("mp4_h264_opus" or "19")
                    * width     Width of the video, if known
                    * height    Height of the video, if known

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Whether _real_initialize() has run for this instance.
    _ready = False
    # The FileDownloader this IE reports to; set via set_downloader().
    _downloader = None
    # Set to False in subclasses for broken IEs (warns users, skips tests).
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        # Strips the conventional 'IE' suffix from the class name.
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        # Same convention as ie_key(): class name minus the 'IE' suffix.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            # note=False suppresses the status line entirely.
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err)

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        # Prefer the charset declared in the Content-Type header...
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            # ...then a <meta charset> in the first KiB of the body...
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            else:
                # ...falling back to UTF-8.
                encoding = 'utf-8'
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        # 'replace' so that undecodable bytes never abort the extraction.
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    # Methods for following #608
    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        ExtractorError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            # Try each pattern in turn, stopping at the first match.
            # Initialize mobj so an empty pattern list does not raise
            # NameError below.
            mobj = None
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        # Highlight the field name in blue when writing to a terminal
        # (ANSI escapes; skipped on Windows consoles).
        if sys.stderr.isatty() and os.name != 'nt':
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not None:
            return default
        elif fatal:
            raise ExtractorError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on http://yt-dl.org/bug' % _name)
            return None

    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # Best-effort: a bad .netrc only produces a warning.
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        return (username, password)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regex(prop):
        # Matches <meta property="og:PROP" content="..."> with either quote
        # style around the content value.
        return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        """Search html for the given OpenGraph property, unescaping entities."""
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs)
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', **kargs):
        # Prefer the HTTPS variant when both are present.
        return self._html_search_regex([self._og_regex('video:secure_url'),
                                        self._og_regex('video')],
                                       html, name, **kargs)
320
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # prefix is empty (default: 1 result), a positive integer, or 'all'
        # (meaning _MAX_RESULTS); everything after the colon is the query.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Parse the search query prefix and delegate to _get_n_results()."""
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp to the extractor's maximum, but tell the user.
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # Fixed typo in the error message: "sublclasses" -> "subclasses".
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY