youtube_dl/extractor/common.py

   1 import base64
   2 import os
   3 import re
   4 import socket
   5 import sys
   6 import netrc
   7 import xml.etree.ElementTree
   8
   9 from ..utils import (
  10     compat_http_client,
  11     compat_urllib_error,
  12     compat_str,
  13
  14     clean_html,
  15     compiled_regex_type,
  16     ExtractorError,
  17     RegexNotFoundError,
  18     sanitize_filename,
  19     unescapeHTML,
  20 )
  21
  22
  23 class InfoExtractor(object):
  24     """Information Extractor class.
  25
  26     Information extractors are the classes that, given a URL, extract
  27     information about the video (or videos) the URL refers to. This
  28     information includes the real video URL, the video title, author and
  29     others. The information is stored in a dictionary which is then
  30     passed to the FileDownloader. The FileDownloader processes this
  31     information possibly downloading the video to the file system, among
  32     other possible outcomes.
  33
  34     The dictionaries must include the following fields:
  35
  36     id:             Video identifier.
  37     title:          Video title, unescaped.
  38     url:            Final video URL.
  39     ext:            Video filename extension.
  40
  41     Instead of url and ext, formats can also specified.
  42
  43     The following fields are optional:
  44
  45     format:         The video format, defaults to ext (used for --get-format)
  46     thumbnails:     A list of dictionaries (with the entries "resolution" and
  47                     "url") for the varying thumbnails
  48     thumbnail:      Full URL to a video thumbnail image.
  49     description:    One-line video description.
  50     uploader:       Full name of the video uploader.
  51     upload_date:    Video upload date (YYYYMMDD).
  52     uploader_id:    Nickname or id of the video uploader.
  53     location:       Physical location of the video.
  54     player_url:     SWF Player URL (used for rtmpdump).
  55     subtitles:      The subtitle file contents as a dictionary in the format
  56                     {language: subtitles}.
  57     duration:       Length of the video in seconds, as an integer.
  58     view_count:     How many users have watched the video on the platform.
  59     like_count:     Number of positive ratings of the video
  60     dislike_count:  Number of negative ratings of the video
  61     comment_count:  Number of comments on the video
  62     urlhandle:      [internal] The urlHandle to be used to download the file,
  63                     like returned by urllib.request.urlopen
  64     age_limit:      Age restriction for the video, as an integer (years)
  65     formats:        A list of dictionaries for each format available, it must
  66                     be ordered from worst to best quality. Potential fields:
  67                     * url       Mandatory. The URL of the video file
  68                     * ext       Will be calculated from url if missing
  69                     * format    A human-readable description of the format
  70                                 ("mp4 container with h264/opus").
  71                                 Calculated from the format_id, width, height.
  72                                 and format_note fields if missing.
  73                     * format_id A short description of the format
  74                                 ("mp4_h264_opus" or "19")
  75                     * format_note Additional info about the format
  76                                 ("3D" or "DASH video")
  77                     * width     Width of the video, if known
  78                     * height    Height of the video, if known
  79                     * abr       Average audio bitrate in KBit/s
  80                     * acodec    Name of the audio codec in use
  81                     * vbr       Average video bitrate in KBit/s
  82                     * vcodec    Name of the video codec in use
  83                     * filesize  The number of bytes, if known in advance
  84     webpage_url:    The url to the video webpage, if given to youtube-dl it
  85                     should allow to get the same result again. (It will be set
  86                     by YoutubeDL if it's missing)
  87
  88     Unless mentioned otherwise, the fields should be Unicode strings.
  89
  90     Subclasses of this one should re-define the _real_initialize() and
  91     _real_extract() methods and define a _VALID_URL regexp.
  92     Probably, they should also be added to the list of extractors.
  93
  94     _real_extract() must return a *list* of information dictionaries as
  95     described above.
  96
  97     Finally, the _WORKING attribute should be set to False for broken IEs
  98     in order to warn the users and skip the tests.
  99     """
 100
 101     _ready = False
 102     _downloader = None
 103     _WORKING = True
 104
 105     def __init__(self, downloader=None):
 106         """Constructor. Receives an optional downloader."""
 107         self._ready = False
 108         self.set_downloader(downloader)
 109
 110     @classmethod
 111     def suitable(cls, url):
 112         """Receives a URL and returns True if suitable for this IE."""
 113
 114         # This does not use has/getattr intentionally - we want to know whether
 115         # we have cached the regexp for *this* class, whereas getattr would also
 116         # match the superclass
 117         if '_VALID_URL_RE' not in cls.__dict__:
 118             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 119         return cls._VALID_URL_RE.match(url) is not None
 120
 121     @classmethod
 122     def working(cls):
 123         """Getter method for _WORKING."""
 124         return cls._WORKING
 125
 126     def initialize(self):
 127         """Initializes an instance (authentication, etc)."""
 128         if not self._ready:
 129             self._real_initialize()
 130             self._ready = True
 131
 132     def extract(self, url):
 133         """Extracts URL information and returns it in list of dicts."""
 134         self.initialize()
 135         return self._real_extract(url)
 136
 137     def set_downloader(self, downloader):
 138         """Sets the downloader for this IE."""
 139         self._downloader = downloader
 140
 141     def _real_initialize(self):
 142         """Real initialization process. Redefine in subclasses."""
 143         pass
 144
 145     def _real_extract(self, url):
 146         """Real extraction process. Redefine in subclasses."""
 147         pass
 148
 149     @classmethod
 150     def ie_key(cls):
 151         """A string for getting the InfoExtractor with get_info_extractor"""
 152         return cls.__name__[:-2]
 153
 154     @property
 155     def IE_NAME(self):
 156         return type(self).__name__[:-2]
 157
 158     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 159         """ Returns the response handle """
 160         if note is None:
 161             self.report_download_webpage(video_id)
 162         elif note is not False:
 163             if video_id is None:
 164                 self.to_screen(u'%s' % (note,))
 165             else:
 166                 self.to_screen(u'%s: %s' % (video_id, note))
 167         try:
 168             return self._downloader.urlopen(url_or_request)
 169         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 170             if errnote is None:
 171                 errnote = u'Unable to download webpage'
 172             errmsg = u'%s: %s' % (errnote, compat_str(err))
 173             if fatal:
 174                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 175             else:
 176                 self._downloader.report_warning(errmsg)
 177                 return False
 178
 179     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 180         """ Returns a tuple (page content as string, URL handle) """
 181
 182         # Strip hashes from the URL (#1038)
 183         if isinstance(url_or_request, (compat_str, str)):
 184             url_or_request = url_or_request.partition('#')[0]
 185
 186         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 187         if urlh is False:
 188             assert not fatal
 189             return False
 190         content_type = urlh.headers.get('Content-Type', '')
 191         webpage_bytes = urlh.read()
 192         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 193         if m:
 194             encoding = m.group(1)
 195         else:
 196             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 197                           webpage_bytes[:1024])
 198             if m:
 199                 encoding = m.group(1).decode('ascii')
 200             else:
 201                 encoding = 'utf-8'
 202         if self._downloader.params.get('dump_intermediate_pages', False):
 203             try:
 204                 url = url_or_request.get_full_url()
 205             except AttributeError:
 206                 url = url_or_request
 207             self.to_screen(u'Dumping request to ' + url)
 208             dump = base64.b64encode(webpage_bytes).decode('ascii')
 209             self._downloader.to_screen(dump)
 210         if self._downloader.params.get('write_pages', False):
 211             try:
 212                 url = url_or_request.get_full_url()
 213             except AttributeError:
 214                 url = url_or_request
 215             raw_filename = ('%s_%s.dump' % (video_id, url))
 216             filename = sanitize_filename(raw_filename, restricted=True)
 217             self.to_screen(u'Saving request to ' + filename)
 218             with open(filename, 'wb') as outf:
 219                 outf.write(webpage_bytes)
 220
 221         content = webpage_bytes.decode(encoding, 'replace')
 222         return (content, urlh)
 223
 224     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 225         """ Returns the data of the page as a string """
 226         res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
 227         if res is False:
 228             return res
 229         else:
 230             content, _ = res
 231             return content
 232
 233     def _download_xml(self, url_or_request, video_id,
 234                       note=u'Downloading XML', errnote=u'Unable to download XML',
 235                       transform_source=None):
 236         """Return the xml as an xml.etree.ElementTree.Element"""
 237         xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
 238         if transform_source:
 239             xml_string = transform_source(xml_string)
 240         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
 241
 242     def to_screen(self, msg):
 243         """Print msg to screen, prefixing it with '[ie_name]'"""
 244         self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
 245
 246     def report_extraction(self, id_or_name):
 247         """Report information extraction."""
 248         self.to_screen(u'%s: Extracting information' % id_or_name)
 249
 250     def report_download_webpage(self, video_id):
 251         """Report webpage download."""
 252         self.to_screen(u'%s: Downloading webpage' % video_id)
 253
 254     def report_age_confirmation(self):
 255         """Report attempt to confirm age."""
 256         self.to_screen(u'Confirming age')
 257
 258     def report_login(self):
 259         """Report attempt to log in."""
 260         self.to_screen(u'Logging in')
 261
 262     #Methods for following #608
 263     def url_result(self, url, ie=None, video_id=None):
 264         """Returns a url that points to a page that should be processed"""
 265         #TODO: ie should be the class used for getting the info
 266         video_info = {'_type': 'url',
 267                       'url': url,
 268                       'ie_key': ie}
 269         if video_id is not None:
 270             video_info['id'] = video_id
 271         return video_info
 272     def playlist_result(self, entries, playlist_id=None, playlist_title=None):
 273         """Returns a playlist"""
 274         video_info = {'_type': 'playlist',
 275                       'entries': entries}
 276         if playlist_id:
 277             video_info['id'] = playlist_id
 278         if playlist_title:
 279             video_info['title'] = playlist_title
 280         return video_info
 281
 282     def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
 283         """
 284         Perform a regex search on the given string, using a single or a list of
 285         patterns returning the first matching group.
 286         In case of failure return a default value or raise a WARNING or a
 287         RegexNotFoundError, depending on fatal, specifying the field name.
 288         """
 289         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 290             mobj = re.search(pattern, string, flags)
 291         else:
 292             for p in pattern:
 293                 mobj = re.search(p, string, flags)
 294                 if mobj: break
 295
 296         if sys.stderr.isatty() and os.name != 'nt':
 297             _name = u'\033[0;34m%s\033[0m' % name
 298         else:
 299             _name = name
 300
 301         if mobj:
 302             # return the first matching group
 303             return next(g for g in mobj.groups() if g is not None)
 304         elif default is not None:
 305             return default
 306         elif fatal:
 307             raise RegexNotFoundError(u'Unable to extract %s' % _name)
 308         else:
 309             self._downloader.report_warning(u'unable to extract %s; '
 310                 u'please report this issue on http://yt-dl.org/bug' % _name)
 311             return None
 312
 313     def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
 314         """
 315         Like _search_regex, but strips HTML tags and unescapes entities.
 316         """
 317         res = self._search_regex(pattern, string, name, default, fatal, flags)
 318         if res:
 319             return clean_html(res).strip()
 320         else:
 321             return res
 322
 323     def _get_login_info(self):
 324         """
 325         Get the the login info as (username, password)
 326         It will look in the netrc file using the _NETRC_MACHINE value
 327         If there's no info available, return (None, None)
 328         """
 329         if self._downloader is None:
 330             return (None, None)
 331
 332         username = None
 333         password = None
 334         downloader_params = self._downloader.params
 335
 336         # Attempt to use provided username and password or .netrc data
 337         if downloader_params.get('username', None) is not None:
 338             username = downloader_params['username']
 339             password = downloader_params['password']
 340         elif downloader_params.get('usenetrc', False):
 341             try:
 342                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 343                 if info is not None:
 344                     username = info[0]
 345                     password = info[2]
 346                 else:
 347                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 348             except (IOError, netrc.NetrcParseError) as err:
 349                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
 350
 351         return (username, password)
 352
 353     # Helper functions for extracting OpenGraph info
 354     @staticmethod
 355     def _og_regexes(prop):
 356         content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
 357         property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
 358         template = r'<meta[^>]+?%s[^>]+?%s'
 359         return [
 360             template % (property_re, content_re),
 361             template % (content_re, property_re),
 362         ]
 363
 364     def _og_search_property(self, prop, html, name=None, **kargs):
 365         if name is None:
 366             name = 'OpenGraph %s' % prop
 367         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 368         if escaped is None:
 369             return None
 370         return unescapeHTML(escaped)
 371
 372     def _og_search_thumbnail(self, html, **kargs):
 373         return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
 374
 375     def _og_search_description(self, html, **kargs):
 376         return self._og_search_property('description', html, fatal=False, **kargs)
 377
 378     def _og_search_title(self, html, **kargs):
 379         return self._og_search_property('title', html, **kargs)
 380
 381     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 382         regexes = self._og_regexes('video')
 383         if secure: regexes = self._og_regexes('video:secure_url') + regexes
 384         return self._html_search_regex(regexes, html, name, **kargs)
 385
 386     def _html_search_meta(self, name, html, display_name=None):
 387         if display_name is None:
 388             display_name = name
 389         return self._html_search_regex(
 390             r'''(?ix)<meta
 391                     (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
 392                     [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
 393             html, display_name, fatal=False)
 394
 395     def _dc_search_uploader(self, html):
 396         return self._html_search_meta('dc.creator', html, 'uploader')
 397
 398     def _rta_search(self, html):
 399         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 400         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 401                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 402                      html):
 403             return 18
 404         return 0
 405
 406     def _media_rating_search(self, html):
 407         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 408         rating = self._html_search_meta('rating', html)
 409
 410         if not rating:
 411             return None
 412
 413         RATING_TABLE = {
 414             'safe for kids': 0,
 415             'general': 8,
 416             '14 years': 14,
 417             'mature': 17,
 418             'restricted': 19,
 419         }
 420         return RATING_TABLE.get(rating.lower(), None)
 421
 422
 423
 424 class SearchInfoExtractor(InfoExtractor):
 425     """
 426     Base class for paged search queries extractors.
 427     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 428     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 429     """
 430
 431     @classmethod
 432     def _make_valid_url(cls):
 433         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 434
 435     @classmethod
 436     def suitable(cls, url):
 437         return re.match(cls._make_valid_url(), url) is not None
 438
 439     def _real_extract(self, query):
 440         mobj = re.match(self._make_valid_url(), query)
 441         if mobj is None:
 442             raise ExtractorError(u'Invalid search query "%s"' % query)
 443
 444         prefix = mobj.group('prefix')
 445         query = mobj.group('query')
 446         if prefix == '':
 447             return self._get_n_results(query, 1)
 448         elif prefix == 'all':
 449             return self._get_n_results(query, self._MAX_RESULTS)
 450         else:
 451             n = int(prefix)
 452             if n <= 0:
 453                 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
 454             elif n > self._MAX_RESULTS:
 455                 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 456                 n = self._MAX_RESULTS
 457             return self._get_n_results(query, n)
 458
 459     def _get_n_results(self, query, n):
 460         """Get a specified number of results for a query"""
 461         raise NotImplementedError("This method must be implemented by subclasses")
 462
 463     @property
 464     def SEARCH_KEY(self):
 465         return self._SEARCH_KEY