import base64
import os
import re
import socket
import sys
import netrc
import xml.etree.ElementTree

from ..utils import (
    compat_http_client,
    compat_urllib_error,
    compat_urllib_parse_urlparse,
    compat_str,

    clean_html,
    compiled_regex_type,
    ExtractorError,
    RegexNotFoundError,
    sanitize_filename,
    unescapeHTML,
)
_NO_DEFAULT = object()


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information, possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from url if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * vbr        Average video bitrate in KBit/s
                    * vcodec     Name of the video codec in use
                    * filesize   The number of bytes, if known in advance
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp" or so.
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    duration:       Length of the video in seconds, as an integer.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage; if given to youtube-dl it
                    should allow getting the same result again. (It will be set
                    by YoutubeDL if it's missing)

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None
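
    # For example, a subclass defining a (hypothetical) pattern such as
    #     _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>[0-9]+)'
    # would make suitable() return True for
    # 'http://www.example.com/watch/12345' and False for unrelated URLs.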

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in a list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen(u'%s' % (note,))
            else:
                self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is False:
                return False
            if errnote is None:
                errnote = u'Unable to download webpage'
            errmsg = u'%s: %s' % (errnote, compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self._downloader.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns a tuple (page content as string, URL handle) """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
        if urlh is False:
            assert not fatal
            return False
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            else:
                encoding = 'utf-8'
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            raw_filename = ('%s_%s.dump' % (video_id, url))
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen(u'Saving request to ' + filename)
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the data of the page as a string """
        res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
        if res is False:
            return res
        else:
            content, _ = res
            return content
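
    # Typical use inside a subclass's _real_extract() (the URL and pattern
    # below are hypothetical; the helpers themselves are defined in this
    # class):
    #     webpage = self._download_webpage(url, video_id)
    #     title = self._html_search_regex(
    #         r'<h1[^>]*>(.+?)</h1>', webpage, u'title')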

    def _download_xml(self, url_or_request, video_id,
                      note=u'Downloading XML', errnote=u'Unable to download XML',
                      transform_source=None):
        """Return the xml as an xml.etree.ElementTree.Element"""
        xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
        if transform_source:
            xml_string = transform_source(xml_string)
        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))

    def report_warning(self, msg, video_id=None):
        idstr = u'' if video_id is None else u'%s: ' % video_id
        self._downloader.report_warning(
            u'[%s] %s%s' % (self.IE_NAME, idstr, msg))

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        return video_info

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info
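
    # Sketch of how a subclass might hand a URL back to the downloader
    # (the URLs, ie key and ids below are hypothetical):
    #     return self.url_result('http://example.com/watch/12345', ie='Example')
    # or, for a page listing several videos:
    #     return self.playlist_result(
    #         [self.url_result(u) for u in video_urls],
    #         playlist_id='chan1', playlist_title=u'Channel 1 uploads')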

    def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single pattern or a
        list of patterns, and return the first matching group.
        In case of failure, return the default value if one was supplied;
        otherwise raise a RegexNotFoundError or report a warning, depending on
        fatal, naming the missing field.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        if os.name != 'nt' and sys.stderr.isatty():
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not _NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on http://yt-dl.org/bug' % _name)
            return None
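
    # Example call (hypothetical pattern and input): the first capturing group
    # of the first matching pattern is returned, so
    #     self._search_regex(
    #         r'"videoId"\s*:\s*"([0-9a-f]+)"', webpage, u'video id',
    #         default=None)
    # yields the id string, or None (without raising) if nothing matches.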

    def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        return (username, password)
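
    # With --netrc, credentials for an extractor whose _NETRC_MACHINE is, say,
    # 'example' would be read from a ~/.netrc entry like (made-up values):
    #     machine example login myuser password hunter2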

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
        property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    def _og_search_property(self, prop, html, name=None, **kargs):
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)
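
    # The helpers above match standard OpenGraph <meta> tags in either
    # attribute order, e.g. on a (made-up) page containing
    #     <meta property="og:title" content="Example title" />
    #     <meta content="http://example.com/thumb.jpg" property="og:image" />
    # self._og_search_title(webpage) would return u'Example title' and
    # self._og_search_thumbnail(webpage) u'http://example.com/thumb.jpg'.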

    def _html_search_meta(self, name, html, display_name=None):
        if display_name is None:
            display_name = name
        return self._html_search_regex(
            r'''(?ix)<meta
                    (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
                    [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
            html, display_name, fatal=False)
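
    # This matches <meta> tags keyed by name, property or itemprop, e.g.
    # (made-up markup):
    #     <meta name="description" content="An example description">
    # so self._html_search_meta('description', webpage) would return
    # u'An example description'.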

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r' content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower(), None)

    def _sort_formats(self, formats):
        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            preference = f.get('preference')
            if preference is None:
                proto = f.get('protocol')
                if proto is None:
                    proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

                preference = 0 if proto in ['http', 'https'] else -0.1
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            if f.get('vcodec') == 'none':  # audio only
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
                else:
                    ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = [u'flv', u'mp4', u'webm']
                else:
                    ORDER = [u'webm', u'flv', u'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            return (
                preference,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                ext_preference,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('format_id'),
            )
        formats.sort(key=_formats_key)
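
    # Typical use (hypothetical values): build the formats list, then let this
    # helper order it from worst to best before returning the info dict:
    #     formats = [
    #         {'url': 'http://example.com/hd.mp4', 'ext': 'mp4', 'height': 720},
    #         {'url': 'http://example.com/sd.flv', 'ext': 'flv', 'height': 360},
    #     ]
    #     self._sort_formats(formats)
    #     return {'id': video_id, 'title': title, 'formats': formats}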


class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search query extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
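
    # For a subclass with (for instance) _SEARCH_KEY = 'examplesearch', the
    # generated pattern accepts queries such as:
    #     examplesearch:cute cats       -> first result only
    #     examplesearch5:cute cats      -> first 5 results
    #     examplesearchall:cute cats    -> up to _MAX_RESULTS results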

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY
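

# A minimal sketch of a SearchInfoExtractor subclass (every name, URL and
# pattern below is hypothetical; a real extractor would also be added to the
# extractor list):
#
#     class ExampleSearchIE(SearchInfoExtractor):
#         _SEARCH_KEY = 'examplesearch'
#         _MAX_RESULTS = 50
#         IE_NAME = 'example:search'
#
#         def _get_n_results(self, query, n):
#             result_url = 'http://example.com/search?q=' + query
#             webpage = self._download_webpage(
#                 result_url, u'query "%s"' % query)
#             paths = re.findall(r'href="(/watch/[0-9]+)"', webpage)[:n]
#             entries = [
#                 self.url_result('http://example.com' + path)
#                 for path in paths]
#             return self.playlist_result(entries, playlist_title=query)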