# youtube_dl/extractor/common.py
1 import base64
2 import hashlib
3 import json
4 import os
5 import re
6 import socket
7 import sys
8 import netrc
9 import xml.etree.ElementTree
10
11 from ..utils import (
12 compat_http_client,
13 compat_urllib_error,
14 compat_urllib_parse_urlparse,
15 compat_str,
16
17 clean_html,
18 compiled_regex_type,
19 ExtractorError,
20 RegexNotFoundError,
21 sanitize_filename,
22 unescapeHTML,
23 )
# Sentinel for "no default supplied" in _search_regex and friends; a plain
# object() is used because None is itself a legitimate default value.
_NO_DEFAULT = object()
25
26
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from url if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height.
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * vcodec     Name of the video codec in use
                    * filesize   The number of bytes, if known in advance
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp" or so.
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    duration:       Length of the video in seconds, as an integer.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # True once _real_initialize() has run for this instance (see initialize()).
    _ready = False
    # The FileDownloader/YoutubeDL instance; set through set_downloader().
    _downloader = None
    # Subclasses set this to False to mark a broken extractor (skips tests).
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)
132 @classmethod
133 def suitable(cls, url):
134 """Receives a URL and returns True if suitable for this IE."""
135
136 # This does not use has/getattr intentionally - we want to know whether
137 # we have cached the regexp for *this* class, whereas getattr would also
138 # match the superclass
139 if '_VALID_URL_RE' not in cls.__dict__:
140 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
141 return cls._VALID_URL_RE.match(url) is not None
142
143 @classmethod
144 def working(cls):
145 """Getter method for _WORKING."""
146 return cls._WORKING
147
148 def initialize(self):
149 """Initializes an instance (authentication, etc)."""
150 if not self._ready:
151 self._real_initialize()
152 self._ready = True
153
154 def extract(self, url):
155 """Extracts URL information and returns it in list of dicts."""
156 self.initialize()
157 return self._real_extract(url)
158
159 def set_downloader(self, downloader):
160 """Sets the downloader for this IE."""
161 self._downloader = downloader
162
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # Intentionally a no-op in the base class.
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # Intentionally a no-op in the base class.
        pass
    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        # Extractor class names end in 'IE' by convention; strip that suffix.
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        # Instance-level counterpart of ie_key(); used to prefix screen output.
        return type(self).__name__[:-2]
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            # note=False suppresses the status line entirely.
            if video_id is None:
                self.to_screen(u'%s' % (note,))
            else:
                self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # errnote=False: caller handles failures silently, just return False.
            if errnote is False:
                return False
            if errnote is None:
                errnote = u'Unable to download webpage'
            errmsg = u'%s: %s' % (errnote, compat_str(err))
            if fatal:
                # Pass the original traceback along and chain the cause.
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self._downloader.report_warning(errmsg)
                return False
203 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
204 """ Returns a tuple (page content as string, URL handle) """
205
206 # Strip hashes from the URL (#1038)
207 if isinstance(url_or_request, (compat_str, str)):
208 url_or_request = url_or_request.partition('#')[0]
209
210 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
211 if urlh is False:
212 assert not fatal
213 return False
214 content_type = urlh.headers.get('Content-Type', '')
215 webpage_bytes = urlh.read()
216 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
217 if m:
218 encoding = m.group(1)
219 else:
220 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
221 webpage_bytes[:1024])
222 if m:
223 encoding = m.group(1).decode('ascii')
224 elif webpage_bytes.startswith(b'\xff\xfe'):
225 encoding = 'utf-16'
226 else:
227 encoding = 'utf-8'
228 if self._downloader.params.get('dump_intermediate_pages', False):
229 try:
230 url = url_or_request.get_full_url()
231 except AttributeError:
232 url = url_or_request
233 self.to_screen(u'Dumping request to ' + url)
234 dump = base64.b64encode(webpage_bytes).decode('ascii')
235 self._downloader.to_screen(dump)
236 if self._downloader.params.get('write_pages', False):
237 try:
238 url = url_or_request.get_full_url()
239 except AttributeError:
240 url = url_or_request
241 if len(url) > 200:
242 h = u'___' + hashlib.md5(url).hexdigest()
243 url = url[:200 - len(h)] + h
244 raw_filename = ('%s_%s.dump' % (video_id, url))
245 filename = sanitize_filename(raw_filename, restricted=True)
246 self.to_screen(u'Saving request to ' + filename)
247 with open(filename, 'wb') as outf:
248 outf.write(webpage_bytes)
249
250 content = webpage_bytes.decode(encoding, 'replace')
251 return (content, urlh)
252
253 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
254 """ Returns the data of the page as a string """
255 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
256 if res is False:
257 return res
258 else:
259 content, _ = res
260 return content
261
262 def _download_xml(self, url_or_request, video_id,
263 note=u'Downloading XML', errnote=u'Unable to download XML',
264 transform_source=None):
265 """Return the xml as an xml.etree.ElementTree.Element"""
266 xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
267 if transform_source:
268 xml_string = transform_source(xml_string)
269 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
270
271 def _download_json(self, url_or_request, video_id,
272 note=u'Downloading JSON metadata',
273 errnote=u'Unable to download JSON metadata'):
274 json_string = self._download_webpage(url_or_request, video_id, note, errnote)
275 try:
276 return json.loads(json_string)
277 except ValueError as ve:
278 raise ExtractorError('Failed to download JSON', cause=ve)
279
280 def report_warning(self, msg, video_id=None):
281 idstr = u'' if video_id is None else u'%s: ' % video_id
282 self._downloader.report_warning(
283 u'[%s] %s%s' % (self.IE_NAME, idstr, msg))
284
285 def to_screen(self, msg):
286 """Print msg to screen, prefixing it with '[ie_name]'"""
287 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
288
289 def report_extraction(self, id_or_name):
290 """Report information extraction."""
291 self.to_screen(u'%s: Extracting information' % id_or_name)
292
293 def report_download_webpage(self, video_id):
294 """Report webpage download."""
295 self.to_screen(u'%s: Downloading webpage' % video_id)
296
297 def report_age_confirmation(self):
298 """Report attempt to confirm age."""
299 self.to_screen(u'Confirming age')
300
301 def report_login(self):
302 """Report attempt to log in."""
303 self.to_screen(u'Logging in')
304
305 #Methods for following #608
306 @staticmethod
307 def url_result(url, ie=None, video_id=None):
308 """Returns a url that points to a page that should be processed"""
309 #TODO: ie should be the class used for getting the info
310 video_info = {'_type': 'url',
311 'url': url,
312 'ie_key': ie}
313 if video_id is not None:
314 video_info['id'] = video_id
315 return video_info
316 @staticmethod
317 def playlist_result(entries, playlist_id=None, playlist_title=None):
318 """Returns a playlist"""
319 video_info = {'_type': 'playlist',
320 'entries': entries}
321 if playlist_id:
322 video_info['id'] = playlist_id
323 if playlist_title:
324 video_info['title'] = playlist_title
325 return video_info
326
327 def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
328 """
329 Perform a regex search on the given string, using a single or a list of
330 patterns returning the first matching group.
331 In case of failure return a default value or raise a WARNING or a
332 RegexNotFoundError, depending on fatal, specifying the field name.
333 """
334 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
335 mobj = re.search(pattern, string, flags)
336 else:
337 for p in pattern:
338 mobj = re.search(p, string, flags)
339 if mobj: break
340
341 if os.name != 'nt' and sys.stderr.isatty():
342 _name = u'\033[0;34m%s\033[0m' % name
343 else:
344 _name = name
345
346 if mobj:
347 # return the first matching group
348 return next(g for g in mobj.groups() if g is not None)
349 elif default is not _NO_DEFAULT:
350 return default
351 elif fatal:
352 raise RegexNotFoundError(u'Unable to extract %s' % _name)
353 else:
354 self._downloader.report_warning(u'unable to extract %s; '
355 u'please report this issue on http://yt-dl.org/bug' % _name)
356 return None
357
358 def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
359 """
360 Like _search_regex, but strips HTML tags and unescapes entities.
361 """
362 res = self._search_regex(pattern, string, name, default, fatal, flags)
363 if res:
364 return clean_html(res).strip()
365 else:
366 return res
367
    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # A broken/missing .netrc is not fatal: warn and fall through
                # with (None, None) so extraction can proceed anonymously.
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        return (username, password)
398 # Helper functions for extracting OpenGraph info
399 @staticmethod
400 def _og_regexes(prop):
401 content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
402 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
403 template = r'<meta[^>]+?%s[^>]+?%s'
404 return [
405 template % (property_re, content_re),
406 template % (content_re, property_re),
407 ]
408
409 def _og_search_property(self, prop, html, name=None, **kargs):
410 if name is None:
411 name = 'OpenGraph %s' % prop
412 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
413 if escaped is None:
414 return None
415 return unescapeHTML(escaped)
416
417 def _og_search_thumbnail(self, html, **kargs):
418 return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
419
420 def _og_search_description(self, html, **kargs):
421 return self._og_search_property('description', html, fatal=False, **kargs)
422
423 def _og_search_title(self, html, **kargs):
424 return self._og_search_property('title', html, **kargs)
425
426 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
427 regexes = self._og_regexes('video')
428 if secure: regexes = self._og_regexes('video:secure_url') + regexes
429 return self._html_search_regex(regexes, html, name, **kargs)
430
431 def _html_search_meta(self, name, html, display_name=None):
432 if display_name is None:
433 display_name = name
434 return self._html_search_regex(
435 r'''(?ix)<meta
436 (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
437 [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
438 html, display_name, fatal=False)
439
440 def _dc_search_uploader(self, html):
441 return self._html_search_meta('dc.creator', html, 'uploader')
442
443 def _rta_search(self, html):
444 # See http://www.rtalabel.org/index.php?content=howtofaq#single
445 if re.search(r'(?ix)<meta\s+name="rating"\s+'
446 r' content="RTA-5042-1996-1400-1577-RTA"',
447 html):
448 return 18
449 return 0
450
451 def _media_rating_search(self, html):
452 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
453 rating = self._html_search_meta('rating', html)
454
455 if not rating:
456 return None
457
458 RATING_TABLE = {
459 'safe for kids': 0,
460 'general': 8,
461 '14 years': 14,
462 'mature': 17,
463 'restricted': 19,
464 }
465 return RATING_TABLE.get(rating.lower(), None)
466
    def _sort_formats(self, formats):
        """Sort `formats` in place from worst to best quality (see class docstring)."""
        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            preference = f.get('preference')
            if preference is None:
                proto = f.get('protocol')
                if proto is None:
                    proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

                # Plain HTTP(S) downloads rank above other protocols.
                preference = 0 if proto in ['http', 'https'] else -0.1
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            if f.get('vcodec') == 'none':  # audio only
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
                else:
                    ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    # Unknown extensions sort below all listed ones.
                    audio_ext_preference = -1
            else:
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = [u'flv', u'mp4', u'webm']
                else:
                    ORDER = [u'webm', u'flv', u'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # The tuple is compared lexicographically; missing numeric fields
            # sort as -1 (worst), and format_id is the final tie-breaker.
            return (
                preference,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                ext_preference,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('format_id'),
            )
        formats.sort(key=_formats_key)
519
520
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # prefix is empty (one result), a positive integer, or the word 'all'.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            # Bare '<key>:query' means a single result.
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp to the service's maximum, with a warning.
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY