youtube_dl/extractor/common.py

   1 import base64
   2 import os
   3 import re
   4 import socket
   5 import sys
   6 import netrc
   7 import xml.etree.ElementTree
   8
   9 from ..utils import (
  10     compat_http_client,
  11     compat_urllib_error,
  12     compat_urllib_parse_urlparse,
  13     compat_str,
  14
  15     clean_html,
  16     compiled_regex_type,
  17     ExtractorError,
  18     RegexNotFoundError,
  19     sanitize_filename,
  20     unescapeHTML,
  21 )
# Sentinel meaning "no default value was supplied".  It is compared by
# identity, so None stays usable as an explicit default.
_NO_DEFAULT = object()
  23
  24
  25 class InfoExtractor(object):
  26     """Information Extractor class.
  27
  28     Information extractors are the classes that, given a URL, extract
  29     information about the video (or videos) the URL refers to. This
  30     information includes the real video URL, the video title, author and
  31     others. The information is stored in a dictionary which is then
  32     passed to the FileDownloader. The FileDownloader processes this
  33     information possibly downloading the video to the file system, among
  34     other possible outcomes.
  35
  36     The dictionaries must include the following fields:
  37
  38     id:             Video identifier.
  39     title:          Video title, unescaped.
  40
  41     Additionally, it must contain either a formats entry or a url one:
  42
  43     formats:        A list of dictionaries for each format available, ordered
  44                     from worst to best quality.
  45
  46                     Potential fields:
  47                     * url        Mandatory. The URL of the video file
  48                     * ext        Will be calculated from url if missing
  49                     * format     A human-readable description of the format
  50                                  ("mp4 container with h264/opus").
  51                                  Calculated from the format_id, width, height,
  52                                  and format_note fields if missing.
  53                     * format_id  A short description of the format
  54                                  ("mp4_h264_opus" or "19").
  55                                 Technically optional, but strongly recommended.
  56                     * format_note Additional info about the format
  57                                  ("3D" or "DASH video")
  58                     * width      Width of the video, if known
  59                     * height     Height of the video, if known
  60                     * resolution Textual description of width and height
  61                     * tbr        Average bitrate of audio and video in KBit/s
  62                     * abr        Average audio bitrate in KBit/s
  63                     * acodec     Name of the audio codec in use
  64                     * vbr        Average video bitrate in KBit/s
  65                     * vcodec     Name of the video codec in use
  66                     * filesize   The number of bytes, if known in advance
  67                     * player_url SWF Player URL (used for rtmpdump).
  68                     * protocol   The protocol that will be used for the actual
  69                                  download, lower-case.
  70                                  "http", "https", "rtsp", "rtmp" or so.
  71                     * preference Order number of this format. If this field is
  72                                  present, the formats get sorted by this field.
  73                                  -1 for default (order by other properties),
  74                                  -2 or smaller for less than default.
  75     url:            Final video URL.
  76     ext:            Video filename extension.
  77     format:         The video format, defaults to ext (used for --get-format)
  78     player_url:     SWF Player URL (used for rtmpdump).
  79
  80     The following fields are optional:
  81
  82     thumbnails:     A list of dictionaries (with the entries "resolution" and
  83                     "url") for the varying thumbnails
  84     thumbnail:      Full URL to a video thumbnail image.
  85     description:    One-line video description.
  86     uploader:       Full name of the video uploader.
  87     upload_date:    Video upload date (YYYYMMDD).
  88     uploader_id:    Nickname or id of the video uploader.
  89     location:       Physical location of the video.
  90     subtitles:      The subtitle file contents as a dictionary in the format
  91                     {language: subtitles}.
  92     duration:       Length of the video in seconds, as an integer.
  93     view_count:     How many users have watched the video on the platform.
  94     like_count:     Number of positive ratings of the video
  95     dislike_count:  Number of negative ratings of the video
  96     comment_count:  Number of comments on the video
  97     age_limit:      Age restriction for the video, as an integer (years)
  98     webpage_url:    The url to the video webpage, if given to youtube-dl it
  99                     should allow to get the same result again. (It will be set
 100                     by YoutubeDL if it's missing)
 101
 102     Unless mentioned otherwise, the fields should be Unicode strings.
 103
 104     Subclasses of this one should re-define the _real_initialize() and
 105     _real_extract() methods and define a _VALID_URL regexp.
 106     Probably, they should also be added to the list of extractors.
 107
 108     _real_extract() must return a *list* of information dictionaries as
 109     described above.
 110
 111     Finally, the _WORKING attribute should be set to False for broken IEs
 112     in order to warn the users and skip the tests.
 113     """
 114
 115     _ready = False
 116     _downloader = None
 117     _WORKING = True
 118
 119     def __init__(self, downloader=None):
 120         """Constructor. Receives an optional downloader."""
 121         self._ready = False
 122         self.set_downloader(downloader)
 123
 124     @classmethod
 125     def suitable(cls, url):
 126         """Receives a URL and returns True if suitable for this IE."""
 127
 128         # This does not use has/getattr intentionally - we want to know whether
 129         # we have cached the regexp for *this* class, whereas getattr would also
 130         # match the superclass
 131         if '_VALID_URL_RE' not in cls.__dict__:
 132             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 133         return cls._VALID_URL_RE.match(url) is not None
 134
 135     @classmethod
 136     def working(cls):
 137         """Getter method for _WORKING."""
 138         return cls._WORKING
 139
 140     def initialize(self):
 141         """Initializes an instance (authentication, etc)."""
 142         if not self._ready:
 143             self._real_initialize()
 144             self._ready = True
 145
 146     def extract(self, url):
 147         """Extracts URL information and returns it in list of dicts."""
 148         self.initialize()
 149         return self._real_extract(url)
 150
 151     def set_downloader(self, downloader):
 152         """Sets the downloader for this IE."""
 153         self._downloader = downloader
 154
 155     def _real_initialize(self):
 156         """Real initialization process. Redefine in subclasses."""
 157         pass
 158
 159     def _real_extract(self, url):
 160         """Real extraction process. Redefine in subclasses."""
 161         pass
 162
 163     @classmethod
 164     def ie_key(cls):
 165         """A string for getting the InfoExtractor with get_info_extractor"""
 166         return cls.__name__[:-2]
 167
 168     @property
 169     def IE_NAME(self):
 170         return type(self).__name__[:-2]
 171
 172     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 173         """ Returns the response handle """
 174         if note is None:
 175             self.report_download_webpage(video_id)
 176         elif note is not False:
 177             if video_id is None:
 178                 self.to_screen(u'%s' % (note,))
 179             else:
 180                 self.to_screen(u'%s: %s' % (video_id, note))
 181         try:
 182             return self._downloader.urlopen(url_or_request)
 183         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 184             if errnote is False:
 185                 return False
 186             if errnote is None:
 187                 errnote = u'Unable to download webpage'
 188             errmsg = u'%s: %s' % (errnote, compat_str(err))
 189             if fatal:
 190                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 191             else:
 192                 self._downloader.report_warning(errmsg)
 193                 return False
 194
 195     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 196         """ Returns a tuple (page content as string, URL handle) """
 197
 198         # Strip hashes from the URL (#1038)
 199         if isinstance(url_or_request, (compat_str, str)):
 200             url_or_request = url_or_request.partition('#')[0]
 201
 202         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 203         if urlh is False:
 204             assert not fatal
 205             return False
 206         content_type = urlh.headers.get('Content-Type', '')
 207         webpage_bytes = urlh.read()
 208         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 209         if m:
 210             encoding = m.group(1)
 211         else:
 212             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 213                           webpage_bytes[:1024])
 214             if m:
 215                 encoding = m.group(1).decode('ascii')
 216             else:
 217                 encoding = 'utf-8'
 218         if self._downloader.params.get('dump_intermediate_pages', False):
 219             try:
 220                 url = url_or_request.get_full_url()
 221             except AttributeError:
 222                 url = url_or_request
 223             self.to_screen(u'Dumping request to ' + url)
 224             dump = base64.b64encode(webpage_bytes).decode('ascii')
 225             self._downloader.to_screen(dump)
 226         if self._downloader.params.get('write_pages', False):
 227             try:
 228                 url = url_or_request.get_full_url()
 229             except AttributeError:
 230                 url = url_or_request
 231             raw_filename = ('%s_%s.dump' % (video_id, url))
 232             filename = sanitize_filename(raw_filename, restricted=True)
 233             self.to_screen(u'Saving request to ' + filename)
 234             with open(filename, 'wb') as outf:
 235                 outf.write(webpage_bytes)
 236
 237         content = webpage_bytes.decode(encoding, 'replace')
 238         return (content, urlh)
 239
 240     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 241         """ Returns the data of the page as a string """
 242         res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
 243         if res is False:
 244             return res
 245         else:
 246             content, _ = res
 247             return content
 248
 249     def _download_xml(self, url_or_request, video_id,
 250                       note=u'Downloading XML', errnote=u'Unable to download XML',
 251                       transform_source=None):
 252         """Return the xml as an xml.etree.ElementTree.Element"""
 253         xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
 254         if transform_source:
 255             xml_string = transform_source(xml_string)
 256         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
 257
 258     def report_warning(self, msg, video_id=None):
 259         idstr = u'' if video_id is None else u'%s: ' % video_id
 260         self._downloader.report_warning(
 261             u'[%s] %s%s' % (self.IE_NAME, idstr, msg))
 262
 263     def to_screen(self, msg):
 264         """Print msg to screen, prefixing it with '[ie_name]'"""
 265         self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
 266
 267     def report_extraction(self, id_or_name):
 268         """Report information extraction."""
 269         self.to_screen(u'%s: Extracting information' % id_or_name)
 270
 271     def report_download_webpage(self, video_id):
 272         """Report webpage download."""
 273         self.to_screen(u'%s: Downloading webpage' % video_id)
 274
 275     def report_age_confirmation(self):
 276         """Report attempt to confirm age."""
 277         self.to_screen(u'Confirming age')
 278
 279     def report_login(self):
 280         """Report attempt to log in."""
 281         self.to_screen(u'Logging in')
 282
 283     #Methods for following #608
 284     @staticmethod
 285     def url_result(url, ie=None, video_id=None):
 286         """Returns a url that points to a page that should be processed"""
 287         #TODO: ie should be the class used for getting the info
 288         video_info = {'_type': 'url',
 289                       'url': url,
 290                       'ie_key': ie}
 291         if video_id is not None:
 292             video_info['id'] = video_id
 293         return video_info
 294     @staticmethod
 295     def playlist_result(entries, playlist_id=None, playlist_title=None):
 296         """Returns a playlist"""
 297         video_info = {'_type': 'playlist',
 298                       'entries': entries}
 299         if playlist_id:
 300             video_info['id'] = playlist_id
 301         if playlist_title:
 302             video_info['title'] = playlist_title
 303         return video_info
 304
 305     def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
 306         """
 307         Perform a regex search on the given string, using a single or a list of
 308         patterns returning the first matching group.
 309         In case of failure return a default value or raise a WARNING or a
 310         RegexNotFoundError, depending on fatal, specifying the field name.
 311         """
 312         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 313             mobj = re.search(pattern, string, flags)
 314         else:
 315             for p in pattern:
 316                 mobj = re.search(p, string, flags)
 317                 if mobj: break
 318
 319         if os.name != 'nt' and sys.stderr.isatty():
 320             _name = u'\033[0;34m%s\033[0m' % name
 321         else:
 322             _name = name
 323
 324         if mobj:
 325             # return the first matching group
 326             return next(g for g in mobj.groups() if g is not None)
 327         elif default is not _NO_DEFAULT:
 328             return default
 329         elif fatal:
 330             raise RegexNotFoundError(u'Unable to extract %s' % _name)
 331         else:
 332             self._downloader.report_warning(u'unable to extract %s; '
 333                 u'please report this issue on http://yt-dl.org/bug' % _name)
 334             return None
 335
 336     def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
 337         """
 338         Like _search_regex, but strips HTML tags and unescapes entities.
 339         """
 340         res = self._search_regex(pattern, string, name, default, fatal, flags)
 341         if res:
 342             return clean_html(res).strip()
 343         else:
 344             return res
 345
 346     def _get_login_info(self):
 347         """
 348         Get the the login info as (username, password)
 349         It will look in the netrc file using the _NETRC_MACHINE value
 350         If there's no info available, return (None, None)
 351         """
 352         if self._downloader is None:
 353             return (None, None)
 354
 355         username = None
 356         password = None
 357         downloader_params = self._downloader.params
 358
 359         # Attempt to use provided username and password or .netrc data
 360         if downloader_params.get('username', None) is not None:
 361             username = downloader_params['username']
 362             password = downloader_params['password']
 363         elif downloader_params.get('usenetrc', False):
 364             try:
 365                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 366                 if info is not None:
 367                     username = info[0]
 368                     password = info[2]
 369                 else:
 370                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 371             except (IOError, netrc.NetrcParseError) as err:
 372                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
 373
 374         return (username, password)
 375
 376     # Helper functions for extracting OpenGraph info
 377     @staticmethod
 378     def _og_regexes(prop):
 379         content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
 380         property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
 381         template = r'<meta[^>]+?%s[^>]+?%s'
 382         return [
 383             template % (property_re, content_re),
 384             template % (content_re, property_re),
 385         ]
 386
 387     def _og_search_property(self, prop, html, name=None, **kargs):
 388         if name is None:
 389             name = 'OpenGraph %s' % prop
 390         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 391         if escaped is None:
 392             return None
 393         return unescapeHTML(escaped)
 394
 395     def _og_search_thumbnail(self, html, **kargs):
 396         return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
 397
 398     def _og_search_description(self, html, **kargs):
 399         return self._og_search_property('description', html, fatal=False, **kargs)
 400
 401     def _og_search_title(self, html, **kargs):
 402         return self._og_search_property('title', html, **kargs)
 403
 404     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 405         regexes = self._og_regexes('video')
 406         if secure: regexes = self._og_regexes('video:secure_url') + regexes
 407         return self._html_search_regex(regexes, html, name, **kargs)
 408
 409     def _html_search_meta(self, name, html, display_name=None):
 410         if display_name is None:
 411             display_name = name
 412         return self._html_search_regex(
 413             r'''(?ix)<meta
 414                     (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
 415                     [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
 416             html, display_name, fatal=False)
 417
 418     def _dc_search_uploader(self, html):
 419         return self._html_search_meta('dc.creator', html, 'uploader')
 420
 421     def _rta_search(self, html):
 422         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 423         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 424                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 425                      html):
 426             return 18
 427         return 0
 428
 429     def _media_rating_search(self, html):
 430         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 431         rating = self._html_search_meta('rating', html)
 432
 433         if not rating:
 434             return None
 435
 436         RATING_TABLE = {
 437             'safe for kids': 0,
 438             'general': 8,
 439             '14 years': 14,
 440             'mature': 17,
 441             'restricted': 19,
 442         }
 443         return RATING_TABLE.get(rating.lower(), None)
 444
 445     def _sort_formats(self, formats):
 446         def _formats_key(f):
 447             # TODO remove the following workaround
 448             from ..utils import determine_ext
 449             if not f.get('ext') and 'url' in f:
 450                 f['ext'] = determine_ext(f['url'])
 451
 452             preference = f.get('preference')
 453             if preference is None:
 454                 proto = f.get('protocol')
 455                 if proto is None:
 456                     proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
 457
 458                 preference = 0 if proto in ['http', 'https'] else -0.1
 459                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 460                     preference -= 0.5
 461
 462             if f.get('vcodec') == 'none':  # audio only
 463                 if self._downloader.params.get('prefer_free_formats'):
 464                     ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
 465                 else:
 466                     ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
 467                 ext_preference = 0
 468                 try:
 469                     audio_ext_preference = ORDER.index(f['ext'])
 470                 except ValueError:
 471                     audio_ext_preference = -1
 472             else:
 473                 if self._downloader.params.get('prefer_free_formats'):
 474                     ORDER = [u'flv', u'mp4', u'webm']
 475                 else:
 476                     ORDER = [u'webm', u'flv', u'mp4']
 477                 try:
 478                     ext_preference = ORDER.index(f['ext'])
 479                 except ValueError:
 480                     ext_preference = -1
 481                 audio_ext_preference = 0
 482
 483             return (
 484                 preference,
 485                 f.get('height') if f.get('height') is not None else -1,
 486                 f.get('width') if f.get('width') is not None else -1,
 487                 ext_preference,
 488                 f.get('vbr') if f.get('vbr') is not None else -1,
 489                 f.get('abr') if f.get('abr') is not None else -1,
 490                 audio_ext_preference,
 491                 f.get('filesize') if f.get('filesize') is not None else -1,
 492                 f.get('format_id'),
 493             )
 494         formats.sort(key=_formats_key)
 495
 496
 497 class SearchInfoExtractor(InfoExtractor):
 498     """
 499     Base class for paged search queries extractors.
 500     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 501     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 502     """
 503
 504     @classmethod
 505     def _make_valid_url(cls):
 506         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 507
 508     @classmethod
 509     def suitable(cls, url):
 510         return re.match(cls._make_valid_url(), url) is not None
 511
 512     def _real_extract(self, query):
 513         mobj = re.match(self._make_valid_url(), query)
 514         if mobj is None:
 515             raise ExtractorError(u'Invalid search query "%s"' % query)
 516
 517         prefix = mobj.group('prefix')
 518         query = mobj.group('query')
 519         if prefix == '':
 520             return self._get_n_results(query, 1)
 521         elif prefix == 'all':
 522             return self._get_n_results(query, self._MAX_RESULTS)
 523         else:
 524             n = int(prefix)
 525             if n <= 0:
 526                 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
 527             elif n > self._MAX_RESULTS:
 528                 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 529                 n = self._MAX_RESULTS
 530             return self._get_n_results(query, n)
 531
 532     def _get_n_results(self, query, n):
 533         """Get a specified number of results for a query"""
 534         raise NotImplementedError("This method must be implemented by subclasses")
 535
 536     @property
 537     def SEARCH_KEY(self):
 538         return self._SEARCH_KEY