]> jfr.im git - yt-dlp.git/blob - youtube_dl/extractor/common.py
[vube] Fix comment count
[yt-dlp.git] / youtube_dl / extractor / common.py
1 import base64
2 import hashlib
3 import json
4 import netrc
5 import os
6 import re
7 import socket
8 import sys
9 import time
10 import xml.etree.ElementTree
11
12 from ..utils import (
13 compat_http_client,
14 compat_urllib_error,
15 compat_urllib_parse_urlparse,
16 compat_str,
17
18 clean_html,
19 compiled_regex_type,
20 ExtractorError,
21 RegexNotFoundError,
22 sanitize_filename,
23 unescapeHTML,
24 )
# Module-level sentinel: lets _search_regex() and friends distinguish
# "caller supplied no default" from an explicit default of None.
_NO_DEFAULT = object()
26
27
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from url if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height.
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp", "m3u8" or so.
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    display_id      An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "url"
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    timestamp:      UNIX timestamp of the moment the video became available.
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp.
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    duration:       Length of the video in seconds, as an integer.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Whether _real_initialize() has already run (set by initialize()).
    _ready = False
    # The downloader instance used for output and HTTP access; assigned
    # via set_downloader().
    _downloader = None
    # Set to False in subclasses of broken extractors; see working().
    _WORKING = True
138
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Idempotent: _real_initialize() runs at most once per instance.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        # Relies on the naming convention that extractor class names end
        # in "IE" (e.g. "FooIE" -> key "Foo").
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        # Same "...IE" suffix-stripping convention as ie_key().
        return type(self).__name__[:-2]
191
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            # note=False suppresses the status line entirely.
            if video_id is None:
                self.to_screen(u'%s' % (note,))
            else:
                self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # errnote=False means fail silently; the caller checks for False.
            if errnote is False:
                return False
            if errnote is None:
                errnote = u'Unable to download webpage'
            errmsg = u'%s: %s' % (errnote, compat_str(err))
            if fatal:
                # Second positional argument is the traceback object.
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self._downloader.report_warning(errmsg)
                return False
214
    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns a tuple (page content as string, URL handle) """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
        if urlh is False:
            assert not fatal
            return False
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        # Charset detection order: HTTP Content-Type header, then an HTML
        # <meta charset=...> tag within the first KiB of the body, then a
        # UTF-16 LE BOM, and finally UTF-8 as the fallback.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                # url_or_request was a plain string, not a Request object.
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            basen = '%s_%s' % (video_id, url)
            if len(basen) > 240:
                # Keep the dump filename within filesystem limits while
                # staying unique: replace the tail with an md5 of the full name.
                h = u'___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen(u'Saving request to ' + filename)
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # The page declared a codec Python does not know; fall back to UTF-8.
            content = webpage_bytes.decode('utf-8', 'replace')

        # Detect pages blocked by Websense filtering software and raise a
        # helpful error instead of failing later with a cryptic message.
        if (u'<title>Access to this site is blocked</title>' in content and
                u'Websense' in content[:512]):
            msg = u'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                u'Websense information URL', default=None)
            if blocked_iframe:
                msg += u' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)

        return (content, urlh)
279
280 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
281 """ Returns the data of the page as a string """
282 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
283 if res is False:
284 return res
285 else:
286 content, _ = res
287 return content
288
289 def _download_xml(self, url_or_request, video_id,
290 note=u'Downloading XML', errnote=u'Unable to download XML',
291 transform_source=None, fatal=True):
292 """Return the xml as an xml.etree.ElementTree.Element"""
293 xml_string = self._download_webpage(
294 url_or_request, video_id, note, errnote, fatal=fatal)
295 if xml_string is False:
296 return xml_string
297 if transform_source:
298 xml_string = transform_source(xml_string)
299 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
300
301 def _download_json(self, url_or_request, video_id,
302 note=u'Downloading JSON metadata',
303 errnote=u'Unable to download JSON metadata',
304 transform_source=None,
305 fatal=True):
306 json_string = self._download_webpage(
307 url_or_request, video_id, note, errnote, fatal=fatal)
308 if (not fatal) and json_string is False:
309 return None
310 if transform_source:
311 json_string = transform_source(json_string)
312 try:
313 return json.loads(json_string)
314 except ValueError as ve:
315 raise ExtractorError('Failed to download JSON', cause=ve)
316
    def report_warning(self, msg, video_id=None):
        # Prefix the warning with the video id (when known) and the IE name.
        idstr = u'' if video_id is None else u'%s: ' % video_id
        self._downloader.report_warning(
            u'[%s] %s%s' % (self.IE_NAME, idstr, msg))

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')
341
342 #Methods for following #608
343 @staticmethod
344 def url_result(url, ie=None, video_id=None):
345 """Returns a url that points to a page that should be processed"""
346 #TODO: ie should be the class used for getting the info
347 video_info = {'_type': 'url',
348 'url': url,
349 'ie_key': ie}
350 if video_id is not None:
351 video_info['id'] = video_id
352 return video_info
353 @staticmethod
354 def playlist_result(entries, playlist_id=None, playlist_title=None):
355 """Returns a playlist"""
356 video_info = {'_type': 'playlist',
357 'entries': entries}
358 if playlist_id:
359 video_info['id'] = playlist_id
360 if playlist_title:
361 video_info['title'] = playlist_title
362 return video_info
363
364 def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
365 """
366 Perform a regex search on the given string, using a single or a list of
367 patterns returning the first matching group.
368 In case of failure return a default value or raise a WARNING or a
369 RegexNotFoundError, depending on fatal, specifying the field name.
370 """
371 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
372 mobj = re.search(pattern, string, flags)
373 else:
374 for p in pattern:
375 mobj = re.search(p, string, flags)
376 if mobj: break
377
378 if os.name != 'nt' and sys.stderr.isatty():
379 _name = u'\033[0;34m%s\033[0m' % name
380 else:
381 _name = name
382
383 if mobj:
384 # return the first matching group
385 return next(g for g in mobj.groups() if g is not None)
386 elif default is not _NO_DEFAULT:
387 return default
388 elif fatal:
389 raise RegexNotFoundError(u'Unable to extract %s' % _name)
390 else:
391 self._downloader.report_warning(u'unable to extract %s; '
392 u'please report this issue on http://yt-dl.org/bug' % _name)
393 return None
394
395 def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
396 """
397 Like _search_regex, but strips HTML tags and unescapes entities.
398 """
399 res = self._search_regex(pattern, string, name, default, fatal, flags)
400 if res:
401 return clean_html(res).strip()
402 else:
403 return res
404
    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # Credentials are optional: warn and fall through to (None, None).
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        return (username, password)
434
    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        # Two templates, because the content and property attributes may
        # appear in either order inside the <meta> tag.
        content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
        property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    def _og_search_property(self, prop, html, name=None, **kargs):
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        # Meta content is HTML-escaped in the page source.
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video')
        if secure:
            # Prefer og:video:secure_url over og:video when allowed.
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _og_search_url(self, html, **kargs):
        return self._og_search_property('url', html, **kargs)
470
    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        # Extract the content attribute of a <meta> tag whose itemprop,
        # name or property attribute equals `name`.
        if display_name is None:
            display_name = name
        return self._html_search_regex(
            r'''(?ix)<meta
                    (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
                    [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
            html, display_name, fatal=fatal, **kwargs)

    def _dc_search_uploader(self, html):
        # Dublin Core "creator" metadata.
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'    content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        # Map textual rating labels to a minimum age (years).
        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower(), None)

    def _twitter_search_player(self, html):
        # The "twitter:player" <meta> tag content.
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')
510
    def _sort_formats(self, formats):
        # Sort `formats` in-place, worst quality first (ascending key order).
        if not formats:
            raise ExtractorError(u'No video formats found')

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            preference = f.get('preference')
            if preference is None:
                proto = f.get('protocol')
                if proto is None:
                    proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

                # Plain HTTP(S) downloads are slightly preferred over other protocols.
                preference = 0 if proto in ['http', 'https'] else -0.1
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            if f.get('vcodec') == 'none':  # audio only
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
                else:
                    ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = [u'flv', u'mp4', u'webm']
                else:
                    ORDER = [u'webm', u'flv', u'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # The tuple is compared field by field; None values are mapped
            # to -1 so unknown attributes sort below known ones.
            return (
                preference,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                ext_preference,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('format_id'),
            )
        formats.sort(key=_formats_key)
567
    def http_scheme(self):
        """ Either "http:" or "https:", depending on the user's preferences """
        return (
            'http:'
            if self._downloader.params.get('prefer_insecure', False)
            else 'https:')

    def _proto_relative_url(self, url, scheme=None):
        # Resolve protocol-relative URLs ("//example.com/...") against the
        # given scheme, defaulting to the user's preferred HTTP scheme.
        if url is None:
            return url
        if url.startswith('//'):
            if scheme is None:
                scheme = self.http_scheme()
            return scheme + url
        else:
            return url
584
    def _sleep(self, timeout, video_id, msg_template=None):
        # Sleep for `timeout` seconds, announcing the wait on screen first.
        if msg_template is None:
            msg_template = u'%(video_id)s: Waiting for %(timeout)s seconds'
        msg = msg_template % {'video_id': video_id, 'timeout': timeout}
        self.to_screen(msg)
        time.sleep(timeout)
591
592
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # Matches "<key>:<query>", "<key><n>:<query>" and "<key>all:<query>".
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        term = mobj.group('query')
        # Translate the prefix into a result count, clamped to _MAX_RESULTS.
        if prefix == '':
            n = 1
        elif prefix == 'all':
            n = self._MAX_RESULTS
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, term))
            if n > self._MAX_RESULTS:
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
        return self._get_n_results(term, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY