]> jfr.im git - yt-dlp.git/blob - youtube_dl/extractor/common.py
Remove debug prints
[yt-dlp.git] / youtube_dl / extractor / common.py
1 import base64
2 import hashlib
3 import json
4 import netrc
5 import os
6 import re
7 import socket
8 import sys
9 import time
10 import xml.etree.ElementTree
11
12 from ..utils import (
13 compat_http_client,
14 compat_urllib_error,
15 compat_urllib_parse_urlparse,
16 compat_str,
17
18 clean_html,
19 compiled_regex_type,
20 ExtractorError,
21 int_or_none,
22 RegexNotFoundError,
23 sanitize_filename,
24 unescapeHTML,
25 )
# Module-level sentinel: lets _search_regex-style helpers distinguish
# "caller supplied no default" from "caller explicitly passed None".
_NO_DEFAULT = object()
27
28
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from url if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp", "m3u8" or so.
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    display_id      An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "url"
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    timestamp:      UNIX timestamp of the moment the video became available.
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp.
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    duration:       Length of the video in seconds, as an integer.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Class-level defaults; _ready is reset per-instance in __init__.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)
144
@classmethod
def suitable(cls, url):
    """Return True if this extractor should handle the given URL."""

    # Deliberately probes cls.__dict__ instead of hasattr/getattr: the
    # cached regexp must belong to *this* class; attribute lookup would
    # also find one inherited from a superclass.
    cached = cls.__dict__.get('_VALID_URL_RE')
    if cached is None:
        cached = cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    return cached.match(url) is not None

@classmethod
def working(cls):
    """Report whether this extractor is marked as working (_WORKING)."""
    return cls._WORKING
160
def initialize(self):
    """Initialize the instance once (authentication, etc.)."""
    if self._ready:
        return
    self._real_initialize()
    self._ready = True

def extract(self, url):
    """Extract URL information and return it as a list of dicts."""
    self.initialize()
    return self._real_extract(url)

def set_downloader(self, downloader):
    """Attach the downloader this IE reports to and fetches through."""
    self._downloader = downloader

def _real_initialize(self):
    """Real initialization process. Redefine in subclasses."""
    pass

def _real_extract(self, url):
    """Real extraction process. Redefine in subclasses."""
    pass

@classmethod
def ie_key(cls):
    """A string for getting the InfoExtractor with get_info_extractor."""
    # Class names end in "IE" by convention; strip that suffix.
    return cls.__name__[:-2]

@property
def IE_NAME(self):
    # Same convention as ie_key, but resolved on the instance's type.
    return type(self).__name__[:-2]
192
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """ Returns the response handle """
    # note=None -> default progress message; note=False -> stay silent.
    if note is None:
        self.report_download_webpage(video_id)
    elif note is not False:
        message = u'%s' % (note,) if video_id is None else u'%s: %s' % (video_id, note)
        self.to_screen(message)
    try:
        return self._downloader.urlopen(url_or_request)
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        # errnote=False means the caller handles failure itself.
        if errnote is False:
            return False
        if errnote is None:
            errnote = u'Unable to download webpage'
        errmsg = u'%s: %s' % (errnote, compat_str(err))
        if fatal:
            raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
        self._downloader.report_warning(errmsg)
        return False
215
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """ Returns a tuple (page content as string, URL handle) """

    # Strip hashes from the URL (#1038)
    if isinstance(url_or_request, (compat_str, str)):
        url_or_request = url_or_request.partition('#')[0]

    urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
    if urlh is False:
        # _request_webpage only returns False in the non-fatal path.
        assert not fatal
        return False
    content_type = urlh.headers.get('Content-Type', '')
    webpage_bytes = urlh.read()
    # Charset detection, in priority order:
    # 1. charset parameter of the Content-Type header
    m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
    if m:
        encoding = m.group(1)
    else:
        # 2. <meta charset=...> in the first KiB of the body
        m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                      webpage_bytes[:1024])
        if m:
            encoding = m.group(1).decode('ascii')
        # 3. UTF-16 little-endian BOM
        elif webpage_bytes.startswith(b'\xff\xfe'):
            encoding = 'utf-16'
        else:
            # 4. fall back to UTF-8
            encoding = 'utf-8'
    if self._downloader.params.get('dump_intermediate_pages', False):
        try:
            # Request objects carry the URL; plain strings are the URL.
            url = url_or_request.get_full_url()
        except AttributeError:
            url = url_or_request
        self.to_screen(u'Dumping request to ' + url)
        # base64 keeps binary-safe output on the console.
        dump = base64.b64encode(webpage_bytes).decode('ascii')
        self._downloader.to_screen(dump)
    if self._downloader.params.get('write_pages', False):
        try:
            url = url_or_request.get_full_url()
        except AttributeError:
            url = url_or_request
        basen = '%s_%s' % (video_id, url)
        if len(basen) > 240:
            # Keep filenames under filesystem limits: truncate and append
            # an md5 of the full name so distinct URLs stay distinct.
            h = u'___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
            basen = basen[:240 - len(h)] + h
        raw_filename = basen + '.dump'
        filename = sanitize_filename(raw_filename, restricted=True)
        self.to_screen(u'Saving request to ' + filename)
        with open(filename, 'wb') as outf:
            outf.write(webpage_bytes)

    try:
        content = webpage_bytes.decode(encoding, 'replace')
    except LookupError:
        # Unknown/bogus charset name from the page; retry as UTF-8.
        content = webpage_bytes.decode('utf-8', 'replace')

    # Detect Websense corporate filtering and surface a helpful error
    # instead of failing on an unexpected page.
    if (u'<title>Access to this site is blocked</title>' in content and
            u'Websense' in content[:512]):
        msg = u'Access to this webpage has been blocked by Websense filtering software in your network.'
        blocked_iframe = self._html_search_regex(
            r'<iframe src="([^"]+)"', content,
            u'Websense information URL', default=None)
        if blocked_iframe:
            msg += u' Visit %s for more details' % blocked_iframe
        raise ExtractorError(msg, expected=True)

    return (content, urlh)
280
def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """ Returns the data of the page as a string """
    res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
    if res is False:
        # Non-fatal failure; propagate the False marker unchanged.
        return res
    # res is (content, urlh); only the content is wanted here.
    return res[0]
289
def _download_xml(self, url_or_request, video_id,
                  note=u'Downloading XML', errnote=u'Unable to download XML',
                  transform_source=None, fatal=True):
    """Return the xml as an xml.etree.ElementTree.Element"""
    xml_string = self._download_webpage(
        url_or_request, video_id, note, errnote, fatal=fatal)
    if xml_string is False:
        # Non-fatal download failure; hand the marker back to the caller.
        return xml_string
    if transform_source is not None:
        # Optional hook to fix up broken markup before parsing.
        xml_string = transform_source(xml_string)
    return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
301
def _download_json(self, url_or_request, video_id,
                   note=u'Downloading JSON metadata',
                   errnote=u'Unable to download JSON metadata',
                   transform_source=None,
                   fatal=True):
    """Download and parse a JSON document; returns the decoded object.

    With fatal=False a failed download yields None instead of raising.
    """
    json_string = self._download_webpage(
        url_or_request, video_id, note, errnote, fatal=fatal)
    if (not fatal) and json_string is False:
        return None
    if transform_source:
        # Optional hook to repair malformed JSON before decoding.
        json_string = transform_source(json_string)
    try:
        return json.loads(json_string)
    except ValueError as ve:
        raise ExtractorError('Failed to download JSON', cause=ve)
317
def report_warning(self, msg, video_id=None):
    # Prefix the warning with the video id when one is known.
    idstr = u'%s: ' % video_id if video_id is not None else u''
    self._downloader.report_warning(
        u'[%s] %s%s' % (self.IE_NAME, idstr, msg))

def to_screen(self, msg):
    """Print msg to screen, prefixing it with '[ie_name]'"""
    self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

def report_extraction(self, id_or_name):
    """Report information extraction."""
    self.to_screen(u'%s: Extracting information' % id_or_name)

def report_download_webpage(self, video_id):
    """Report webpage download."""
    self.to_screen(u'%s: Downloading webpage' % video_id)

def report_age_confirmation(self):
    """Report attempt to confirm age."""
    self.to_screen(u'Confirming age')

def report_login(self):
    """Report attempt to log in."""
    self.to_screen(u'Logging in')
342
# Methods for following #608
@staticmethod
def url_result(url, ie=None, video_id=None):
    """Returns a url that points to a page that should be processed"""
    # TODO: ie should be the class used for getting the info
    video_info = {
        '_type': 'url',
        'url': url,
        'ie_key': ie,
    }
    if video_id is not None:
        video_info['id'] = video_id
    return video_info

@staticmethod
def playlist_result(entries, playlist_id=None, playlist_title=None):
    """Returns a playlist"""
    video_info = {
        '_type': 'playlist',
        'entries': entries,
    }
    if playlist_id:
        video_info['id'] = playlist_id
    if playlist_title:
        video_info['title'] = playlist_title
    return video_info
364
def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.
    """
    if isinstance(pattern, (str, compat_str, compiled_regex_type)):
        mobj = re.search(pattern, string, flags)
    else:
        # Fix: an empty pattern list used to leave mobj unbound and raise
        # UnboundLocalError below; treat it as "no match" instead.
        mobj = None
        for p in pattern:
            mobj = re.search(p, string, flags)
            if mobj:
                break

    # Highlight the field name in blue on capable terminals.
    if os.name != 'nt' and sys.stderr.isatty():
        _name = u'\033[0;34m%s\033[0m' % name
    else:
        _name = name

    if mobj:
        # return the first matching group
        return next(g for g in mobj.groups() if g is not None)
    elif default is not _NO_DEFAULT:
        return default
    elif fatal:
        raise RegexNotFoundError(u'Unable to extract %s' % _name)
    else:
        self._downloader.report_warning(u'unable to extract %s; '
            u'please report this issue on http://yt-dl.org/bug' % _name)
        return None
396
def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
    """
    Like _search_regex, but strips HTML tags and unescapes entities.
    """
    res = self._search_regex(pattern, string, name, default, fatal, flags)
    # Only post-process truthy results; None/'' pass through unchanged.
    return clean_html(res).strip() if res else res
406
407 def _get_login_info(self):
408 """
409 Get the the login info as (username, password)
410 It will look in the netrc file using the _NETRC_MACHINE value
411 If there's no info available, return (None, None)
412 """
413 if self._downloader is None:
414 return (None, None)
415
416 username = None
417 password = None
418 downloader_params = self._downloader.params
419
420 # Attempt to use provided username and password or .netrc data
421 if downloader_params.get('username', None) is not None:
422 username = downloader_params['username']
423 password = downloader_params['password']
424 elif downloader_params.get('usenetrc', False):
425 try:
426 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
427 if info is not None:
428 username = info[0]
429 password = info[2]
430 else:
431 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
432 except (IOError, netrc.NetrcParseError) as err:
433 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
434
435 return (username, password)
436
437 def _get_tfa_info(self):
438 """
439 Get the two-factor authentication info
440 TODO - asking the user will be required for sms/phone verify
441 currently just uses the command line option
442 If there's no info available, return None
443 """
444 if self._downloader is None:
445 return None
446 downloader_params = self._downloader.params
447
448 if downloader_params.get('twofactor', None) is not None:
449 return downloader_params['twofactor']
450
451 return None
452
453 # Helper functions for extracting OpenGraph info
454 @staticmethod
455 def _og_regexes(prop):
456 content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
457 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
458 template = r'<meta[^>]+?%s[^>]+?%s'
459 return [
460 template % (property_re, content_re),
461 template % (content_re, property_re),
462 ]
463
def _og_search_property(self, prop, html, name=None, **kargs):
    """Extract and unescape one OpenGraph property from *html*."""
    if name is None:
        name = 'OpenGraph %s' % prop
    escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
    return None if escaped is None else unescapeHTML(escaped)

def _og_search_thumbnail(self, html, **kargs):
    return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

def _og_search_description(self, html, **kargs):
    return self._og_search_property('description', html, fatal=False, **kargs)

def _og_search_title(self, html, **kargs):
    return self._og_search_property('title', html, **kargs)

def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
    regexes = self._og_regexes('video')
    if secure:
        # Prefer the https variant when available.
        regexes = self._og_regexes('video:secure_url') + regexes
    return self._html_search_regex(regexes, html, name, **kargs)

def _og_search_url(self, html, **kargs):
    return self._og_search_property('url', html, **kargs)
488
def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
    """Extract the content of a <meta> tag identified by itemprop/name/property."""
    display_name = name if display_name is None else display_name
    return self._html_search_regex(
        r'''(?ix)<meta
        (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
        [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
        html, display_name, fatal=fatal, **kwargs)
497
498 def _dc_search_uploader(self, html):
499 return self._html_search_meta('dc.creator', html, 'uploader')
500
501 def _rta_search(self, html):
502 # See http://www.rtalabel.org/index.php?content=howtofaq#single
503 if re.search(r'(?ix)<meta\s+name="rating"\s+'
504 r' content="RTA-5042-1996-1400-1577-RTA"',
505 html):
506 return 18
507 return 0
508
def _media_rating_search(self, html):
    """Map an ICRA-style 'rating' meta tag to an age limit, or None."""
    # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
    rating = self._html_search_meta('rating', html)
    if not rating:
        return None

    # Known rating labels and their corresponding age limits.
    age_limits = {
        'safe for kids': 0,
        'general': 8,
        '14 years': 14,
        'mature': 17,
        'restricted': 19,
    }
    return age_limits.get(rating.lower(), None)

def _twitter_search_player(self, html):
    """Extract the Twitter card player URL from *html*."""
    return self._html_search_meta('twitter:player', html,
        'twitter card player')
528
def _sort_formats(self, formats):
    """Sort *formats* in place from worst to best quality."""
    if not formats:
        raise ExtractorError(u'No video formats found')

    def _formats_key(f):
        # Build a tuple used as the sort key; later entries only break
        # ties among earlier ones.
        # TODO remove the following workaround
        from ..utils import determine_ext
        if not f.get('ext') and 'url' in f:
            f['ext'] = determine_ext(f['url'])

        preference = f.get('preference')
        if preference is None:
            proto = f.get('protocol')
            if proto is None:
                proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

            # Plain HTTP(S) downloads are slightly preferred over other
            # protocols (rtmp, m3u8, ...).
            preference = 0 if proto in ['http', 'https'] else -0.1
            if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                preference -= 0.5

        if f.get('vcodec') == 'none':  # audio only
            # Audio-only: rank by audio container/codec preference.
            if self._downloader.params.get('prefer_free_formats'):
                ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
            else:
                ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
            ext_preference = 0
            try:
                audio_ext_preference = ORDER.index(f['ext'])
            except ValueError:
                audio_ext_preference = -1
        else:
            # Video: rank by video container preference.
            if self._downloader.params.get('prefer_free_formats'):
                ORDER = [u'flv', u'mp4', u'webm']
            else:
                ORDER = [u'webm', u'flv', u'mp4']
            try:
                ext_preference = ORDER.index(f['ext'])
            except ValueError:
                ext_preference = -1
            audio_ext_preference = 0

        # Missing numeric fields sort as -1, i.e. worse than any known
        # value; format_id is the final deterministic tie-breaker.
        return (
            preference,
            f.get('quality') if f.get('quality') is not None else -1,
            f.get('height') if f.get('height') is not None else -1,
            f.get('width') if f.get('width') is not None else -1,
            ext_preference,
            f.get('tbr') if f.get('tbr') is not None else -1,
            f.get('vbr') if f.get('vbr') is not None else -1,
            f.get('abr') if f.get('abr') is not None else -1,
            audio_ext_preference,
            f.get('filesize') if f.get('filesize') is not None else -1,
            f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
            f.get('format_id'),
        )
    formats.sort(key=_formats_key)
585
586 def http_scheme(self):
587 """ Either "https:" or "https:", depending on the user's preferences """
588 return (
589 'http:'
590 if self._downloader.params.get('prefer_insecure', False)
591 else 'https:')
592
593 def _proto_relative_url(self, url, scheme=None):
594 if url is None:
595 return url
596 if url.startswith('//'):
597 if scheme is None:
598 scheme = self.http_scheme()
599 return scheme + url
600 else:
601 return url
602
def _sleep(self, timeout, video_id, msg_template=None):
    """Announce a wait of *timeout* seconds, then sleep for it."""
    if msg_template is None:
        msg_template = u'%(video_id)s: Waiting for %(timeout)s seconds'
    self.to_screen(msg_template % {'video_id': video_id, 'timeout': timeout})
    time.sleep(timeout)
609
def _extract_f4m_formats(self, manifest_url, video_id):
    """Download an f4m (HDS) manifest and return its formats, sorted."""
    manifest = self._download_xml(
        manifest_url, video_id, 'Downloading f4m manifest',
        'Unable to download f4m manifest')

    media_tag = '{http://ns.adobe.com/f4m/1.0}media'
    formats = [{
        'url': manifest_url,
        'ext': 'flv',
        'tbr': int_or_none(media_el.attrib.get('bitrate')),
        'width': int_or_none(media_el.attrib.get('width')),
        'height': int_or_none(media_el.attrib.get('height')),
    } for media_el in manifest.findall(media_tag)]
    self._sort_formats(formats)

    return formats
627
628
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # prefix is empty (default: 1 result), a positive integer, or 'all'.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix, query = mobj.group('prefix'), mobj.group('query')
        if prefix == '':
            # No prefix: return just the first result.
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            # Clamp to the extractor's limit with a warning.
            self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY