jfr.im git — yt-dlp.git blob: youtube_dl/extractor/common.py
Commit: [extractor/common] --write-pages: correct the file name when video_id is None
Path: [yt-dlp.git] / youtube_dl / extractor / common.py
1 import base64
2 import hashlib
3 import json
4 import os
5 import re
6 import socket
7 import sys
8 import netrc
9 import xml.etree.ElementTree
10
11 from ..utils import (
12 compat_http_client,
13 compat_urllib_error,
14 compat_urllib_parse_urlparse,
15 compat_str,
16
17 clean_html,
18 compiled_regex_type,
19 ExtractorError,
20 RegexNotFoundError,
21 sanitize_filename,
22 unescapeHTML,
23 )
# Sentinel used by _search_regex()/_html_search_regex() to distinguish
# "no default supplied" from an explicit default of None.
_NO_DEFAULT = object()
26
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url         Mandatory. The URL of the video file
                    * ext         Will be calculated from url if missing
                    * format      A human-readable description of the format
                                  ("mp4 container with h264/opus").
                                  Calculated from the format_id, width, height.
                                  and format_note fields if missing.
                    * format_id   A short description of the format
                                  ("mp4_h264_opus" or "19").
                                  Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                  ("3D" or "DASH video")
                    * width       Width of the video, if known
                    * height      Height of the video, if known
                    * resolution  Textual description of width and height
                    * tbr         Average bitrate of audio and video in KBit/s
                    * abr         Average audio bitrate in KBit/s
                    * acodec      Name of the audio codec in use
                    * asr         Audio sampling rate in Hertz
                    * vbr         Average video bitrate in KBit/s
                    * vcodec      Name of the video codec in use
                    * container   Name of the container format
                    * filesize    The number of bytes, if known in advance
                    * player_url  SWF Player URL (used for rtmpdump).
                    * protocol    The protocol that will be used for the actual
                                  download, lower-case.
                                  "http", "https", "rtsp", "rtmp", "m3u8" or so.
                    * preference  Order number of this format. If this field is
                                  present and not None, the formats get sorted
                                  by this field, regardless of all other values.
                                  -1 for default (order by other properties),
                                  -2 or smaller for less than default.
                    * quality     Order number of the video quality of this
                                  format, irrespective of the file format.
                                  -1 for default (order by other properties),
                                  -2 or smaller for less than default.
    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    display_id      An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    timestamp:      UNIX timestamp of the moment the video became available.
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp.
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    duration:       Length of the video in seconds, as an integer.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Becomes True once initialize() has run _real_initialize().
    _ready = False
    # The downloader instance, set via set_downloader().
    _downloader = None
    # Subclasses set this to False to mark the extractor as broken.
    _WORKING = True
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader.

        downloader: object providing output/option/network services
        (to_screen, report_warning, urlopen, params); stored via
        set_downloader().
        """
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING (False marks a broken extractor)."""
        return cls._WORKING
    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # _real_initialize() runs at most once per instance.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)
    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        # Strips the trailing "IE" from the class name ("FooIE" -> "Foo").
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        # Same convention as ie_key(): class name without the "IE" suffix.
        return type(self).__name__[:-2]
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the response handle

        note: None -> print the default "Downloading webpage" message,
        False -> stay silent, anything else -> printed (prefixed with
        video_id when given).
        errnote: False -> fail silently and return False; None -> default
        error message. With fatal=False a failure returns False instead of
        raising ExtractorError.
        """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen(u'%s' % (note,))
            else:
                self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is False:
                return False
            if errnote is None:
                errnote = u'Unable to download webpage'
            errmsg = u'%s: %s' % (errnote, compat_str(err))
            if fatal:
                # Preserve the original traceback of the network error.
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self._downloader.report_warning(errmsg)
                return False
207 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
208 """ Returns a tuple (page content as string, URL handle) """
209
210 # Strip hashes from the URL (#1038)
211 if isinstance(url_or_request, (compat_str, str)):
212 url_or_request = url_or_request.partition('#')[0]
213
214 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
215 if urlh is False:
216 assert not fatal
217 return False
218 content_type = urlh.headers.get('Content-Type', '')
219 webpage_bytes = urlh.read()
220 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
221 if m:
222 encoding = m.group(1)
223 else:
224 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
225 webpage_bytes[:1024])
226 if m:
227 encoding = m.group(1).decode('ascii')
228 elif webpage_bytes.startswith(b'\xff\xfe'):
229 encoding = 'utf-16'
230 else:
231 encoding = 'utf-8'
232 if self._downloader.params.get('dump_intermediate_pages', False):
233 try:
234 url = url_or_request.get_full_url()
235 except AttributeError:
236 url = url_or_request
237 self.to_screen(u'Dumping request to ' + url)
238 dump = base64.b64encode(webpage_bytes).decode('ascii')
239 self._downloader.to_screen(dump)
240 if self._downloader.params.get('write_pages', False):
241 try:
242 url = url_or_request.get_full_url()
243 except AttributeError:
244 url = url_or_request
245 basen = '%s_%s' % (video_id, url)
246 if len(basen) > 240:
247 h = u'___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
248 basen = basen[:240 - len(h)] + h
249 raw_filename = basen + '.dump'
250 filename = sanitize_filename(raw_filename, restricted=True)
251 self.to_screen(u'Saving request to ' + filename)
252 with open(filename, 'wb') as outf:
253 outf.write(webpage_bytes)
254
255 try:
256 content = webpage_bytes.decode(encoding, 'replace')
257 except LookupError:
258 content = webpage_bytes.decode('utf-8', 'replace')
259
260 if (u'<title>Access to this site is blocked</title>' in content and
261 u'Websense' in content[:512]):
262 msg = u'Access to this webpage has been blocked by Websense filtering software in your network.'
263 blocked_iframe = self._html_search_regex(
264 r'<iframe src="([^"]+)"', content,
265 u'Websense information URL', default=None)
266 if blocked_iframe:
267 msg += u' Visit %s for more details' % blocked_iframe
268 raise ExtractorError(msg, expected=True)
269
270 return (content, urlh)
271
272 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
273 """ Returns the data of the page as a string """
274 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
275 if res is False:
276 return res
277 else:
278 content, _ = res
279 return content
280
    def _download_xml(self, url_or_request, video_id,
                      note=u'Downloading XML', errnote=u'Unable to download XML',
                      transform_source=None, fatal=True):
        """Return the xml as an xml.etree.ElementTree.Element

        transform_source: optional callable applied to the downloaded text
        before parsing. Returns False when the download failed and fatal
        is False.
        """
        xml_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal)
        if xml_string is False:
            return xml_string
        if transform_source:
            xml_string = transform_source(xml_string)
        # Re-encode so the parser sees bytes regardless of charset issues.
        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
293 def _download_json(self, url_or_request, video_id,
294 note=u'Downloading JSON metadata',
295 errnote=u'Unable to download JSON metadata',
296 transform_source=None):
297 json_string = self._download_webpage(url_or_request, video_id, note, errnote)
298 if transform_source:
299 json_string = transform_source(json_string)
300 try:
301 return json.loads(json_string)
302 except ValueError as ve:
303 raise ExtractorError('Failed to download JSON', cause=ve)
304
    def report_warning(self, msg, video_id=None):
        """Forward a warning to the downloader, prefixed with the IE name
        and, when given, the video id."""
        idstr = u'' if video_id is None else u'%s: ' % video_id
        self._downloader.report_warning(
            u'[%s] %s%s' % (self.IE_NAME, idstr, msg))

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')
330 #Methods for following #608
331 @staticmethod
332 def url_result(url, ie=None, video_id=None):
333 """Returns a url that points to a page that should be processed"""
334 #TODO: ie should be the class used for getting the info
335 video_info = {'_type': 'url',
336 'url': url,
337 'ie_key': ie}
338 if video_id is not None:
339 video_info['id'] = video_id
340 return video_info
341 @staticmethod
342 def playlist_result(entries, playlist_id=None, playlist_title=None):
343 """Returns a playlist"""
344 video_info = {'_type': 'playlist',
345 'entries': entries}
346 if playlist_id:
347 video_info['id'] = playlist_id
348 if playlist_title:
349 video_info['title'] = playlist_title
350 return video_info
351
352 def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
353 """
354 Perform a regex search on the given string, using a single or a list of
355 patterns returning the first matching group.
356 In case of failure return a default value or raise a WARNING or a
357 RegexNotFoundError, depending on fatal, specifying the field name.
358 """
359 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
360 mobj = re.search(pattern, string, flags)
361 else:
362 for p in pattern:
363 mobj = re.search(p, string, flags)
364 if mobj: break
365
366 if os.name != 'nt' and sys.stderr.isatty():
367 _name = u'\033[0;34m%s\033[0m' % name
368 else:
369 _name = name
370
371 if mobj:
372 # return the first matching group
373 return next(g for g in mobj.groups() if g is not None)
374 elif default is not _NO_DEFAULT:
375 return default
376 elif fatal:
377 raise RegexNotFoundError(u'Unable to extract %s' % _name)
378 else:
379 self._downloader.report_warning(u'unable to extract %s; '
380 u'please report this issue on http://yt-dl.org/bug' % _name)
381 return None
382
383 def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
384 """
385 Like _search_regex, but strips HTML tags and unescapes entities.
386 """
387 res = self._search_regex(pattern, string, name, default, fatal, flags)
388 if res:
389 return clean_html(res).strip()
390 else:
391 return res
392
    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # Best effort: a broken or missing .netrc only warns.
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        return (username, password)
423 # Helper functions for extracting OpenGraph info
424 @staticmethod
425 def _og_regexes(prop):
426 content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
427 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
428 template = r'<meta[^>]+?%s[^>]+?%s'
429 return [
430 template % (property_re, content_re),
431 template % (content_re, property_re),
432 ]
433
    def _og_search_property(self, prop, html, name=None, **kargs):
        """Search html for OpenGraph property prop and return its content,
        HTML-unescaped; None when absent and the search is not fatal."""
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        # With secure=True, og:video:secure_url is tried before og:video.
        regexes = self._og_regexes('video')
        if secure: regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)
    def _html_search_meta(self, name, html, display_name=None, fatal=False):
        """Return the content attribute of a <meta> tag whose itemprop,
        name or property attribute equals name; non-fatal by default."""
        if display_name is None:
            display_name = name
        # (?ix): case-insensitive, verbose mode (literal whitespace ignored).
        return self._html_search_regex(
            r'''(?ix)<meta
                    (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
                    [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
            html, display_name, fatal=fatal)
    def _dc_search_uploader(self, html):
        """Return the Dublin Core creator (dc.creator meta tag) as uploader."""
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        """Return an age limit of 18 if the page carries the RTA label, else 0."""
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0
476 def _media_rating_search(self, html):
477 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
478 rating = self._html_search_meta('rating', html)
479
480 if not rating:
481 return None
482
483 RATING_TABLE = {
484 'safe for kids': 0,
485 'general': 8,
486 '14 years': 14,
487 'mature': 17,
488 'restricted': 19,
489 }
490 return RATING_TABLE.get(rating.lower(), None)
491
    def _twitter_search_player(self, html):
        """Return the Twitter card player URL (<meta name="twitter:player">)."""
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')
    def _sort_formats(self, formats):
        """Sort formats in place, worst quality first (best last).

        Raises ExtractorError when the list is empty.
        """
        if not formats:
            raise ExtractorError(u'No video formats found')

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            # An explicit 'preference' wins; otherwise plain HTTP(S)
            # downloads rank slightly above other protocols, and f4f/f4m
            # are penalized because they are not yet supported.
            preference = f.get('preference')
            if preference is None:
                proto = f.get('protocol')
                if proto is None:
                    proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

                preference = 0 if proto in ['http', 'https'] else -0.1
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            if f.get('vcodec') == 'none':  # audio only
                # Extension ranking for audio-only formats; the
                # prefer_free_formats option flips the order.
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
                else:
                    ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = [u'flv', u'mp4', u'webm']
                else:
                    ORDER = [u'webm', u'flv', u'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # Missing numeric fields sort as -1 (worst); format_id is the
            # final tie-breaker.
            return (
                preference,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                ext_preference,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('format_id'),
            )
        formats.sort(key=_formats_key)
    def http_scheme(self):
        """ Either "http:" or "https:", depending on the user's preferences """
        return (
            'http:'
            if self._downloader.params.get('prefer_insecure', False)
            else 'https:')
559 def _proto_relative_url(self, url, scheme=None):
560 if url is None:
561 return url
562 if url.startswith('//'):
563 if scheme is None:
564 scheme = self.http_scheme()
565 return scheme + url
566 else:
567 return url
568
569
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # Matches "<key>:<query>", "<key>all:<query>" or "<key><n>:<query>".
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Parse the search "URL" and fetch the requested number of results."""
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            # No prefix: only the first result is requested.
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp to the extractor's maximum and warn the user.
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        # The user-visible search key, e.g. "ytsearch".
        return self._SEARCH_KEY
612