# youtube_dl/extractor/common.py
# Base classes shared by all site-specific information extractors.
1 import base64
2 import hashlib
3 import json
4 import os
5 import re
6 import socket
7 import sys
8 import netrc
9 import xml.etree.ElementTree
10
11 from ..utils import (
12 compat_http_client,
13 compat_urllib_error,
14 compat_urllib_parse_urlparse,
15 compat_str,
16
17 clean_html,
18 compiled_regex_type,
19 ExtractorError,
20 RegexNotFoundError,
21 sanitize_filename,
22 unescapeHTML,
23 )
# Sentinel for _search_regex-style "default" parameters: lets callers pass an
# explicit default of None while still detecting "no default supplied".
_NO_DEFAULT = object()
25
26
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from url if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * vbr        Average video bitrate in KBit/s
                    * vcodec     Name of the video codec in use
                    * filesize   The number of bytes, if known in advance
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp" or so.
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    duration:       Length of the video in seconds, as an integer.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # True once _real_initialize() has run for this instance (see initialize()).
    _ready = False
    # The downloader (YoutubeDL/FileDownloader) driving this extractor.
    _downloader = None
    # Set to False in subclasses that are known to be broken.
    _WORKING = True
126 def __init__(self, downloader=None):
127 """Constructor. Receives an optional downloader."""
128 self._ready = False
129 self.set_downloader(downloader)
130
131 @classmethod
132 def suitable(cls, url):
133 """Receives a URL and returns True if suitable for this IE."""
134
135 # This does not use has/getattr intentionally - we want to know whether
136 # we have cached the regexp for *this* class, whereas getattr would also
137 # match the superclass
138 if '_VALID_URL_RE' not in cls.__dict__:
139 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
140 return cls._VALID_URL_RE.match(url) is not None
141
142 @classmethod
143 def working(cls):
144 """Getter method for _WORKING."""
145 return cls._WORKING
146
147 def initialize(self):
148 """Initializes an instance (authentication, etc)."""
149 if not self._ready:
150 self._real_initialize()
151 self._ready = True
152
153 def extract(self, url):
154 """Extracts URL information and returns it in list of dicts."""
155 self.initialize()
156 return self._real_extract(url)
157
158 def set_downloader(self, downloader):
159 """Sets the downloader for this IE."""
160 self._downloader = downloader
161
162 def _real_initialize(self):
163 """Real initialization process. Redefine in subclasses."""
164 pass
165
166 def _real_extract(self, url):
167 """Real extraction process. Redefine in subclasses."""
168 pass
169
170 @classmethod
171 def ie_key(cls):
172 """A string for getting the InfoExtractor with get_info_extractor"""
173 return cls.__name__[:-2]
174
175 @property
176 def IE_NAME(self):
177 return type(self).__name__[:-2]
178
179 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
180 """ Returns the response handle """
181 if note is None:
182 self.report_download_webpage(video_id)
183 elif note is not False:
184 if video_id is None:
185 self.to_screen(u'%s' % (note,))
186 else:
187 self.to_screen(u'%s: %s' % (video_id, note))
188 try:
189 return self._downloader.urlopen(url_or_request)
190 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
191 if errnote is False:
192 return False
193 if errnote is None:
194 errnote = u'Unable to download webpage'
195 errmsg = u'%s: %s' % (errnote, compat_str(err))
196 if fatal:
197 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
198 else:
199 self._downloader.report_warning(errmsg)
200 return False
201
202 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
203 """ Returns a tuple (page content as string, URL handle) """
204
205 # Strip hashes from the URL (#1038)
206 if isinstance(url_or_request, (compat_str, str)):
207 url_or_request = url_or_request.partition('#')[0]
208
209 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
210 if urlh is False:
211 assert not fatal
212 return False
213 content_type = urlh.headers.get('Content-Type', '')
214 webpage_bytes = urlh.read()
215 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
216 if m:
217 encoding = m.group(1)
218 else:
219 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
220 webpage_bytes[:1024])
221 if m:
222 encoding = m.group(1).decode('ascii')
223 else:
224 encoding = 'utf-8'
225 if self._downloader.params.get('dump_intermediate_pages', False):
226 try:
227 url = url_or_request.get_full_url()
228 except AttributeError:
229 url = url_or_request
230 self.to_screen(u'Dumping request to ' + url)
231 dump = base64.b64encode(webpage_bytes).decode('ascii')
232 self._downloader.to_screen(dump)
233 if self._downloader.params.get('write_pages', False):
234 try:
235 url = url_or_request.get_full_url()
236 except AttributeError:
237 url = url_or_request
238 if len(url) > 200:
239 h = hashlib.md5(url).hexdigest()
240 url = url[:200 - len(h)] + h
241 raw_filename = ('%s_%s.dump' % (video_id, url))
242 filename = sanitize_filename(raw_filename, restricted=True)
243 self.to_screen(u'Saving request to ' + filename)
244 with open(filename, 'wb') as outf:
245 outf.write(webpage_bytes)
246
247 content = webpage_bytes.decode(encoding, 'replace')
248 return (content, urlh)
249
250 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
251 """ Returns the data of the page as a string """
252 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
253 if res is False:
254 return res
255 else:
256 content, _ = res
257 return content
258
259 def _download_xml(self, url_or_request, video_id,
260 note=u'Downloading XML', errnote=u'Unable to download XML',
261 transform_source=None):
262 """Return the xml as an xml.etree.ElementTree.Element"""
263 xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
264 if transform_source:
265 xml_string = transform_source(xml_string)
266 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
267
268 def _download_json(self, url_or_request, video_id,
269 note=u'Downloading JSON metadata',
270 errnote=u'Unable to download JSON metadata'):
271 json_string = self._download_webpage(url_or_request, video_id, note, errnote)
272 try:
273 return json.loads(json_string)
274 except ValueError as ve:
275 raise ExtractorError('Failed to download JSON', cause=ve)
276
277 def report_warning(self, msg, video_id=None):
278 idstr = u'' if video_id is None else u'%s: ' % video_id
279 self._downloader.report_warning(
280 u'[%s] %s%s' % (self.IE_NAME, idstr, msg))
281
282 def to_screen(self, msg):
283 """Print msg to screen, prefixing it with '[ie_name]'"""
284 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
285
286 def report_extraction(self, id_or_name):
287 """Report information extraction."""
288 self.to_screen(u'%s: Extracting information' % id_or_name)
289
290 def report_download_webpage(self, video_id):
291 """Report webpage download."""
292 self.to_screen(u'%s: Downloading webpage' % video_id)
293
294 def report_age_confirmation(self):
295 """Report attempt to confirm age."""
296 self.to_screen(u'Confirming age')
297
298 def report_login(self):
299 """Report attempt to log in."""
300 self.to_screen(u'Logging in')
301
302 #Methods for following #608
303 @staticmethod
304 def url_result(url, ie=None, video_id=None):
305 """Returns a url that points to a page that should be processed"""
306 #TODO: ie should be the class used for getting the info
307 video_info = {'_type': 'url',
308 'url': url,
309 'ie_key': ie}
310 if video_id is not None:
311 video_info['id'] = video_id
312 return video_info
313 @staticmethod
314 def playlist_result(entries, playlist_id=None, playlist_title=None):
315 """Returns a playlist"""
316 video_info = {'_type': 'playlist',
317 'entries': entries}
318 if playlist_id:
319 video_info['id'] = playlist_id
320 if playlist_title:
321 video_info['title'] = playlist_title
322 return video_info
323
324 def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
325 """
326 Perform a regex search on the given string, using a single or a list of
327 patterns returning the first matching group.
328 In case of failure return a default value or raise a WARNING or a
329 RegexNotFoundError, depending on fatal, specifying the field name.
330 """
331 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
332 mobj = re.search(pattern, string, flags)
333 else:
334 for p in pattern:
335 mobj = re.search(p, string, flags)
336 if mobj: break
337
338 if os.name != 'nt' and sys.stderr.isatty():
339 _name = u'\033[0;34m%s\033[0m' % name
340 else:
341 _name = name
342
343 if mobj:
344 # return the first matching group
345 return next(g for g in mobj.groups() if g is not None)
346 elif default is not _NO_DEFAULT:
347 return default
348 elif fatal:
349 raise RegexNotFoundError(u'Unable to extract %s' % _name)
350 else:
351 self._downloader.report_warning(u'unable to extract %s; '
352 u'please report this issue on http://yt-dl.org/bug' % _name)
353 return None
354
355 def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
356 """
357 Like _search_regex, but strips HTML tags and unescapes entities.
358 """
359 res = self._search_regex(pattern, string, name, default, fatal, flags)
360 if res:
361 return clean_html(res).strip()
362 else:
363 return res
364
365 def _get_login_info(self):
366 """
367 Get the the login info as (username, password)
368 It will look in the netrc file using the _NETRC_MACHINE value
369 If there's no info available, return (None, None)
370 """
371 if self._downloader is None:
372 return (None, None)
373
374 username = None
375 password = None
376 downloader_params = self._downloader.params
377
378 # Attempt to use provided username and password or .netrc data
379 if downloader_params.get('username', None) is not None:
380 username = downloader_params['username']
381 password = downloader_params['password']
382 elif downloader_params.get('usenetrc', False):
383 try:
384 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
385 if info is not None:
386 username = info[0]
387 password = info[2]
388 else:
389 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
390 except (IOError, netrc.NetrcParseError) as err:
391 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
392
393 return (username, password)
394
    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        """Return regexes matching the og:<prop> meta tag in either
        attribute order."""
        # Captures content="..." or content='...' (two alternative groups).
        content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
        # The og attribute may be declared via either name= or property=.
        property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
        template = r'<meta[^>]+?%s[^>]+?%s'
        # property/content can appear in either order inside the tag.
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]
405
406 def _og_search_property(self, prop, html, name=None, **kargs):
407 if name is None:
408 name = 'OpenGraph %s' % prop
409 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
410 if escaped is None:
411 return None
412 return unescapeHTML(escaped)
413
414 def _og_search_thumbnail(self, html, **kargs):
415 return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
416
417 def _og_search_description(self, html, **kargs):
418 return self._og_search_property('description', html, fatal=False, **kargs)
419
420 def _og_search_title(self, html, **kargs):
421 return self._og_search_property('title', html, **kargs)
422
423 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
424 regexes = self._og_regexes('video')
425 if secure: regexes = self._og_regexes('video:secure_url') + regexes
426 return self._html_search_regex(regexes, html, name, **kargs)
427
    def _html_search_meta(self, name, html, display_name=None):
        """Return the content= value of a <meta> tag whose itemprop/name/
        property attribute equals ``name`` (non-fatal; None if absent)."""
        if display_name is None:
            display_name = name
        # (?x) verbose regex: the lookahead pins the attribute name anywhere
        # in the tag, then content= is captured regardless of attribute order.
        return self._html_search_regex(
            r'''(?ix)<meta
                    (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
                    [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
            html, display_name, fatal=False)
436
437 def _dc_search_uploader(self, html):
438 return self._html_search_meta('dc.creator', html, 'uploader')
439
    def _rta_search(self, html):
        """Return an age_limit of 18 if the page carries the RTA adult
        label, else 0."""
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        # (?x) mode: the literal spaces in the pattern are ignored.
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r' content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0
447
448 def _media_rating_search(self, html):
449 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
450 rating = self._html_search_meta('rating', html)
451
452 if not rating:
453 return None
454
455 RATING_TABLE = {
456 'safe for kids': 0,
457 'general': 8,
458 '14 years': 14,
459 'mature': 17,
460 'restricted': 19,
461 }
462 return RATING_TABLE.get(rating.lower(), None)
463
    def _sort_formats(self, formats):
        """Sort ``formats`` in-place from worst to best quality."""
        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            # An explicit 'preference' wins; otherwise derive one from the
            # protocol (plain http/https slightly preferred) and extension.
            preference = f.get('preference')
            if preference is None:
                proto = f.get('protocol')
                if proto is None:
                    proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

                preference = 0 if proto in ['http', 'https'] else -0.1
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            if f.get('vcodec') == 'none':  # audio only
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
                else:
                    ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = [u'flv', u'mp4', u'webm']
                else:
                    ORDER = [u'webm', u'flv', u'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # Tuples compare left-to-right; -1 marks "unknown" so that any
            # known value sorts above missing data.
            return (
                preference,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                ext_preference,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('format_id'),
            )
        formats.sort(key=_formats_key)
516
517
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # Prefix is empty (one result), a positive number, or the word 'all'.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        n = int(prefix)
        if n <= 0:
            raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            # Clamp oversized requests rather than failing.
            self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY