import base64
import os
import re
import socket
import sys
import netrc
import xml.etree.ElementTree

from ..utils import (
    compat_http_client,
    compat_urllib_error,
    compat_urllib_parse_urlparse,
    compat_str,

    clean_html,
    compiled_regex_type,
    ExtractorError,
    RegexNotFoundError,
    sanitize_filename,
    unescapeHTML,
)
_NO_DEFAULT = object()


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information, possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from url if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * vbr        Average video bitrate in KBit/s
                    * vcodec     Name of the video codec in use
                    * filesize   The number of bytes, if known in advance
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp" or so.
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    duration:       Length of the video in seconds, as an integer.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage; if given to youtube-dl it
                    should allow getting the same result again. (It will be set
                    by YoutubeDL if it's missing)

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None
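
    # For example, a subclass defining a (hypothetical) pattern such as
    #     _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>[0-9]+)'
    # would make suitable() return True for
    # 'http://www.example.com/watch/12345' and False for unrelated URLs.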

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in a list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen(u'%s' % (note,))
            else:
                self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is False:
                return False
            if errnote is None:
                errnote = u'Unable to download webpage'
            errmsg = u'%s: %s' % (errnote, compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self._downloader.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns a tuple (page content as string, URL handle) """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
        if urlh is False:
            assert not fatal
            return False
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            else:
                encoding = 'utf-8'
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            raw_filename = ('%s_%s.dump' % (video_id, url))
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen(u'Saving request to ' + filename)
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the data of the page as a string """
        res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
        if res is False:
            return res
        else:
            content, _ = res
            return content
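
    # Typical use inside a subclass's _real_extract() (the URL and pattern
    # below are hypothetical; the helpers themselves are defined in this
    # class):
    #     webpage = self._download_webpage(url, video_id)
    #     title = self._html_search_regex(
    #         r'<h1[^>]*>(.+?)</h1>', webpage, u'title')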

    def _download_xml(self, url_or_request, video_id,
                      note=u'Downloading XML', errnote=u'Unable to download XML',
                      transform_source=None):
        """Return the xml as an xml.etree.ElementTree.Element"""
        xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
        if transform_source:
            xml_string = transform_source(xml_string)
        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))

    def report_warning(self, msg, video_id=None):
        idstr = u'' if video_id is None else u'%s: ' % video_id
        self._downloader.report_warning(
            u'[%s] %s%s' % (self.IE_NAME, idstr, msg))

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        return video_info

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info
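
    # Sketch of how a subclass might hand a URL back to the downloader
    # (the URLs, ie key and ids below are hypothetical):
    #     return self.url_result('http://example.com/watch/12345', ie='Example')
    # or, for a page listing several videos:
    #     return self.playlist_result(
    #         [self.url_result(u) for u in video_urls],
    #         playlist_id='chan1', playlist_title=u'Channel 1 uploads')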

    def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single pattern or a
        list of patterns, and return the first matching group.
        In case of failure, return the default value if one was supplied;
        otherwise raise a RegexNotFoundError or report a warning, depending on
        fatal, naming the missing field.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        if os.name != 'nt' and sys.stderr.isatty():
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not _NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on http://yt-dl.org/bug' % _name)
            return None
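
    # Example call (hypothetical pattern and input): the first capturing group
    # of the first matching pattern is returned, so
    #     self._search_regex(
    #         r'"videoId"\s*:\s*"([0-9a-f]+)"', webpage, u'video id',
    #         default=None)
    # yields the id string, or None (without raising) if nothing matches.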

    def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        return (username, password)
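
    # With --netrc, credentials for an extractor whose _NETRC_MACHINE is, say,
    # 'example' would be read from a ~/.netrc entry like (made-up values):
    #     machine example login myuser password hunter2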

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
        property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    def _og_search_property(self, prop, html, name=None, **kargs):
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)
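
    # The helpers above match standard OpenGraph <meta> tags in either
    # attribute order, e.g. on a (made-up) page containing
    #     <meta property="og:title" content="Example title" />
    #     <meta content="http://example.com/thumb.jpg" property="og:image" />
    # self._og_search_title(webpage) would return u'Example title' and
    # self._og_search_thumbnail(webpage) u'http://example.com/thumb.jpg'.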

    def _html_search_meta(self, name, html, display_name=None):
        if display_name is None:
            display_name = name
        return self._html_search_regex(
            r'''(?ix)<meta
                    (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
                    [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
            html, display_name, fatal=False)
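
    # This matches <meta> tags keyed by name, property or itemprop, e.g.
    # (made-up markup):
    #     <meta name="description" content="An example description">
    # so self._html_search_meta('description', webpage) would return
    # u'An example description'.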

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r' content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower(), None)

    def _sort_formats(self, formats):
        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            preference = f.get('preference')
            if preference is None:
                proto = f.get('protocol')
                if proto is None:
                    proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

                preference = 0 if proto in ['http', 'https'] else -0.1
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            if f.get('vcodec') == 'none':  # audio only
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
                else:
                    ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = [u'flv', u'mp4', u'webm']
                else:
                    ORDER = [u'webm', u'flv', u'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            return (
                preference,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                ext_preference,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('format_id'),
            )
        formats.sort(key=_formats_key)
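
    # Typical use (hypothetical values): build the formats list, then let this
    # helper order it from worst to best before returning the info dict:
    #     formats = [
    #         {'url': 'http://example.com/hd.mp4', 'ext': 'mp4', 'height': 720},
    #         {'url': 'http://example.com/sd.flv', 'ext': 'flv', 'height': 360},
    #     ]
    #     self._sort_formats(formats)
    #     return {'id': video_id, 'title': title, 'formats': formats}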


class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search query extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
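
    # For a subclass with (for instance) _SEARCH_KEY = 'examplesearch', the
    # generated pattern accepts queries such as:
    #     examplesearch:cute cats       -> first result only
    #     examplesearch5:cute cats      -> first 5 results
    #     examplesearchall:cute cats    -> up to _MAX_RESULTS results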

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY
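

# A minimal sketch of a SearchInfoExtractor subclass (every name, URL and
# pattern below is hypothetical; a real extractor would also be added to the
# extractor list):
#
#     class ExampleSearchIE(SearchInfoExtractor):
#         _SEARCH_KEY = 'examplesearch'
#         _MAX_RESULTS = 50
#         IE_NAME = 'example:search'
#
#         def _get_n_results(self, query, n):
#             result_url = 'http://example.com/search?q=' + query
#             webpage = self._download_webpage(
#                 result_url, u'query "%s"' % query)
#             paths = re.findall(r'href="(/watch/[0-9]+)"', webpage)[:n]
#             entries = [
#                 self.url_result('http://example.com' + path)
#                 for path in paths]
#             return self.playlist_result(entries, playlist_title=query)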