# youtube_dl/extractor/common.py
# (blob as of commit "[yahoo] Use centralized sorting, and add tbr field")
import base64
import os
import re
import socket
import sys
import netrc
import xml.etree.ElementTree

from ..utils import (
    compat_http_client,
    compat_urllib_error,
    compat_urllib_parse_urlparse,
    compat_str,

    clean_html,
    compiled_regex_type,
    ExtractorError,
    RegexNotFoundError,
    sanitize_filename,
    unescapeHTML,
)
_NO_DEFAULT = object()


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information, possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from url if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19")
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * vbr        Average video bitrate in KBit/s
                    * vcodec     Name of the video codec in use
                    * filesize   The number of bytes, if known in advance
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp" or so.
                    * preference Order number of this format. If this field is
                                 present, the formats get sorted by this field.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    duration:       Length of the video in seconds, as an integer.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage; if given to youtube-dl it
                    should allow getting the same result again. (It will be set
                    by YoutubeDL if it's missing)

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

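    # A minimal, purely illustrative sketch (not taken from any real extractor;
    # all values are made up) of the dictionary described above:
    #
    #     {
    #         'id': '12345',
    #         'title': 'Example video',
    #         'formats': [
    #             {'url': 'http://example.com/low.mp4', 'ext': 'mp4',
    #              'format_id': 'low', 'height': 360, 'tbr': 500},
    #             {'url': 'http://example.com/hd.mp4', 'ext': 'mp4',
    #              'format_id': 'hd', 'height': 720, 'tbr': 1500},
    #         ],
    #     }
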
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it as a list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        # The IE name is derived from the class name by dropping the trailing 'IE'
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen(u'%s' % (note,))
            else:
                self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is False:
                return False
            if errnote is None:
                errnote = u'Unable to download webpage'
            errmsg = u'%s: %s' % (errnote, compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self._downloader.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns a tuple (page content as string, URL handle) """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
        if urlh is False:
            assert not fatal
            return False
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            else:
                encoding = 'utf-8'
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            raw_filename = ('%s_%s.dump' % (video_id, url))
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen(u'Saving request to ' + filename)
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the data of the page as a string """
        res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
        if res is False:
            return res
        else:
            content, _ = res
            return content

    def _download_xml(self, url_or_request, video_id,
                      note=u'Downloading XML', errnote=u'Unable to download XML',
                      transform_source=None):
        """Return the xml as an xml.etree.ElementTree.Element"""
        xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
        if transform_source:
            xml_string = transform_source(xml_string)
        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))

    def report_warning(self, msg, video_id=None):
        idstr = u'' if video_id is None else u'%s: ' % video_id
        self._downloader.report_warning(
            u'[%s] %s%s' % (self.IE_NAME, idstr, msg))

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None):
        """Returns a URL that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        return video_info

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

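    # Illustrative sketch of how a subclass might use the two helpers above.
    # The URL, ids and title below are made up and not tied to any real site:
    #
    #     def _real_extract(self, url):
    #         entries = [
    #             self.url_result('http://example.com/video/%d' % i, video_id=str(i))
    #             for i in range(1, 4)
    #         ]
    #         return self.playlist_result(entries, playlist_id='example',
    #                                     playlist_title='Example playlist')
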
    def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single pattern or
        a list of patterns, and return the first matching group.
        In case of failure, return a default value, emit a warning or raise a
        RegexNotFoundError (depending on default and fatal), mentioning the
        field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        if os.name != 'nt' and sys.stderr.isatty():
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # Return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not _NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on http://yt-dl.org/bug' % _name)
            return None

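    # Typical usage from a subclass (the pattern and field name are hypothetical,
    # shown only to illustrate the default/fatal behaviour described above):
    #
    #     title = self._search_regex(
    #         r'<h1 class="title">([^<]+)</h1>', webpage, 'title')
    #     # Non-fatal variant that falls back to None instead of raising:
    #     season = self._search_regex(
    #         r'Season (\d+)', webpage, 'season number', default=None)
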
    def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_login_info(self):
        """
        Get the login info as (username, password).
        It will look in the netrc file using the _NETRC_MACHINE value.
        If there's no info available, return (None, None).
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        return (username, password)

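    # Example of a ~/.netrc entry that the helper above can read when the
    # usenetrc option (--netrc) is enabled and a subclass sets, for instance,
    # _NETRC_MACHINE = 'example' (the credentials are obviously made up):
    #
    #     machine example login myusername password mypassword
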
    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
        property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

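    # For reference, the two regexes above are meant to match OpenGraph meta
    # tags with the attributes in either order, e.g. (made-up markup):
    #
    #     <meta property="og:title" content="Some title" />
    #     <meta content="http://example.com/thumb.jpg" property="og:image" />
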
    def _og_search_property(self, prop, html, name=None, **kargs):
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _html_search_meta(self, name, html, display_name=None):
        if display_name is None:
            display_name = name
        return self._html_search_regex(
            r'''(?ix)<meta
                    (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
                    [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
            html, display_name, fatal=False)

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r' content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower(), None)

    def _sort_formats(self, formats):
        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            preference = f.get('preference')
            if preference is None:
                proto = f.get('protocol')
                if proto is None:
                    proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

                preference = 0 if proto in ['http', 'https'] else -0.1
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            if f.get('vcodec') == 'none':  # audio only
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
                else:
                    ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = [u'flv', u'mp4', u'webm']
                else:
                    ORDER = [u'webm', u'flv', u'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            return (
                preference,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                ext_preference,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('format_id'),
            )
        formats.sort(key=_formats_key)

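    # Illustrative sketch (with made-up values) of how an extractor is expected
    # to use the helper above; after the call the list is ordered from worst to
    # best according to _formats_key:
    #
    #     formats = [
    #         {'url': 'http://example.com/hd.mp4', 'ext': 'mp4', 'height': 720},
    #         {'url': 'http://example.com/low.flv', 'ext': 'flv', 'height': 240},
    #     ]
    #     self._sort_formats(formats)
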

class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search query extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

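    # For example, a subclass that sets _SEARCH_KEY = 'ytsearch' (as the YouTube
    # search extractor does) accepts queries of these forms:
    #
    #     ytsearch:some query        -> first result only
    #     ytsearch5:some query       -> first 5 results
    #     ytsearchall:some query     -> up to _MAX_RESULTS results
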
    @classmethod
    def _make_valid_url(cls):
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY