# youtube_dl/extractor/common.py
1 import base64
2 import hashlib
3 import json
4 import os
5 import re
6 import socket
7 import sys
8 import netrc
9 import xml.etree.ElementTree
10
11 from ..utils import (
12 compat_http_client,
13 compat_urllib_error,
14 compat_urllib_parse_urlparse,
15 compat_str,
16
17 clean_html,
18 compiled_regex_type,
19 ExtractorError,
20 RegexNotFoundError,
21 sanitize_filename,
22 unescapeHTML,
23 )
# Sentinel for "no default supplied" in _search_regex and friends; a plain
# object() is used because None is itself a legitimate default value.
_NO_DEFAULT = object()
25
26
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from url if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height.
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * vcodec     Name of the video codec in use
                    * filesize   The number of bytes, if known in advance
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp" or so.
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    duration:       Length of the video in seconds, as an integer.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # True once _real_initialize() has run for this instance (see initialize()).
    _ready = False
    # The FileDownloader/YoutubeDL instance; set through set_downloader().
    _downloader = None
    # Subclasses set this to False to mark a broken extractor (skips tests).
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)
132 @classmethod
133 def suitable(cls, url):
134 """Receives a URL and returns True if suitable for this IE."""
135
136 # This does not use has/getattr intentionally - we want to know whether
137 # we have cached the regexp for *this* class, whereas getattr would also
138 # match the superclass
139 if '_VALID_URL_RE' not in cls.__dict__:
140 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
141 return cls._VALID_URL_RE.match(url) is not None
142
143 @classmethod
144 def working(cls):
145 """Getter method for _WORKING."""
146 return cls._WORKING
147
148 def initialize(self):
149 """Initializes an instance (authentication, etc)."""
150 if not self._ready:
151 self._real_initialize()
152 self._ready = True
153
154 def extract(self, url):
155 """Extracts URL information and returns it in list of dicts."""
156 self.initialize()
157 return self._real_extract(url)
158
159 def set_downloader(self, downloader):
160 """Sets the downloader for this IE."""
161 self._downloader = downloader
162
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # Intentionally a no-op in the base class.
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # Intentionally a no-op in the base class.
        pass
    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        # Extractor class names end in 'IE' by convention; strip that suffix.
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        # Instance-level counterpart of ie_key(); used to prefix screen output.
        return type(self).__name__[:-2]
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            # note=False suppresses the status line entirely.
            if video_id is None:
                self.to_screen(u'%s' % (note,))
            else:
                self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # errnote=False: caller handles failures silently, just return False.
            if errnote is False:
                return False
            if errnote is None:
                errnote = u'Unable to download webpage'
            errmsg = u'%s: %s' % (errnote, compat_str(err))
            if fatal:
                # Pass the original traceback along and chain the cause.
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self._downloader.report_warning(errmsg)
                return False
203 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
204 """ Returns a tuple (page content as string, URL handle) """
205
206 # Strip hashes from the URL (#1038)
207 if isinstance(url_or_request, (compat_str, str)):
208 url_or_request = url_or_request.partition('#')[0]
209
210 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
211 if urlh is False:
212 assert not fatal
213 return False
214 content_type = urlh.headers.get('Content-Type', '')
215 webpage_bytes = urlh.read()
216 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
217 if m:
218 encoding = m.group(1)
219 else:
220 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
221 webpage_bytes[:1024])
222 if m:
223 encoding = m.group(1).decode('ascii')
224 elif webpage_bytes.startswith(b'\xff\xfe'):
225 encoding = 'utf-16'
226 else:
227 encoding = 'utf-8'
228 if self._downloader.params.get('dump_intermediate_pages', False):
229 try:
230 url = url_or_request.get_full_url()
231 except AttributeError:
232 url = url_or_request
233 self.to_screen(u'Dumping request to ' + url)
234 dump = base64.b64encode(webpage_bytes).decode('ascii')
235 self._downloader.to_screen(dump)
236 if self._downloader.params.get('write_pages', False):
237 try:
238 url = url_or_request.get_full_url()
239 except AttributeError:
240 url = url_or_request
241 if len(url) > 200:
242 h = u'___' + hashlib.md5(url).hexdigest()
243 url = url[:200 - len(h)] + h
244 raw_filename = ('%s_%s.dump' % (video_id, url))
245 filename = sanitize_filename(raw_filename, restricted=True)
246 self.to_screen(u'Saving request to ' + filename)
247 with open(filename, 'wb') as outf:
248 outf.write(webpage_bytes)
249
250 content = webpage_bytes.decode(encoding, 'replace')
251 return (content, urlh)
252
253 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
254 """ Returns the data of the page as a string """
255 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
256 if res is False:
257 return res
258 else:
259 content, _ = res
260 return content
261
262 def _download_xml(self, url_or_request, video_id,
263 note=u'Downloading XML', errnote=u'Unable to download XML',
264 transform_source=None):
265 """Return the xml as an xml.etree.ElementTree.Element"""
266 xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
267 if transform_source:
268 xml_string = transform_source(xml_string)
269 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
270
271 def _download_json(self, url_or_request, video_id,
272 note=u'Downloading JSON metadata',
273 errnote=u'Unable to download JSON metadata'):
274 json_string = self._download_webpage(url_or_request, video_id, note, errnote)
275 try:
276 return json.loads(json_string)
277 except ValueError as ve:
278 raise ExtractorError('Failed to download JSON', cause=ve)
279
280 def report_warning(self, msg, video_id=None):
281 idstr = u'' if video_id is None else u'%s: ' % video_id
282 self._downloader.report_warning(
283 u'[%s] %s%s' % (self.IE_NAME, idstr, msg))
284
285 def to_screen(self, msg):
286 """Print msg to screen, prefixing it with '[ie_name]'"""
287 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
288
289 def report_extraction(self, id_or_name):
290 """Report information extraction."""
291 self.to_screen(u'%s: Extracting information' % id_or_name)
292
293 def report_download_webpage(self, video_id):
294 """Report webpage download."""
295 self.to_screen(u'%s: Downloading webpage' % video_id)
296
297 def report_age_confirmation(self):
298 """Report attempt to confirm age."""
299 self.to_screen(u'Confirming age')
300
301 def report_login(self):
302 """Report attempt to log in."""
303 self.to_screen(u'Logging in')
304
305 #Methods for following #608
306 @staticmethod
307 def url_result(url, ie=None, video_id=None):
308 """Returns a url that points to a page that should be processed"""
309 #TODO: ie should be the class used for getting the info
310 video_info = {'_type': 'url',
311 'url': url,
312 'ie_key': ie}
313 if video_id is not None:
314 video_info['id'] = video_id
315 return video_info
316 @staticmethod
317 def playlist_result(entries, playlist_id=None, playlist_title=None):
318 """Returns a playlist"""
319 video_info = {'_type': 'playlist',
320 'entries': entries}
321 if playlist_id:
322 video_info['id'] = playlist_id
323 if playlist_title:
324 video_info['title'] = playlist_title
325 return video_info
326
327 def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
328 """
329 Perform a regex search on the given string, using a single or a list of
330 patterns returning the first matching group.
331 In case of failure return a default value or raise a WARNING or a
332 RegexNotFoundError, depending on fatal, specifying the field name.
333 """
334 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
335 mobj = re.search(pattern, string, flags)
336 else:
337 for p in pattern:
338 mobj = re.search(p, string, flags)
339 if mobj: break
340
341 if os.name != 'nt' and sys.stderr.isatty():
342 _name = u'\033[0;34m%s\033[0m' % name
343 else:
344 _name = name
345
346 if mobj:
347 # return the first matching group
348 return next(g for g in mobj.groups() if g is not None)
349 elif default is not _NO_DEFAULT:
350 return default
351 elif fatal:
352 raise RegexNotFoundError(u'Unable to extract %s' % _name)
353 else:
354 self._downloader.report_warning(u'unable to extract %s; '
355 u'please report this issue on http://yt-dl.org/bug' % _name)
356 return None
357
358 def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
359 """
360 Like _search_regex, but strips HTML tags and unescapes entities.
361 """
362 res = self._search_regex(pattern, string, name, default, fatal, flags)
363 if res:
364 return clean_html(res).strip()
365 else:
366 return res
367
    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # A broken/missing .netrc is not fatal: warn and fall through
                # with (None, None) so extraction can proceed anonymously.
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        return (username, password)
398 # Helper functions for extracting OpenGraph info
399 @staticmethod
400 def _og_regexes(prop):
401 content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
402 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
403 template = r'<meta[^>]+?%s[^>]+?%s'
404 return [
405 template % (property_re, content_re),
406 template % (content_re, property_re),
407 ]
408
409 def _og_search_property(self, prop, html, name=None, **kargs):
410 if name is None:
411 name = 'OpenGraph %s' % prop
412 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
413 if escaped is None:
414 return None
415 return unescapeHTML(escaped)
416
417 def _og_search_thumbnail(self, html, **kargs):
418 return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
419
420 def _og_search_description(self, html, **kargs):
421 return self._og_search_property('description', html, fatal=False, **kargs)
422
423 def _og_search_title(self, html, **kargs):
424 return self._og_search_property('title', html, **kargs)
425
426 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
427 regexes = self._og_regexes('video')
428 if secure: regexes = self._og_regexes('video:secure_url') + regexes
429 return self._html_search_regex(regexes, html, name, **kargs)
430
431 def _html_search_meta(self, name, html, display_name=None):
432 if display_name is None:
433 display_name = name
434 return self._html_search_regex(
435 r'''(?ix)<meta
436 (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
437 [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
438 html, display_name, fatal=False)
439
440 def _dc_search_uploader(self, html):
441 return self._html_search_meta('dc.creator', html, 'uploader')
442
443 def _rta_search(self, html):
444 # See http://www.rtalabel.org/index.php?content=howtofaq#single
445 if re.search(r'(?ix)<meta\s+name="rating"\s+'
446 r' content="RTA-5042-1996-1400-1577-RTA"',
447 html):
448 return 18
449 return 0
450
451 def _media_rating_search(self, html):
452 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
453 rating = self._html_search_meta('rating', html)
454
455 if not rating:
456 return None
457
458 RATING_TABLE = {
459 'safe for kids': 0,
460 'general': 8,
461 '14 years': 14,
462 'mature': 17,
463 'restricted': 19,
464 }
465 return RATING_TABLE.get(rating.lower(), None)
466
    def _sort_formats(self, formats):
        """Sort `formats` in place from worst to best quality (see class docstring)."""
        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            preference = f.get('preference')
            if preference is None:
                proto = f.get('protocol')
                if proto is None:
                    proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

                # Plain HTTP(S) downloads rank above other protocols.
                preference = 0 if proto in ['http', 'https'] else -0.1
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            if f.get('vcodec') == 'none':  # audio only
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
                else:
                    ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    # Unknown extensions sort below all listed ones.
                    audio_ext_preference = -1
            else:
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = [u'flv', u'mp4', u'webm']
                else:
                    ORDER = [u'webm', u'flv', u'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # The tuple is compared lexicographically; missing numeric fields
            # sort as -1 (worst), and format_id is the final tie-breaker.
            return (
                preference,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                ext_preference,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('format_id'),
            )
        formats.sort(key=_formats_key)
519
520
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # prefix is empty (one result), a positive integer, or the word 'all'.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            # Bare '<key>:query' means a single result.
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp to the service's maximum, with a warning.
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY