]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/common.py
[youtube] Download DASH manifest
[yt-dlp.git] / youtube_dl / extractor / common.py
CommitLineData
d6983cb4 1import base64
3ec05685 2import hashlib
3d3538e4 3import json
d6983cb4
PH
4import os
5import re
6import socket
7import sys
fc79158d 8import netrc
267ed0c5 9import xml.etree.ElementTree
d6983cb4
PH
10
11from ..utils import (
12 compat_http_client,
13 compat_urllib_error,
c7deaa4c 14 compat_urllib_parse_urlparse,
d6983cb4
PH
15 compat_str,
16
17 clean_html,
18 compiled_regex_type,
19 ExtractorError,
55b3e45b 20 RegexNotFoundError,
d41e6efc 21 sanitize_filename,
f38de77f 22 unescapeHTML,
d6983cb4 23)
# Sentinel meaning "no default supplied"; lets callers pass None as a real
# default value to _search_regex / _html_search_regex.
_NO_DEFAULT = object()


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from url if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * vcodec     Name of the video codec in use
                    * filesize   The number of bytes, if known in advance
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp" or so.
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    duration:       Length of the video in seconds, as an integer.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        # Class name minus its last two characters (the "IE" suffix by convention).
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        """Class name of this instance minus its last two characters."""
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """Open url_or_request through the downloader and return the handle.

        note:    progress message; None prints the standard "Downloading
                 webpage" line, False prints nothing.
        errnote: error message prefix; False silences the error and makes
                 the method return False (regardless of fatal).
        fatal:   on a network error raise ExtractorError if true, otherwise
                 report a warning and return False.
        """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen(u'%s' % (note,))
            else:
                self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is False:
                return False
            if errnote is None:
                errnote = u'Unable to download webpage'
            errmsg = u'%s: %s' % (errnote, compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self._downloader.report_warning(errmsg)
                return False

    @staticmethod
    def _guess_encoding_from_content(content_type, webpage_bytes):
        """Pick the text encoding of a downloaded page.

        Uses the charset of the Content-Type header value if it has one,
        otherwise a charset found in a <meta> tag within the first 1024
        bytes of the body, otherwise 'utf-8'.
        """
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            return m.group(1)
        m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                      webpage_bytes[:1024])
        if m:
            return m.group(1).decode('ascii')
        return 'utf-8'

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns a tuple (page content as string, URL handle)

        Returns False instead when the request failed and _request_webpage
        did not raise (fatal is false or errnote is False).
        """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
        if urlh is False:
            # No assertion on fatal here: errnote=False yields False even
            # when fatal is true.
            return False
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        encoding = self._guess_encoding_from_content(content_type, webpage_bytes)

        params = self._downloader.params
        dump_pages = params.get('dump_intermediate_pages', False)
        write_pages = params.get('write_pages', False)
        if dump_pages or write_pages:
            # Request objects expose get_full_url(); plain strings do not.
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
        if dump_pages:
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if write_pages:
            if len(url) > 200:
                # Shorten overlong URLs, keeping them distinguishable by
                # appending a digest. md5 needs bytes, so encode text first.
                url_bytes = url if isinstance(url, bytes) else url.encode('utf-8')
                h = hashlib.md5(url_bytes).hexdigest()
                url = url[:200 - len(h)] + h
            raw_filename = ('%s_%s.dump' % (video_id, url))
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen(u'Saving request to ' + filename)
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # The server or page announced a codec Python does not know.
            content = webpage_bytes.decode('utf-8', 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the data of the page as a string (or False on a non-fatal failure) """
        res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
        if res is False:
            return res
        else:
            content, _ = res
            return content

    def _download_xml(self, url_or_request, video_id,
                      note=u'Downloading XML', errnote=u'Unable to download XML',
                      transform_source=None):
        """Return the xml as an xml.etree.ElementTree.Element

        transform_source, if given, is applied to the downloaded text before
        parsing. Returns False if the download itself returned False.
        """
        xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
        if xml_string is False:
            return xml_string
        if transform_source:
            xml_string = transform_source(xml_string)
        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))

    def _download_json(self, url_or_request, video_id,
                       note=u'Downloading JSON metadata',
                       errnote=u'Unable to download JSON metadata'):
        """Download a page and return it parsed as JSON.

        Raises ExtractorError if the page is not valid JSON. Returns False
        if the download itself returned False.
        """
        json_string = self._download_webpage(url_or_request, video_id, note, errnote)
        if json_string is False:
            return json_string
        try:
            return json.loads(json_string)
        except ValueError as ve:
            raise ExtractorError('Failed to download JSON', cause=ve)

    def report_warning(self, msg, video_id=None):
        """Report a warning through the downloader, prefixed with '[ie_name]'
        and, if given, the video id."""
        idstr = u'' if video_id is None else u'%s: ' % video_id
        self._downloader.report_warning(
            u'[%s] %s%s' % (self.IE_NAME, idstr, msg))

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        return video_info

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            # Start from "no match" so an empty pattern list is handled
            # like any other failed search.
            mobj = None
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        # Colorize the field name when writing to a terminal (not on Windows).
        if os.name != 'nt' and sys.stderr.isatty():
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not _NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on http://yt-dl.org/bug' % _name)
            return None

    def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        return (username, password)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        """Return two regexes matching the <meta> tag of OpenGraph property
        prop, one for each order of the property and content attributes."""
        content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
        property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    def _og_search_property(self, prop, html, name=None, **kargs):
        """Return the HTML-unescaped content of OpenGraph property prop in
        html; extra keyword arguments are passed on to _search_regex."""
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        """Non-fatal lookup of the og:image property."""
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        """Non-fatal lookup of the og:description property."""
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        """Lookup of the og:title property."""
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        """Lookup of the og:video property; with secure, og:video:secure_url
        is tried first."""
        regexes = self._og_regexes('video')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _html_search_meta(self, name, html, display_name=None):
        """Non-fatal lookup of the content attribute of the <meta> tag whose
        itemprop, name or property attribute equals name."""
        if display_name is None:
            display_name = name
        return self._html_search_regex(
            r'''(?ix)<meta
                    (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
                    [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
            html, display_name, fatal=False)

    def _dc_search_uploader(self, html):
        """Look up the dc.creator meta tag, reported as 'uploader'."""
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        """Return 18 if html carries the RTA rating meta tag, else 0."""
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        """Map the 'rating' meta tag of html to an age limit, or None when
        the tag is missing or its value is not in the table."""
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower(), None)

    def _sort_formats(self, formats):
        """Sort the list of format dictionaries in place, lowest key first.

        Formats are compared by preference, quality, height, width,
        container preference, tbr, vbr, abr, audio container preference,
        filesize and finally format_id. Missing numeric fields count as -1.
        As a side effect, a missing 'ext' is filled in from 'url'.
        """
        def _formats_key(f):
            # TODO remove the following workaround
            if not f.get('ext') and 'url' in f:
                from ..utils import determine_ext
                f['ext'] = determine_ext(f['url'])

            preference = f.get('preference')
            if preference is None:
                proto = f.get('protocol')
                if proto is None:
                    proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

                preference = 0 if proto in ['http', 'https'] else -0.1
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            # f.get: a format with neither ext nor url has no 'ext' key; the
            # resulting None is simply not in ORDER and ranks as -1.
            ext = f.get('ext')
            if f.get('vcodec') == 'none':  # audio only
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
                else:
                    ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(ext)
                except ValueError:
                    audio_ext_preference = -1
            else:
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = [u'flv', u'mp4', u'webm']
                else:
                    ORDER = [u'webm', u'flv', u'mp4']
                try:
                    ext_preference = ORDER.index(ext)
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            return (
                preference,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                ext_preference,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('filesize') if f.get('filesize') is not None else -1,
                # None cannot be ordered against str on Python 3; '' sorts
                # first, as None did on Python 2.
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
59040888 517
8dbe9899 518
d6983cb4
PH
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Build the regex source for this class's search pseudo-URLs."""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Return True if url is a search query for this extractor."""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the search pseudo-URL and fetch the requested number of
        results: one for an empty prefix, _MAX_RESULTS for 'all', otherwise
        the given number capped at _MAX_RESULTS (with a warning)."""
        parsed = re.match(self._make_valid_url(), query)
        if parsed is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = parsed.group('prefix')
        query = parsed.group('query')

        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        """Read-only access to the class's _SEARCH_KEY."""
        return self._SEARCH_KEY