]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/common.py
[orf] Use new extraction method (Fixes #2057)
[yt-dlp.git] / youtube_dl / extractor / common.py
CommitLineData
d6983cb4
PH
1import base64
2import os
3import re
4import socket
5import sys
fc79158d 6import netrc
267ed0c5 7import xml.etree.ElementTree
d6983cb4
PH
8
9from ..utils import (
10 compat_http_client,
11 compat_urllib_error,
c7deaa4c 12 compat_urllib_parse_urlparse,
d6983cb4
PH
13 compat_str,
14
15 clean_html,
16 compiled_regex_type,
17 ExtractorError,
55b3e45b 18 RegexNotFoundError,
d41e6efc 19 sanitize_filename,
f38de77f 20 unescapeHTML,
d6983cb4 21)
# Unique sentinel used as the ``default`` argument of the regex search
# helpers, so that ``None`` can still be passed as a real default value.
_NO_DEFAULT = object()
d6983cb4 23
dca08720 24
d6983cb4
PH
25class InfoExtractor(object):
26 """Information Extractor class.
27
28 Information extractors are the classes that, given a URL, extract
29 information about the video (or videos) the URL refers to. This
30 information includes the real video URL, the video title, author and
31 others. The information is stored in a dictionary which is then
32 passed to the FileDownloader. The FileDownloader processes this
33 information possibly downloading the video to the file system, among
34 other possible outcomes.
35
36 The dictionaries must include the following fields:
37
38 id: Video identifier.
d6983cb4 39 title: Video title, unescaped.
d67b0b15 40
f49d89ee 41 Additionally, it must contain either a formats entry or a url one:
d67b0b15 42
f49d89ee
PH
43 formats: A list of dictionaries for each format available, ordered
44 from worst to best quality.
45
46 Potential fields:
d67b0b15
PH
47 * url Mandatory. The URL of the video file
48 * ext Will be calculated from url if missing
49 * format A human-readable description of the format
50 ("mp4 container with h264/opus").
51 Calculated from the format_id, width, height,
52 and format_note fields if missing.
53 * format_id A short description of the format
5d4f3985
PH
54 ("mp4_h264_opus" or "19").
55 Technically optional, but strongly recommended.
d67b0b15
PH
56 * format_note Additional info about the format
57 ("3D" or "DASH video")
58 * width Width of the video, if known
59 * height Height of the video, if known
f49d89ee 60 * resolution Textual description of width and height
7217e148 61 * tbr Average bitrate of audio and video in KBit/s
d67b0b15
PH
62 * abr Average audio bitrate in KBit/s
63 * acodec Name of the audio codec in use
64 * vbr Average video bitrate in KBit/s
65 * vcodec Name of the video codec in use
66 * filesize The number of bytes, if known in advance
67 * player_url SWF Player URL (used for rtmpdump).
c7deaa4c
PH
68 * protocol The protocol that will be used for the actual
69 download, lower-case.
70 "http", "https", "rtsp", "rtmp" or so.
f49d89ee 71 * preference Order number of this format. If this field is
08d13955
PH
72 present and not None, the formats get sorted
73 by this field.
f49d89ee
PH
74 -1 for default (order by other properties),
75 -2 or smaller for less than default.
5d73273f
PH
76 * quality Order number of the video quality of this
77 format, irrespective of the file format.
78 -1 for default (order by other properties),
79 -2 or smaller for less than default.
c0ba0f48 80 url: Final video URL.
d6983cb4 81 ext: Video filename extension.
d67b0b15
PH
82 format: The video format, defaults to ext (used for --get-format)
83 player_url: SWF Player URL (used for rtmpdump).
2f5865cc 84
d6983cb4
PH
85 The following fields are optional:
86
73e79f2a
PH
87 thumbnails: A list of dictionaries (with the entries "resolution" and
88 "url") for the varying thumbnails
d6983cb4
PH
89 thumbnail: Full URL to a video thumbnail image.
90 description: One-line video description.
91 uploader: Full name of the video uploader.
92 upload_date: Video upload date (YYYYMMDD).
93 uploader_id: Nickname or id of the video uploader.
94 location: Physical location of the video.
5d51a883
JMF
95 subtitles: The subtitle file contents as a dictionary in the format
96 {language: subtitles}.
c0ba0f48 97 duration: Length of the video in seconds, as an integer.
f3d29461 98 view_count: How many users have watched the video on the platform.
19e3dfc9
PH
99 like_count: Number of positive ratings of the video
100 dislike_count: Number of negative ratings of the video
101 comment_count: Number of comments on the video
8dbe9899 102 age_limit: Age restriction for the video, as an integer (years)
9103bbc5
JMF
103 webpage_url: The url to the video webpage, if given to youtube-dl it
104 should allow to get the same result again. (It will be set
105 by YoutubeDL if it's missing)
d6983cb4 106
deefc05b 107 Unless mentioned otherwise, the fields should be Unicode strings.
d6983cb4
PH
108
109 Subclasses of this one should re-define the _real_initialize() and
110 _real_extract() methods and define a _VALID_URL regexp.
111 Probably, they should also be added to the list of extractors.
112
113 _real_extract() must return a *list* of information dictionaries as
114 described above.
115
116 Finally, the _WORKING attribute should be set to False for broken IEs
117 in order to warn the users and skip the tests.
118 """
119
120 _ready = False
121 _downloader = None
122 _WORKING = True
123
124 def __init__(self, downloader=None):
125 """Constructor. Receives an optional downloader."""
126 self._ready = False
127 self.set_downloader(downloader)
128
129 @classmethod
130 def suitable(cls, url):
131 """Receives a URL and returns True if suitable for this IE."""
79cb2577
PH
132
133 # This does not use has/getattr intentionally - we want to know whether
134 # we have cached the regexp for *this* class, whereas getattr would also
135 # match the superclass
136 if '_VALID_URL_RE' not in cls.__dict__:
137 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
138 return cls._VALID_URL_RE.match(url) is not None
d6983cb4
PH
139
140 @classmethod
141 def working(cls):
142 """Getter method for _WORKING."""
143 return cls._WORKING
144
145 def initialize(self):
146 """Initializes an instance (authentication, etc)."""
147 if not self._ready:
148 self._real_initialize()
149 self._ready = True
150
151 def extract(self, url):
152 """Extracts URL information and returns it in list of dicts."""
153 self.initialize()
154 return self._real_extract(url)
155
156 def set_downloader(self, downloader):
157 """Sets the downloader for this IE."""
158 self._downloader = downloader
159
160 def _real_initialize(self):
161 """Real initialization process. Redefine in subclasses."""
162 pass
163
164 def _real_extract(self, url):
165 """Real extraction process. Redefine in subclasses."""
166 pass
167
56c73665
JMF
168 @classmethod
169 def ie_key(cls):
170 """A string for getting the InfoExtractor with get_info_extractor"""
171 return cls.__name__[:-2]
172
d6983cb4
PH
173 @property
174 def IE_NAME(self):
175 return type(self).__name__[:-2]
176
7cc3570e 177 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
d6983cb4
PH
178 """ Returns the response handle """
179 if note is None:
180 self.report_download_webpage(video_id)
181 elif note is not False:
7cc3570e
PH
182 if video_id is None:
183 self.to_screen(u'%s' % (note,))
184 else:
185 self.to_screen(u'%s: %s' % (video_id, note))
d6983cb4 186 try:
dca08720 187 return self._downloader.urlopen(url_or_request)
d6983cb4 188 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
aa94a6d3
PH
189 if errnote is False:
190 return False
d6983cb4
PH
191 if errnote is None:
192 errnote = u'Unable to download webpage'
7cc3570e
PH
193 errmsg = u'%s: %s' % (errnote, compat_str(err))
194 if fatal:
195 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
196 else:
197 self._downloader.report_warning(errmsg)
198 return False
d6983cb4 199
7cc3570e 200 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
d6983cb4 201 """ Returns a tuple (page content as string, URL handle) """
b9d3e163
PH
202
203 # Strip hashes from the URL (#1038)
204 if isinstance(url_or_request, (compat_str, str)):
205 url_or_request = url_or_request.partition('#')[0]
206
7cc3570e
PH
207 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
208 if urlh is False:
209 assert not fatal
210 return False
d6983cb4 211 content_type = urlh.headers.get('Content-Type', '')
f143d86a 212 webpage_bytes = urlh.read()
d6983cb4
PH
213 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
214 if m:
215 encoding = m.group(1)
216 else:
0d75ae2c 217 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
f143d86a
PH
218 webpage_bytes[:1024])
219 if m:
220 encoding = m.group(1).decode('ascii')
221 else:
222 encoding = 'utf-8'
d6983cb4
PH
223 if self._downloader.params.get('dump_intermediate_pages', False):
224 try:
225 url = url_or_request.get_full_url()
226 except AttributeError:
227 url = url_or_request
228 self.to_screen(u'Dumping request to ' + url)
229 dump = base64.b64encode(webpage_bytes).decode('ascii')
230 self._downloader.to_screen(dump)
d41e6efc
PH
231 if self._downloader.params.get('write_pages', False):
232 try:
233 url = url_or_request.get_full_url()
234 except AttributeError:
235 url = url_or_request
236 raw_filename = ('%s_%s.dump' % (video_id, url))
237 filename = sanitize_filename(raw_filename, restricted=True)
238 self.to_screen(u'Saving request to ' + filename)
239 with open(filename, 'wb') as outf:
240 outf.write(webpage_bytes)
241
d6983cb4
PH
242 content = webpage_bytes.decode(encoding, 'replace')
243 return (content, urlh)
244
7cc3570e 245 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
d6983cb4 246 """ Returns the data of the page as a string """
7cc3570e
PH
247 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
248 if res is False:
249 return res
250 else:
251 content, _ = res
252 return content
d6983cb4 253
2a275ab0 254 def _download_xml(self, url_or_request, video_id,
e2b38da9
PH
255 note=u'Downloading XML', errnote=u'Unable to download XML',
256 transform_source=None):
267ed0c5
JMF
257 """Return the xml as an xml.etree.ElementTree.Element"""
258 xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
e2b38da9
PH
259 if transform_source:
260 xml_string = transform_source(xml_string)
267ed0c5
JMF
261 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
262
f45f96f8
PH
263 def report_warning(self, msg, video_id=None):
264 idstr = u'' if video_id is None else u'%s: ' % video_id
265 self._downloader.report_warning(
266 u'[%s] %s%s' % (self.IE_NAME, idstr, msg))
267
d6983cb4
PH
268 def to_screen(self, msg):
269 """Print msg to screen, prefixing it with '[ie_name]'"""
270 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
271
272 def report_extraction(self, id_or_name):
273 """Report information extraction."""
274 self.to_screen(u'%s: Extracting information' % id_or_name)
275
276 def report_download_webpage(self, video_id):
277 """Report webpage download."""
278 self.to_screen(u'%s: Downloading webpage' % video_id)
279
280 def report_age_confirmation(self):
281 """Report attempt to confirm age."""
282 self.to_screen(u'Confirming age')
283
fc79158d
JMF
284 def report_login(self):
285 """Report attempt to log in."""
286 self.to_screen(u'Logging in')
287
d6983cb4 288 #Methods for following #608
c0d0b01f
JMF
289 @staticmethod
290 def url_result(url, ie=None, video_id=None):
d6983cb4
PH
291 """Returns a url that points to a page that should be processed"""
292 #TODO: ie should be the class used for getting the info
293 video_info = {'_type': 'url',
294 'url': url,
295 'ie_key': ie}
7012b23c
PH
296 if video_id is not None:
297 video_info['id'] = video_id
d6983cb4 298 return video_info
c0d0b01f
JMF
299 @staticmethod
300 def playlist_result(entries, playlist_id=None, playlist_title=None):
d6983cb4
PH
301 """Returns a playlist"""
302 video_info = {'_type': 'playlist',
303 'entries': entries}
304 if playlist_id:
305 video_info['id'] = playlist_id
306 if playlist_title:
307 video_info['title'] = playlist_title
308 return video_info
309
46374a56 310 def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
d6983cb4
PH
311 """
312 Perform a regex search on the given string, using a single or a list of
313 patterns returning the first matching group.
314 In case of failure return a default value or raise a WARNING or a
55b3e45b 315 RegexNotFoundError, depending on fatal, specifying the field name.
d6983cb4
PH
316 """
317 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
318 mobj = re.search(pattern, string, flags)
319 else:
320 for p in pattern:
321 mobj = re.search(p, string, flags)
322 if mobj: break
323
87a28127 324 if os.name != 'nt' and sys.stderr.isatty():
d6983cb4
PH
325 _name = u'\033[0;34m%s\033[0m' % name
326 else:
327 _name = name
328
329 if mobj:
330 # return the first matching group
331 return next(g for g in mobj.groups() if g is not None)
46374a56 332 elif default is not _NO_DEFAULT:
d6983cb4
PH
333 return default
334 elif fatal:
55b3e45b 335 raise RegexNotFoundError(u'Unable to extract %s' % _name)
d6983cb4
PH
336 else:
337 self._downloader.report_warning(u'unable to extract %s; '
98bcd283 338 u'please report this issue on http://yt-dl.org/bug' % _name)
d6983cb4
PH
339 return None
340
46374a56 341 def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
d6983cb4
PH
342 """
343 Like _search_regex, but strips HTML tags and unescapes entities.
344 """
345 res = self._search_regex(pattern, string, name, default, fatal, flags)
346 if res:
347 return clean_html(res).strip()
348 else:
349 return res
350
fc79158d
JMF
351 def _get_login_info(self):
352 """
353 Get the the login info as (username, password)
354 It will look in the netrc file using the _NETRC_MACHINE value
355 If there's no info available, return (None, None)
356 """
357 if self._downloader is None:
358 return (None, None)
359
360 username = None
361 password = None
362 downloader_params = self._downloader.params
363
364 # Attempt to use provided username and password or .netrc data
365 if downloader_params.get('username', None) is not None:
366 username = downloader_params['username']
367 password = downloader_params['password']
368 elif downloader_params.get('usenetrc', False):
369 try:
370 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
371 if info is not None:
372 username = info[0]
373 password = info[2]
374 else:
375 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
376 except (IOError, netrc.NetrcParseError) as err:
377 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
378
379 return (username, password)
380
46720279
JMF
381 # Helper functions for extracting OpenGraph info
382 @staticmethod
ab2d5247 383 def _og_regexes(prop):
78fb87b2 384 content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
9887c9b2 385 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
78fb87b2 386 template = r'<meta[^>]+?%s[^>]+?%s'
ab2d5247 387 return [
78fb87b2
JMF
388 template % (property_re, content_re),
389 template % (content_re, property_re),
ab2d5247 390 ]
46720279 391
3c4e6d83 392 def _og_search_property(self, prop, html, name=None, **kargs):
46720279 393 if name is None:
3c4e6d83 394 name = 'OpenGraph %s' % prop
ab2d5247 395 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
eb0a8398
PH
396 if escaped is None:
397 return None
398 return unescapeHTML(escaped)
46720279
JMF
399
400 def _og_search_thumbnail(self, html, **kargs):
3c4e6d83 401 return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
46720279
JMF
402
403 def _og_search_description(self, html, **kargs):
404 return self._og_search_property('description', html, fatal=False, **kargs)
405
406 def _og_search_title(self, html, **kargs):
407 return self._og_search_property('title', html, **kargs)
408
8ffa13e0 409 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
ab2d5247
JMF
410 regexes = self._og_regexes('video')
411 if secure: regexes = self._og_regexes('video:secure_url') + regexes
8ffa13e0 412 return self._html_search_regex(regexes, html, name, **kargs)
46720279 413
59040888
PH
414 def _html_search_meta(self, name, html, display_name=None):
415 if display_name is None:
416 display_name = name
417 return self._html_search_regex(
aaebed13
PH
418 r'''(?ix)<meta
419 (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
59040888
PH
420 [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
421 html, display_name, fatal=False)
422
423 def _dc_search_uploader(self, html):
424 return self._html_search_meta('dc.creator', html, 'uploader')
425
8dbe9899
PH
426 def _rta_search(self, html):
427 # See http://www.rtalabel.org/index.php?content=howtofaq#single
428 if re.search(r'(?ix)<meta\s+name="rating"\s+'
429 r' content="RTA-5042-1996-1400-1577-RTA"',
430 html):
431 return 18
432 return 0
433
59040888
PH
434 def _media_rating_search(self, html):
435 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
436 rating = self._html_search_meta('rating', html)
437
438 if not rating:
439 return None
440
441 RATING_TABLE = {
442 'safe for kids': 0,
443 'general': 8,
444 '14 years': 14,
445 'mature': 17,
446 'restricted': 19,
447 }
448 return RATING_TABLE.get(rating.lower(), None)
449
4bcc7bd1
PH
450 def _sort_formats(self, formats):
451 def _formats_key(f):
e6812ac9
PH
452 # TODO remove the following workaround
453 from ..utils import determine_ext
454 if not f.get('ext') and 'url' in f:
455 f['ext'] = determine_ext(f['url'])
456
4bcc7bd1
PH
457 preference = f.get('preference')
458 if preference is None:
c7deaa4c
PH
459 proto = f.get('protocol')
460 if proto is None:
461 proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
462
463 preference = 0 if proto in ['http', 'https'] else -0.1
4bcc7bd1
PH
464 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
465 preference -= 0.5
466
467 if f.get('vcodec') == 'none': # audio only
468 if self._downloader.params.get('prefer_free_formats'):
469 ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
470 else:
471 ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
472 ext_preference = 0
473 try:
474 audio_ext_preference = ORDER.index(f['ext'])
475 except ValueError:
476 audio_ext_preference = -1
477 else:
478 if self._downloader.params.get('prefer_free_formats'):
479 ORDER = [u'flv', u'mp4', u'webm']
480 else:
481 ORDER = [u'webm', u'flv', u'mp4']
482 try:
483 ext_preference = ORDER.index(f['ext'])
484 except ValueError:
485 ext_preference = -1
486 audio_ext_preference = 0
487
488 return (
489 preference,
5d73273f 490 f.get('quality') if f.get('quality') is not None else -1,
4bcc7bd1
PH
491 f.get('height') if f.get('height') is not None else -1,
492 f.get('width') if f.get('width') is not None else -1,
493 ext_preference,
494 f.get('vbr') if f.get('vbr') is not None else -1,
495 f.get('abr') if f.get('abr') is not None else -1,
496 audio_ext_preference,
497 f.get('filesize') if f.get('filesize') is not None else -1,
498 f.get('format_id'),
499 )
500 formats.sort(key=_formats_key)
59040888 501
8dbe9899 502
d6983cb4
PH
503class SearchInfoExtractor(InfoExtractor):
504 """
505 Base class for paged search queries extractors.
506 They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
507 Instances should define _SEARCH_KEY and _MAX_RESULTS.
508 """
509
510 @classmethod
511 def _make_valid_url(cls):
512 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
513
514 @classmethod
515 def suitable(cls, url):
516 return re.match(cls._make_valid_url(), url) is not None
517
518 def _real_extract(self, query):
519 mobj = re.match(self._make_valid_url(), query)
520 if mobj is None:
521 raise ExtractorError(u'Invalid search query "%s"' % query)
522
523 prefix = mobj.group('prefix')
524 query = mobj.group('query')
525 if prefix == '':
526 return self._get_n_results(query, 1)
527 elif prefix == 'all':
528 return self._get_n_results(query, self._MAX_RESULTS)
529 else:
530 n = int(prefix)
531 if n <= 0:
532 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
533 elif n > self._MAX_RESULTS:
534 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
535 n = self._MAX_RESULTS
536 return self._get_n_results(query, n)
537
538 def _get_n_results(self, query, n):
539 """Get a specified number of results for a query"""
416a5efc 540 raise NotImplementedError("This method must be implemented by subclasses")
0f818663
PH
541
542 @property
543 def SEARCH_KEY(self):
544 return self._SEARCH_KEY