import base64
import os
import re
import socket
import sys
import netrc
import xml.etree.ElementTree

from ..utils import (
    compat_http_client,
    compat_urllib_error,
    compat_str,

    clean_html,
    compiled_regex_type,
    ExtractorError,
    RegexNotFoundError,
    sanitize_filename,
    unescapeHTML,
)


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information, possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Instead of url and ext, formats can also be specified.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video.
    dislike_count:  Number of negative ratings of the video.
    comment_count:  Number of comments on the video.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen
    age_limit:      Age restriction for the video, as an integer (years).
    formats:        A list of dictionaries for each format available; it must
                    be ordered from worst to best quality. Potential fields:
                    * url         Mandatory. The URL of the video file
                    * ext         Will be calculated from url if missing
                    * format      A human-readable description of the format
                                  ("mp4 container with h264/opus").
                                  Calculated from the format_id, width, height
                                  and format_note fields if missing.
                    * format_id   A short description of the format
                                  ("mp4_h264_opus" or "19")
                    * format_note Additional info about the format
                                  ("3D" or "DASH video")
                    * width       Width of the video, if known
                    * height      Height of the video, if known
                    * abr         Average audio bitrate in KBit/s
                    * acodec      Name of the audio codec in use
                    * vbr         Average video bitrate in KBit/s
                    * vcodec      Name of the video codec in use
                    * filesize    The number of bytes, if known in advance
    webpage_url:    The URL to the video webpage; if given to youtube-dl it
                    should allow getting the same result again. (It will be
                    set by YoutubeDL if it's missing.)

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
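    # A hedged illustration (not part of the original file): one entry of the
    # list a hypothetical extractor's _real_extract() might return, using the
    # required fields plus a few optional ones described above. All names and
    # values below are invented for the example.
    #
    #     {
    #         'id': '12345',
    #         'url': 'http://example.com/videos/12345.mp4',
    #         'title': 'Example video',
    #         'ext': 'mp4',
    #         'uploader': 'Example Uploader',
    #         'upload_date': '20130101',
    #         'formats': [
    #             {'url': 'http://example.com/videos/12345_low.mp4',
    #              'ext': 'mp4', 'format_id': 'low', 'height': 360},
    #             {'url': 'http://example.com/videos/12345_hd.mp4',
    #              'ext': 'mp4', 'format_id': 'hd', 'height': 720},
    #         ],
    #     }
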
99
100 _ready = False
101 _downloader = None
102 _WORKING = True
103
104 def __init__(self, downloader=None):
105 """Constructor. Receives an optional downloader."""
106 self._ready = False
107 self.set_downloader(downloader)
108
109 @classmethod
110 def suitable(cls, url):
111 """Receives a URL and returns True if suitable for this IE."""
112
113 # This does not use has/getattr intentionally - we want to know whether
114 # we have cached the regexp for *this* class, whereas getattr would also
115 # match the superclass
116 if '_VALID_URL_RE' not in cls.__dict__:
117 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
118 return cls._VALID_URL_RE.match(url) is not None
119
120 @classmethod
121 def working(cls):
122 """Getter method for _WORKING."""
123 return cls._WORKING
124
125 def initialize(self):
126 """Initializes an instance (authentication, etc)."""
127 if not self._ready:
128 self._real_initialize()
129 self._ready = True
130
131 def extract(self, url):
132 """Extracts URL information and returns it in list of dicts."""
133 self.initialize()
134 return self._real_extract(url)
135
136 def set_downloader(self, downloader):
137 """Sets the downloader for this IE."""
138 self._downloader = downloader
139
140 def _real_initialize(self):
141 """Real initialization process. Redefine in subclasses."""
142 pass
143
144 def _real_extract(self, url):
145 """Real extraction process. Redefine in subclasses."""
146 pass
147
148 @classmethod
149 def ie_key(cls):
150 """A string for getting the InfoExtractor with get_info_extractor"""
151 return cls.__name__[:-2]
152
153 @property
154 def IE_NAME(self):
155 return type(self).__name__[:-2]
156
157 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
158 """ Returns the response handle """
159 if note is None:
160 self.report_download_webpage(video_id)
161 elif note is not False:
162 if video_id is None:
163 self.to_screen(u'%s' % (note,))
164 else:
165 self.to_screen(u'%s: %s' % (video_id, note))
166 try:
167 return self._downloader.urlopen(url_or_request)
168 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
169 if errnote is None:
170 errnote = u'Unable to download webpage'
171 errmsg = u'%s: %s' % (errnote, compat_str(err))
172 if fatal:
173 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
174 else:
175 self._downloader.report_warning(errmsg)
176 return False
177
178 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
179 """ Returns a tuple (page content as string, URL handle) """
180
181 # Strip hashes from the URL (#1038)
182 if isinstance(url_or_request, (compat_str, str)):
183 url_or_request = url_or_request.partition('#')[0]
184
185 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
186 if urlh is False:
187 assert not fatal
188 return False
189 content_type = urlh.headers.get('Content-Type', '')
190 webpage_bytes = urlh.read()
191 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
192 if m:
193 encoding = m.group(1)
194 else:
195 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
196 webpage_bytes[:1024])
197 if m:
198 encoding = m.group(1).decode('ascii')
199 else:
200 encoding = 'utf-8'
201 if self._downloader.params.get('dump_intermediate_pages', False):
202 try:
203 url = url_or_request.get_full_url()
204 except AttributeError:
205 url = url_or_request
206 self.to_screen(u'Dumping request to ' + url)
207 dump = base64.b64encode(webpage_bytes).decode('ascii')
208 self._downloader.to_screen(dump)
209 if self._downloader.params.get('write_pages', False):
210 try:
211 url = url_or_request.get_full_url()
212 except AttributeError:
213 url = url_or_request
214 raw_filename = ('%s_%s.dump' % (video_id, url))
215 filename = sanitize_filename(raw_filename, restricted=True)
216 self.to_screen(u'Saving request to ' + filename)
217 with open(filename, 'wb') as outf:
218 outf.write(webpage_bytes)
219
220 content = webpage_bytes.decode(encoding, 'replace')
221 return (content, urlh)
222
223 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
224 """ Returns the data of the page as a string """
225 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
226 if res is False:
227 return res
228 else:
229 content, _ = res
230 return content
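
    # Sketch of how a subclass would typically call the helper above
    # (hypothetical URL, video_id and note; not taken from this file):
    #
    #     webpage = self._download_webpage(url, video_id,
    #                                      note=u'Downloading video page')
    #
    # With fatal=False, a failed download is reported as a warning and False
    # is returned instead of raising ExtractorError.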

    def _download_xml(self, url_or_request, video_id,
                      note=u'Downloading XML', errnote=u'Unable to download XML'):
        """Return the xml as an xml.etree.ElementTree.Element"""
        xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    # Methods for following #608
    def url_result(self, url, ie=None, video_id=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info
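
    # Illustrative (assumed) use of the two result helpers above from a
    # hypothetical playlist extractor; the URL template, ie key and variable
    # names are made up for the example:
    #
    #     entries = [self.url_result(u'http://example.com/watch?v=%s' % vid,
    #                                ie='Example')
    #                for vid in video_ids]
    #     return self.playlist_result(entries, playlist_id, playlist_title)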

    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single pattern or
        a list of patterns, and return the first matching group.
        In case of failure, return a default value, emit a warning or raise a
        RegexNotFoundError, depending on fatal, mentioning the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        if sys.stderr.isatty() and os.name != 'nt':
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not None:
            return default
        elif fatal:
            raise RegexNotFoundError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on http://yt-dl.org/bug' % _name)
            return None
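
    # Example call (hypothetical pattern, page contents and field name),
    # relying on the first matching group being returned:
    #
    #     video_id = self._search_regex(
    #         r'data-video-id="(\d+)"', webpage, u'video id')
    #
    # With fatal=False a warning is printed and None is returned instead of
    # raising RegexNotFoundError.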

    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_login_info(self):
        """
        Get the login info as (username, password).
        It will look in the netrc file using the _NETRC_MACHINE value.
        If there's no info available, return (None, None).
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        return (username, password)
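
    # For the .netrc branch above, the expected entry in ~/.netrc follows the
    # standard netrc format, with the machine name taken from _NETRC_MACHINE.
    # A made-up example line (credentials are placeholders):
    #
    #     machine youtube login myaccount@example.com password mypassword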

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
        property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    def _og_search_property(self, prop, html, name=None, **kargs):
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)
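
    # The regexes built above are meant to match OpenGraph <meta> tags with
    # the attributes in either order; a hedged example of the markup they
    # target (URLs and titles are invented):
    #
    #     <meta property="og:title" content="Example title" />
    #     <meta content="http://example.com/thumb.jpg" property="og:image" />
    #
    # For the first tag, self._og_search_property('title', webpage) would
    # return u'Example title'.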

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _html_search_meta(self, name, html, display_name=None):
        if display_name is None:
            display_name = name
        return self._html_search_regex(
            r'''(?ix)<meta
                    (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
                    [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
            html, display_name, fatal=False)
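
    # _html_search_meta above matches generic <meta> tags by itemprop, name
    # or property; an illustrative (made-up) tag it would pick up:
    #
    #     <meta name="dc.creator" content="Jane Doe">
    #
    # so _dc_search_uploader(html) below would return u'Jane Doe'.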

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower(), None)


class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search query extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """
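
    # Query strings the regexp from _make_valid_url() below accepts, assuming
    # a hypothetical subclass with _SEARCH_KEY = 'ytsearch' (illustration
    # only, not from this file):
    #
    #     ytsearch:some query      -> first result
    #     ytsearch5:some query     -> first 5 results
    #     ytsearchall:some query   -> up to _MAX_RESULTS results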

    @classmethod
    def _make_valid_url(cls):
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY