]> jfr.im git - yt-dlp.git/blob - youtube_dl/extractor/common.py
Move the opener to the YoutubeDL object.
[yt-dlp.git] / youtube_dl / extractor / common.py
1 import base64
2 import os
3 import re
4 import socket
5 import sys
6 import netrc
7
8 from ..utils import (
9 compat_http_client,
10 compat_urllib_error,
11 compat_str,
12
13 clean_html,
14 compiled_regex_type,
15 ExtractorError,
16 RegexNotFoundError,
17 sanitize_filename,
18 unescapeHTML,
19 )
20
21
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Instead of url and ext, formats can also be specified.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    view_count:     How many users have watched the video on the platform.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen
    age_limit:      Age restriction for the video, as an integer (years)
    formats:        A list of dictionaries for each format available, it must
                    be ordered from worst to best quality. Potential fields:
                    * url         Mandatory. The URL of the video file
                    * ext         Will be calculated from url if missing
                    * format      A human-readable description of the format
                                  ("mp4 container with h264/opus").
                                  Calculated from the format_id, width, height.
                                  and format_note fields if missing.
                    * format_id   A short description of the format
                                  ("mp4_h264_opus" or "19")
                    * format_note Additional info about the format
                                  ("3D" or "DASH video")
                    * width       Width of the video, if known
                    * height      Height of the video, if known
                    * abr         Average audio bitrate in KBit/s
                    * acodec      Name of the audio codec in use
                    * vbr         Average video bitrate in KBit/s
                    * vcodec      Name of the video codec in use
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Per-instance flag: True once _real_initialize() has run (see initialize()).
    _ready = False
    # The downloader (YoutubeDL/FileDownloader) instance serving this IE;
    # set via set_downloader().
    _downloader = None
    # Class-level flag; subclasses set it to False to mark a broken extractor.
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Idempotent: _real_initialize() runs at most once per instance.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        # Class names conventionally end in "IE"; strip that suffix.
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        """Human-readable extractor name: the class name without the "IE" suffix."""
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # note=None -> default "Downloading webpage" message;
        # note=False -> print nothing; any other value is shown verbatim.
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # Second positional argument carries the current traceback;
            # the original network error is preserved as the cause.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err)

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        # Encoding detection, in priority order:
        #   1. charset= parameter of the Content-Type header,
        #   2. <meta ... charset=...> within the first 1024 bytes of the body,
        #   3. utf-8 fallback.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            else:
                encoding = 'utf-8'
        if self._downloader.params.get('dump_intermediate_pages', False):
            # url_or_request may be a Request object or a plain string.
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            # base64 keeps arbitrary (possibly binary) page bytes printable.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            raw_filename = ('%s_%s.dump' % (video_id, url))
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen(u'Saving request to ' + filename)
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        # 'replace' avoids raising on bytes invalid in the detected encoding.
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    #Methods for following #608
    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info
    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        # id/title are only attached when provided (truthy).
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            # NOTE(review): if pattern is an *empty* sequence, mobj is never
            # assigned and the check below raises UnboundLocalError — confirm
            # callers never pass an empty list.
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj: break

        # Colorize the field name (ANSI blue) only on a tty and not on Windows.
        if sys.stderr.isatty() and os.name != 'nt':
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not None:
            return default
        elif fatal:
            raise RegexNotFoundError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on http://yt-dl.org/bug' % _name)
            return None

    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # Best-effort: a bad/missing .netrc only warns, never aborts.
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        return (username, password)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        """Return regexes matching <meta property="og:PROP" content=...> with
        the property/content attributes in either order."""
        # NOTE(review): the double-quoted branch uses [^>]+? so a content value
        # containing '>' is truncated — confirm this is acceptable.
        content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
        property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    def _og_search_property(self, prop, html, name=None, **kargs):
        """Search html for the OpenGraph property prop and return its
        HTML-unescaped content, or None when absent (with fatal=False)."""
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        """Return the og:image URL, or None (non-fatal)."""
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        """Return the og:description content, or None (non-fatal)."""
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        """Return the og:title content; fatal by default."""
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        """Return the og:video URL, preferring og:video:secure_url when
        secure=True."""
        regexes = self._og_regexes('video')
        if secure: regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _html_search_meta(self, name, html, display_name=None):
        """Return the content of a <meta name=...> / <meta property=...> tag,
        or None (non-fatal)."""
        if display_name is None:
            display_name = name
        return self._html_search_regex(
            r'''(?ix)<meta(?=[^>]+(?:name|property)=["\']%s["\'])
                    [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
            html, display_name, fatal=False)

    def _dc_search_uploader(self, html):
        """Return the Dublin Core creator (dc.creator meta tag), or None."""
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        """Return 18 if the page carries the RTA adult-content label, else 0."""
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        """Map the "rating" meta tag to an age limit in years, or None."""
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower(), None)
387
388
389
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Build the regex accepted by this search IE: the search key, an
        optional count prefix ('' / positive integer / 'all'), then the query."""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """A URL is suitable iff it matches the search-query pattern."""
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Parse the count prefix out of the search query and dispatch to
        _get_n_results with the requested (clamped) number of results."""
        match = re.match(self._make_valid_url(), query)
        if match is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        count_spec = match.group('prefix')
        terms = match.group('query')

        # Bare search key -> exactly one result.
        if count_spec == '':
            return self._get_n_results(terms, 1)
        # 'all' -> as many results as this IE supports.
        if count_spec == 'all':
            return self._get_n_results(terms, self._MAX_RESULTS)

        # Otherwise the prefix is a (decimal) result count.
        requested = int(count_spec)
        if requested <= 0:
            raise ExtractorError(u'invalid download number %s for query "%s"' % (requested, terms))
        if requested > self._MAX_RESULTS:
            # Clamp to the maximum, but tell the user we did so.
            self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, requested))
            requested = self._MAX_RESULTS
        return self._get_n_results(terms, requested)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        """Public accessor for the class's _SEARCH_KEY."""
        return self._SEARCH_KEY