youtube_dl/extractor/common.py
import base64
import os
import re
import socket
import sys
import netrc

from ..utils import (
    compat_http_client,
    compat_urllib_error,
    compat_urllib_request,
    compat_str,

    clean_html,
    compiled_regex_type,
    ExtractorError,
    unescapeHTML,
)

class InfoExtractor(object):
21 """Information Extractor class.
22
23 Information extractors are the classes that, given a URL, extract
24 information about the video (or videos) the URL refers to. This
25 information includes the real video URL, the video title, author and
26 others. The information is stored in a dictionary which is then
27 passed to the FileDownloader. The FileDownloader processes this
28 information possibly downloading the video to the file system, among
29 other possible outcomes.
30
31 The dictionaries must include the following fields:
32
33 id: Video identifier.
34 url: Final video URL.
35 title: Video title, unescaped.
36 ext: Video filename extension.
37
38 Instead of url and ext, formats can also specified.
39
40 The following fields are optional:
41
42 format: The video format, defaults to ext (used for --get-format)
43 thumbnails: A list of dictionaries (with the entries "resolution" and
44 "url") for the varying thumbnails
45 thumbnail: Full URL to a video thumbnail image.
46 description: One-line video description.
47 uploader: Full name of the video uploader.
48 upload_date: Video upload date (YYYYMMDD).
49 uploader_id: Nickname or id of the video uploader.
50 location: Physical location of the video.
51 player_url: SWF Player URL (used for rtmpdump).
52 subtitles: The subtitle file contents as a dictionary in the format
53 {language: subtitles}.
54 view_count: How many users have watched the video on the platform.
55 urlhandle: [internal] The urlHandle to be used to download the file,
56 like returned by urllib.request.urlopen
57 age_limit: Age restriction for the video, as an integer (years)
58 formats: A list of dictionaries for each format available, it must
59 be ordered from worst to best quality. Potential fields:
60 * url Mandatory. The URL of the video file
61 * ext Will be calculated from url if missing
62 * format A human-readable description of the format
63 ("mp4 container with h264/opus").
64 Calculated from width and height if missing.
65 * format_id A short description of the format
66 ("mp4_h264_opus" or "19")
67 * width Width of the video, if known
68 * height Height of the video, if known
69
70 Unless mentioned otherwise, the fields should be Unicode strings.
71
72 Subclasses of this one should re-define the _real_initialize() and
73 _real_extract() methods and define a _VALID_URL regexp.
74 Probably, they should also be added to the list of extractors.
75
76 _real_extract() must return a *list* of information dictionaries as
77 described above.
78
79 Finally, the _WORKING attribute should be set to False for broken IEs
80 in order to warn the users and skip the tests.
81 """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err)

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            else:
                encoding = 'utf-8'
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    # Methods for following #608
    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single pattern or a
        list of patterns, returning the first matching group.
        In case of failure return a default value or raise a WARNING or an
        ExtractorError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        if sys.stderr.isatty() and os.name != 'nt':
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not None:
            return default
        elif fatal:
            raise ExtractorError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on http://yt-dl.org/bug' % _name)
            return None

    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        return (username, password)
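
    # For reference: with the --netrc option (the 'usenetrc' param above), the
    # lookup uses the standard ~/.netrc format, matching the extractor's
    # _NETRC_MACHINE value as the machine name. A hypothetical entry:
    #
    #     machine example login myuser password mypass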

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regex(prop):
        return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs)
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', **kargs):
        return self._html_search_regex([self._og_regex('video:secure_url'),
                                        self._og_regex('video')],
                                       html, name, **kargs)

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

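# --- Illustration (not part of the original module) ---------------------------
# A minimal sketch of a concrete extractor built on InfoExtractor, following
# the class docstring above: define _VALID_URL, implement _real_extract(), and
# return a list of info dictionaries with the mandatory id/url/title/ext
# fields. ExampleIE, example.com and the "file" regex are hypothetical; real
# extractors live in their own modules and are added to the extractors list.
class ExampleIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Fetch the page; progress is reported under the extractor's IE_NAME.
        webpage = self._download_webpage(url, video_id)

        # The OpenGraph helpers and _search_regex cover the common cases.
        title = self._og_search_title(webpage)
        video_url = self._search_regex(
            r'"file"\s*:\s*"([^"]+)"', webpage, u'video URL')

        return [{
            'id':        video_id,
            'url':       video_url,
            'title':     title,
            'ext':       u'mp4',
            'thumbnail': self._og_search_thumbnail(webpage),
            'age_limit': self._rta_search(webpage),
        }]
# -------------------------------------------------------------------------------
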
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search query extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY
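

# --- Illustration (not part of the original module) ---------------------------
# A minimal sketch of a SearchInfoExtractor subclass, assuming a hypothetical
# 'examplesearch' key. Per the class docstring, queries then take the form
# "examplesearch:kittens", "examplesearch5:kittens" or "examplesearchall:kittens";
# _real_extract() parses the prefix and delegates to _get_n_results().
class ExampleSearchIE(SearchInfoExtractor):
    _SEARCH_KEY = 'examplesearch'
    _MAX_RESULTS = 50

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query (sketch only)."""
        # A real subclass would query the site's search API here; this
        # placeholder simply wraps made-up result URLs with url_result() so
        # that the FileDownloader hands them back to the matching extractor,
        # and bundles them into a playlist via playlist_result().
        result_urls = ['http://example.com/watch/%d' % i for i in range(n)]
        entries = [self.url_result(u, ie='Example') for u in result_urls]
        return self.playlist_result(entries, playlist_title=query)
# -------------------------------------------------------------------------------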