]> jfr.im git - yt-dlp.git/blob - youtube_dl/extractor/common.py
69cdcdc1b5b75d1cac5733b34565f087c9dcddec
[yt-dlp.git] / youtube_dl / extractor / common.py
1 import base64
2 import os
3 import re
4 import socket
5 import sys
6 import netrc
7
8 from ..utils import (
9 compat_http_client,
10 compat_urllib_error,
11 compat_urllib_request,
12 compat_str,
13
14 clean_html,
15 compiled_regex_type,
16 ExtractorError,
17 unescapeHTML,
18 )
19
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Instead of url and ext, formats can also be specified.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    view_count:     How many users have watched the video on the platform.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen
    formats:        A list of dictionaries for each format available, it must
                    be ordered from worst to best quality. Potential fields:
                    * url       Mandatory. The URL of the video file
                    * ext       Will be calculated from url if missing
                    * format    A human-readable description of the format
                                ("mp4 container with h264/opus").
                                Calculated from width and height if missing.
                    * format_id A short description of the format
                                ("mp4_h264_opus" or "19")
                    * width     Width of the video, if known
                    * height    Height of the video, if known

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Whether _real_initialize() has run for this instance.
    _ready = False
    # The FileDownloader this IE reports to; set via set_downloader().
    _downloader = None
    # Set to False in subclasses for broken IEs (warns users, skips tests).
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        # Strips the conventional 'IE' suffix from the class name.
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        # Same convention as ie_key(): class name minus the 'IE' suffix.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            # note=False suppresses the status line entirely.
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err)

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        # Prefer the charset declared in the Content-Type header...
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            # ...then a <meta charset> in the first KiB of the body...
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            else:
                # ...falling back to UTF-8.
                encoding = 'utf-8'
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        # 'replace' so that undecodable bytes never abort the extraction.
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    # Methods for following #608
    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        ExtractorError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            # Try each pattern in turn, stopping at the first match.
            # Initialize mobj so an empty pattern list does not raise
            # NameError below.
            mobj = None
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        # Highlight the field name in blue when writing to a terminal
        # (ANSI escapes; skipped on Windows consoles).
        if sys.stderr.isatty() and os.name != 'nt':
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not None:
            return default
        elif fatal:
            raise ExtractorError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on http://yt-dl.org/bug' % _name)
            return None

    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # Best-effort: a bad .netrc only produces a warning.
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        return (username, password)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regex(prop):
        # Matches <meta property="og:PROP" content="..."> with either quote
        # style around the content value.
        return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        """Search html for the given OpenGraph property, unescaping entities."""
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs)
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', **kargs):
        # Prefer the HTTPS variant when both are present.
        return self._html_search_regex([self._og_regex('video:secure_url'),
                                        self._og_regex('video')],
                                       html, name, **kargs)
320
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # prefix is empty (default: 1 result), a positive integer, or 'all'
        # (meaning _MAX_RESULTS); everything after the colon is the query.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Parse the search query prefix and delegate to _get_n_results()."""
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp to the extractor's maximum, but tell the user.
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # Fixed typo in the error message: "sublclasses" -> "subclasses".
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY