]> jfr.im git - yt-dlp.git/blob - youtube_dl/extractor/common.py
[vimeo] Fix pro videos and player.vimeo.com urls
[yt-dlp.git] / youtube_dl / extractor / common.py
1 import base64
2 import os
3 import re
4 import socket
5 import sys
6 import netrc
7
8 from ..utils import (
9 compat_http_client,
10 compat_urllib_error,
11 compat_urllib_request,
12 compat_str,
13
14 clean_html,
15 compiled_regex_type,
16 ExtractorError,
17 RegexNotFoundError,
18 unescapeHTML,
19 )
20
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Instead of url and ext, formats can also be specified.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    view_count:     How many users have watched the video on the platform.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen
    age_limit:      Age restriction for the video, as an integer (years)
    formats:        A list of dictionaries for each format available, it must
                    be ordered from worst to best quality. Potential fields:
                    * url         Mandatory. The URL of the video file
                    * ext         Will be calculated from url if missing
                    * format      A human-readable description of the format
                                  ("mp4 container with h264/opus").
                                  Calculated from the format_id, width, height
                                  and format_note fields if missing.
                    * format_id   A short description of the format
                                  ("mp4_h264_opus" or "19")
                    * format_note Additional info about the format
                                  ("3D" or "DASH video")
                    * width       Width of the video, if known
                    * height      Height of the video, if known

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Whether _real_initialize() has already been run for this instance.
    _ready = False
    # The FileDownloader this extractor reports to; set via set_downloader().
    _downloader = None
    # Subclasses set this to False to mark a known-broken extractor.
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # _real_initialize() is only run once per instance, even if
        # extract() is called for several URLs.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        # Strips the trailing "IE" from the class name.
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        """Human-readable extractor name (class name without the "IE" suffix)."""
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            # note=False suppresses the status line entirely.
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # Preserve the original traceback and cause for better reports.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err)

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        # Prefer the charset declared in the Content-Type header ...
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            # ... then a <meta charset> tag near the top of the document ...
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            else:
                # ... and finally fall back to UTF-8.
                encoding = 'utf-8'
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            # base64 keeps binary-ish pages printable on any terminal.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        # 'replace' avoids crashing on pages with a wrongly declared charset.
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    # Methods for following #608
    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        # Initialize mobj so an empty pattern list falls through to the
        # no-match handling below instead of raising NameError.
        mobj = None
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        # Highlight the field name in blue when writing to a terminal.
        if sys.stderr.isatty() and os.name != 'nt':
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not None:
            return default
        elif fatal:
            raise RegexNotFoundError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on http://yt-dl.org/bug' % _name)
            return None

    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # Best effort: a broken .netrc only produces a warning.
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        return (username, password)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regex(prop):
        """Build a regex matching the <meta property="og:..."> tag for prop."""
        return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        """Search html for the given OpenGraph property, HTML-unescaped."""
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs)
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', **kargs):
        # Prefer the secure URL when both og:video variants are present.
        return self._html_search_regex([self._og_regex('video:secure_url'),
                                        self._og_regex('video')],
                                       html, name, **kargs)

    def _rta_search(self, html):
        """Return an age limit (18 or 0) based on the RTA label in html."""
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0
333
334
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # Pattern: "<key>:query", "<key>N:query" or "<key>all:query".
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        # Search "URLs" are matched against the generated pattern rather
        # than a _VALID_URL attribute.
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        match = re.match(self._make_valid_url(), query)
        if match is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = match.group('prefix')
        terms = match.group('query')

        # An empty prefix means "just the first result".
        if prefix == '':
            return self._get_n_results(terms, 1)
        if prefix == 'all':
            return self._get_n_results(terms, self._MAX_RESULTS)

        count = int(prefix)
        if count <= 0:
            raise ExtractorError(u'invalid download number %s for query "%s"' % (count, terms))
        if count > self._MAX_RESULTS:
            # Clamp oversized requests to the extractor's maximum.
            self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, count))
            count = self._MAX_RESULTS
        return self._get_n_results(terms, count)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        # Public accessor for the class-level search key.
        return self._SEARCH_KEY