]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/common.py
release 2014.07.11.2
[yt-dlp.git] / youtube_dl / extractor / common.py
CommitLineData
d6983cb4 1import base64
3ec05685 2import hashlib
3d3538e4 3import json
4094b6e3 4import netrc
d6983cb4
PH
5import os
6import re
7import socket
8import sys
4094b6e3 9import time
267ed0c5 10import xml.etree.ElementTree
d6983cb4
PH
11
12from ..utils import (
13 compat_http_client,
14 compat_urllib_error,
c7deaa4c 15 compat_urllib_parse_urlparse,
d6983cb4
PH
16 compat_str,
17
18 clean_html,
19 compiled_regex_type,
20 ExtractorError,
55b3e45b 21 RegexNotFoundError,
d41e6efc 22 sanitize_filename,
f38de77f 23 unescapeHTML,
d6983cb4 24)
46374a56 25_NO_DEFAULT = object()
d6983cb4 26
dca08720 27
d6983cb4
PH
28class InfoExtractor(object):
29 """Information Extractor class.
30
31 Information extractors are the classes that, given a URL, extract
32 information about the video (or videos) the URL refers to. This
33 information includes the real video URL, the video title, author and
34 others. The information is stored in a dictionary which is then
35 passed to the FileDownloader. The FileDownloader processes this
36 information possibly downloading the video to the file system, among
37 other possible outcomes.
38
39 The dictionaries must include the following fields:
40
41 id: Video identifier.
d6983cb4 42 title: Video title, unescaped.
d67b0b15 43
f49d89ee 44 Additionally, it must contain either a formats entry or a url one:
d67b0b15 45
f49d89ee
PH
46 formats: A list of dictionaries for each format available, ordered
47 from worst to best quality.
48
49 Potential fields:
d67b0b15
PH
50 * url Mandatory. The URL of the video file
51 * ext Will be calculated from url if missing
52 * format A human-readable description of the format
53 ("mp4 container with h264/opus").
54 Calculated from the format_id, width, height,
55 and format_note fields if missing.
56 * format_id A short description of the format
5d4f3985
PH
57 ("mp4_h264_opus" or "19").
58 Technically optional, but strongly recommended.
d67b0b15
PH
59 * format_note Additional info about the format
60 ("3D" or "DASH video")
61 * width Width of the video, if known
62 * height Height of the video, if known
f49d89ee 63 * resolution Textual description of width and height
7217e148 64 * tbr Average bitrate of audio and video in KBit/s
d67b0b15
PH
65 * abr Average audio bitrate in KBit/s
66 * acodec Name of the audio codec in use
dd27fd17 67 * asr Audio sampling rate in Hertz
d67b0b15
PH
68 * vbr Average video bitrate in KBit/s
69 * vcodec Name of the video codec in use
1394ce65 70 * container Name of the container format
d67b0b15
PH
71 * filesize The number of bytes, if known in advance
72 * player_url SWF Player URL (used for rtmpdump).
c7deaa4c
PH
73 * protocol The protocol that will be used for the actual
74 download, lower-case.
db1f3888 75 "http", "https", "rtsp", "rtmp", "m3u8" or so.
f49d89ee 76 * preference Order number of this format. If this field is
08d13955 77 present and not None, the formats get sorted
38d63d84 78 by this field, regardless of all other values.
f49d89ee
PH
79 -1 for default (order by other properties),
80 -2 or smaller for less than default.
5d73273f
PH
81 * quality Order number of the video quality of this
82 format, irrespective of the file format.
83 -1 for default (order by other properties),
84 -2 or smaller for less than default.
c0ba0f48 85 url: Final video URL.
d6983cb4 86 ext: Video filename extension.
d67b0b15
PH
87 format: The video format, defaults to ext (used for --get-format)
88 player_url: SWF Player URL (used for rtmpdump).
2f5865cc 89
d6983cb4
PH
90 The following fields are optional:
91
0afef30b
PH
92 display_id An alternative identifier for the video, not necessarily
93 unique, but available before title. Typically, id is
94 something like "4234987", title "Dancing naked mole rats",
95 and display_id "dancing-naked-mole-rats"
d5519808
PH
96 thumbnails: A list of dictionaries, with the following entries:
97 * "url"
98 * "width" (optional, int)
99 * "height" (optional, int)
100 * "resolution" (optional, string "{width}x{height}",
101 deprecated)
d6983cb4
PH
102 thumbnail: Full URL to a video thumbnail image.
103 description: One-line video description.
104 uploader: Full name of the video uploader.
955c4514 105 timestamp: UNIX timestamp of the moment the video became available.
d6983cb4 106 upload_date: Video upload date (YYYYMMDD).
955c4514 107 If not explicitly set, calculated from timestamp.
d6983cb4
PH
108 uploader_id: Nickname or id of the video uploader.
109 location: Physical location of the video.
5d51a883
JMF
110 subtitles: The subtitle file contents as a dictionary in the format
111 {language: subtitles}.
c0ba0f48 112 duration: Length of the video in seconds, as an integer.
f3d29461 113 view_count: How many users have watched the video on the platform.
19e3dfc9
PH
114 like_count: Number of positive ratings of the video
115 dislike_count: Number of negative ratings of the video
116 comment_count: Number of comments on the video
8dbe9899 117 age_limit: Age restriction for the video, as an integer (years)
9103bbc5
JMF
118 webpage_url: The url to the video webpage, if given to youtube-dl it
119 should allow to get the same result again. (It will be set
120 by YoutubeDL if it's missing)
ad3bc6ac
PH
121 categories: A list of categories that the video falls in, for example
122 ["Sports", "Berlin"]
d6983cb4 123
deefc05b 124 Unless mentioned otherwise, the fields should be Unicode strings.
d6983cb4
PH
125
126 Subclasses of this one should re-define the _real_initialize() and
127 _real_extract() methods and define a _VALID_URL regexp.
128 Probably, they should also be added to the list of extractors.
129
d6983cb4
PH
130 Finally, the _WORKING attribute should be set to False for broken IEs
131 in order to warn the users and skip the tests.
132 """
133
134 _ready = False
135 _downloader = None
136 _WORKING = True
137
138 def __init__(self, downloader=None):
139 """Constructor. Receives an optional downloader."""
140 self._ready = False
141 self.set_downloader(downloader)
142
@classmethod
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE.

    The compiled _VALID_URL pattern is cached on the class itself.
    """
    # Look in cls.__dict__ directly instead of using has/getattr: we want
    # to know whether the regexp was cached for *this* class, and an
    # attribute lookup would also find one cached on a superclass.
    try:
        valid_url_re = cls.__dict__['_VALID_URL_RE']
    except KeyError:
        valid_url_re = cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    return valid_url_re.match(url) is not None
d6983cb4
PH
153
@classmethod
def working(cls):
    """Return the class's _WORKING flag."""
    is_working = cls._WORKING
    return is_working
158
def initialize(self):
    """Initializes an instance (authentication, etc).

    Does nothing when the instance is already marked as ready, so
    _real_initialize() runs at most once per successful initialization.
    """
    if self._ready:
        return
    self._real_initialize()
    self._ready = True
164
def extract(self, url):
    """Initialize the extractor if needed, then extract the URL information.

    Returns whatever _real_extract() returns for the given URL.
    """
    self.initialize()
    info = self._real_extract(url)
    return info
169
def set_downloader(self, downloader):
    """Attach the given downloader (may be None) to this IE."""
    self._downloader = downloader
173
174 def _real_initialize(self):
175 """Real initialization process. Redefine in subclasses."""
176 pass
177
178 def _real_extract(self, url):
179 """Real extraction process. Redefine in subclasses."""
180 pass
181
56c73665
JMF
@classmethod
def ie_key(cls):
    """A string for getting the InfoExtractor with get_info_extractor

    This is the class name with its last two characters removed
    (presumably the "IE" suffix of extractor class names).
    """
    class_name = cls.__name__
    return class_name[:-2]
186
d6983cb4
PH
@property
def IE_NAME(self):
    """Name of this extractor: the instance's class name minus its last
    two characters."""
    class_name = type(self).__name__
    return class_name[:-2]
190
7cc3570e 191 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
d6983cb4
PH
192 """ Returns the response handle """
193 if note is None:
194 self.report_download_webpage(video_id)
195 elif note is not False:
7cc3570e
PH
196 if video_id is None:
197 self.to_screen(u'%s' % (note,))
198 else:
199 self.to_screen(u'%s: %s' % (video_id, note))
d6983cb4 200 try:
dca08720 201 return self._downloader.urlopen(url_or_request)
d6983cb4 202 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
aa94a6d3
PH
203 if errnote is False:
204 return False
d6983cb4
PH
205 if errnote is None:
206 errnote = u'Unable to download webpage'
7cc3570e
PH
207 errmsg = u'%s: %s' % (errnote, compat_str(err))
208 if fatal:
209 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
210 else:
211 self._downloader.report_warning(errmsg)
212 return False
d6983cb4 213
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """ Returns a tuple (page content as string, URL handle)

    Returns False instead when _request_webpage() reports a failed request
    by returning False (fatal is false, or errnote is False).

    The text encoding is chosen, in this order, from: the charset
    parameter of the Content-Type header, a <meta ... charset=...> tag in
    the first 1024 bytes, a UTF-16 little-endian byte order mark, and
    finally UTF-8. Bytes that cannot be decoded are replaced; an unknown
    encoding name falls back to UTF-8.

    If the downloader param 'dump_intermediate_pages' is set, the raw
    page is printed base64-encoded; if 'write_pages' is set, the raw page
    is saved to a '.dump' file named after video_id and the URL.

    Raises ExtractorError (expected) when the page is a Websense block page.
    """

    # Strip hashes from the URL (#1038)
    if isinstance(url_or_request, (compat_str, str)):
        url_or_request = url_or_request.partition('#')[0]

    urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
    if urlh is False:
        # _request_webpage() also returns False for errnote=False, even
        # when fatal is true, so asserting "not fatal" here would turn a
        # requested silent failure into an AssertionError.
        return False
    content_type = urlh.headers.get('Content-Type', '')
    webpage_bytes = urlh.read()
    # Capture only the bare charset token: the value may be quoted and may
    # be followed by further ";"-separated parameters.
    m = re.match(
        r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=["\']?([^\s;"\']+)',
        content_type)
    if m:
        encoding = m.group(1)
    else:
        m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                      webpage_bytes[:1024])
        if m:
            encoding = m.group(1).decode('ascii')
        elif webpage_bytes.startswith(b'\xff\xfe'):
            encoding = 'utf-16'
        else:
            encoding = 'utf-8'

    def _request_url():
        # url_or_request is either a request object or a plain URL string
        try:
            return url_or_request.get_full_url()
        except AttributeError:
            return url_or_request

    if self._downloader.params.get('dump_intermediate_pages', False):
        url = _request_url()
        self.to_screen(u'Dumping request to ' + url)
        dump = base64.b64encode(webpage_bytes).decode('ascii')
        self._downloader.to_screen(dump)
    if self._downloader.params.get('write_pages', False):
        url = _request_url()
        basen = '%s_%s' % (video_id, url)
        if len(basen) > 240:
            # Keep the name at 240 characters, replacing the tail with a
            # hash of the full name.
            h = u'___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
            basen = basen[:240 - len(h)] + h
        raw_filename = basen + '.dump'
        filename = sanitize_filename(raw_filename, restricted=True)
        self.to_screen(u'Saving request to ' + filename)
        with open(filename, 'wb') as outf:
            outf.write(webpage_bytes)

    try:
        content = webpage_bytes.decode(encoding, 'replace')
    except LookupError:
        # Unknown encoding name
        content = webpage_bytes.decode('utf-8', 'replace')

    if (u'<title>Access to this site is blocked</title>' in content and
            u'Websense' in content[:512]):
        msg = u'Access to this webpage has been blocked by Websense filtering software in your network.'
        blocked_iframe = self._html_search_regex(
            r'<iframe src="([^"]+)"', content,
            u'Websense information URL', default=None)
        if blocked_iframe:
            msg += u' Visit %s for more details' % blocked_iframe
        raise ExtractorError(msg, expected=True)

    return (content, urlh)
278
7cc3570e 279 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
d6983cb4 280 """ Returns the data of the page as a string """
7cc3570e
PH
281 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
282 if res is False:
283 return res
284 else:
285 content, _ = res
286 return content
d6983cb4 287
2a275ab0 288 def _download_xml(self, url_or_request, video_id,
e2b38da9 289 note=u'Downloading XML', errnote=u'Unable to download XML',
28746fbd 290 transform_source=None, fatal=True):
267ed0c5 291 """Return the xml as an xml.etree.ElementTree.Element"""
28746fbd
PH
292 xml_string = self._download_webpage(
293 url_or_request, video_id, note, errnote, fatal=fatal)
294 if xml_string is False:
295 return xml_string
e2b38da9
PH
296 if transform_source:
297 xml_string = transform_source(xml_string)
267ed0c5
JMF
298 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
299
3d3538e4
PH
300 def _download_json(self, url_or_request, video_id,
301 note=u'Downloading JSON metadata',
81c2f20b
PH
302 errnote=u'Unable to download JSON metadata',
303 transform_source=None):
3d3538e4 304 json_string = self._download_webpage(url_or_request, video_id, note, errnote)
81c2f20b
PH
305 if transform_source:
306 json_string = transform_source(json_string)
3d3538e4
PH
307 try:
308 return json.loads(json_string)
309 except ValueError as ve:
310 raise ExtractorError('Failed to download JSON', cause=ve)
311
f45f96f8
PH
def report_warning(self, msg, video_id=None):
    """Report a warning through the downloader, prefixed with '[ie_name]'
    and, when video_id is given, with 'video_id: '."""
    prefix = u'%s: ' % video_id if video_id is not None else u''
    warning = u'[%s] %s%s' % (self.IE_NAME, prefix, msg)
    self._downloader.report_warning(warning)
316
d6983cb4
PH
def to_screen(self, msg):
    """Print msg to screen, prefixing it with '[ie_name]'"""
    line = u'[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(line)
320
def report_extraction(self, id_or_name):
    """Report information extraction."""
    message = u'%s: Extracting information' % id_or_name
    self.to_screen(message)
324
def report_download_webpage(self, video_id):
    """Report webpage download."""
    message = u'%s: Downloading webpage' % video_id
    self.to_screen(message)
328
def report_age_confirmation(self):
    """Report attempt to confirm age."""
    message = u'Confirming age'
    self.to_screen(message)
332
fc79158d
JMF
def report_login(self):
    """Report attempt to log in."""
    message = u'Logging in'
    self.to_screen(message)
336
d6983cb4 337 #Methods for following #608
c0d0b01f
JMF
338 @staticmethod
339 def url_result(url, ie=None, video_id=None):
d6983cb4
PH
340 """Returns a url that points to a page that should be processed"""
341 #TODO: ie should be the class used for getting the info
342 video_info = {'_type': 'url',
343 'url': url,
344 'ie_key': ie}
7012b23c
PH
345 if video_id is not None:
346 video_info['id'] = video_id
d6983cb4 347 return video_info
c0d0b01f
JMF
348 @staticmethod
349 def playlist_result(entries, playlist_id=None, playlist_title=None):
d6983cb4
PH
350 """Returns a playlist"""
351 video_info = {'_type': 'playlist',
352 'entries': entries}
353 if playlist_id:
354 video_info['id'] = playlist_id
355 if playlist_title:
356 video_info['title'] = playlist_title
357 return video_info
358
def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.

    pattern may be a string, a compiled regex, or an iterable of those;
    patterns of an iterable are tried in order until one matches. An empty
    iterable counts as "no match".
    If default is given it is returned on failure, regardless of fatal.
    """
    if isinstance(pattern, (str, compat_str, compiled_regex_type)):
        mobj = re.search(pattern, string, flags)
    else:
        # Start from "no match" so that an empty pattern list takes the
        # normal failure path instead of leaving mobj unbound.
        mobj = None
        for p in pattern:
            mobj = re.search(p, string, flags)
            if mobj:
                break

    if mobj:
        # return the first matching group
        return next(g for g in mobj.groups() if g is not None)
    if default is not _NO_DEFAULT:
        return default

    # The (possibly colored) field name is only needed for the messages
    # below, so it is not computed on the success path.
    if os.name != 'nt' and sys.stderr.isatty():
        _name = u'\033[0;34m%s\033[0m' % name
    else:
        _name = name

    if fatal:
        raise RegexNotFoundError(u'Unable to extract %s' % _name)
    self._downloader.report_warning(u'unable to extract %s; '
        u'please report this issue on http://yt-dl.org/bug' % _name)
    return None
389
def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
    """
    Like _search_regex, but the result is passed through clean_html() and
    stripped of surrounding whitespace.
    A falsy result (None, an empty string, a falsy default) is returned
    as is.
    """
    res = self._search_regex(pattern, string, name, default, fatal, flags)
    if not res:
        return res
    return clean_html(res).strip()
399
fc79158d
JMF
400 def _get_login_info(self):
401 """
402 Get the the login info as (username, password)
403 It will look in the netrc file using the _NETRC_MACHINE value
404 If there's no info available, return (None, None)
405 """
406 if self._downloader is None:
407 return (None, None)
408
409 username = None
410 password = None
411 downloader_params = self._downloader.params
412
413 # Attempt to use provided username and password or .netrc data
414 if downloader_params.get('username', None) is not None:
415 username = downloader_params['username']
416 password = downloader_params['password']
417 elif downloader_params.get('usenetrc', False):
418 try:
419 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
420 if info is not None:
421 username = info[0]
422 password = info[2]
423 else:
424 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
425 except (IOError, netrc.NetrcParseError) as err:
426 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
427
428 return (username, password)
429
46720279
JMF
430 # Helper functions for extracting OpenGraph info
431 @staticmethod
ab2d5247 432 def _og_regexes(prop):
c1206423 433 content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
9887c9b2 434 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
78fb87b2 435 template = r'<meta[^>]+?%s[^>]+?%s'
ab2d5247 436 return [
78fb87b2
JMF
437 template % (property_re, content_re),
438 template % (content_re, property_re),
ab2d5247 439 ]
46720279 440
3c4e6d83 441 def _og_search_property(self, prop, html, name=None, **kargs):
46720279 442 if name is None:
3c4e6d83 443 name = 'OpenGraph %s' % prop
ab2d5247 444 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
eb0a8398
PH
445 if escaped is None:
446 return None
447 return unescapeHTML(escaped)
46720279
JMF
448
449 def _og_search_thumbnail(self, html, **kargs):
3c4e6d83 450 return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
46720279
JMF
451
452 def _og_search_description(self, html, **kargs):
453 return self._og_search_property('description', html, fatal=False, **kargs)
454
455 def _og_search_title(self, html, **kargs):
456 return self._og_search_property('title', html, **kargs)
457
8ffa13e0 458 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
ab2d5247
JMF
459 regexes = self._og_regexes('video')
460 if secure: regexes = self._og_regexes('video:secure_url') + regexes
8ffa13e0 461 return self._html_search_regex(regexes, html, name, **kargs)
46720279 462
78338f71
JMF
463 def _og_search_url(self, html, **kargs):
464 return self._og_search_property('url', html, **kargs)
465
9f62eaf4 466 def _html_search_meta(self, name, html, display_name=None, fatal=False):
59040888
PH
467 if display_name is None:
468 display_name = name
469 return self._html_search_regex(
aaebed13
PH
470 r'''(?ix)<meta
471 (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
59040888 472 [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
9f62eaf4 473 html, display_name, fatal=fatal)
59040888
PH
474
475 def _dc_search_uploader(self, html):
476 return self._html_search_meta('dc.creator', html, 'uploader')
477
8dbe9899
PH
478 def _rta_search(self, html):
479 # See http://www.rtalabel.org/index.php?content=howtofaq#single
480 if re.search(r'(?ix)<meta\s+name="rating"\s+'
481 r' content="RTA-5042-1996-1400-1577-RTA"',
482 html):
483 return 18
484 return 0
485
59040888
PH
486 def _media_rating_search(self, html):
487 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
488 rating = self._html_search_meta('rating', html)
489
490 if not rating:
491 return None
492
493 RATING_TABLE = {
494 'safe for kids': 0,
495 'general': 8,
496 '14 years': 14,
497 'mature': 17,
498 'restricted': 19,
499 }
500 return RATING_TABLE.get(rating.lower(), None)
501
0c708f11
JMF
502 def _twitter_search_player(self, html):
503 return self._html_search_meta('twitter:player', html,
504 'twitter card player')
505
4bcc7bd1 506 def _sort_formats(self, formats):
7e8caf30
PH
507 if not formats:
508 raise ExtractorError(u'No video formats found')
509
4bcc7bd1 510 def _formats_key(f):
e6812ac9
PH
511 # TODO remove the following workaround
512 from ..utils import determine_ext
513 if not f.get('ext') and 'url' in f:
514 f['ext'] = determine_ext(f['url'])
515
4bcc7bd1
PH
516 preference = f.get('preference')
517 if preference is None:
c7deaa4c
PH
518 proto = f.get('protocol')
519 if proto is None:
520 proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
521
522 preference = 0 if proto in ['http', 'https'] else -0.1
4bcc7bd1
PH
523 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
524 preference -= 0.5
525
526 if f.get('vcodec') == 'none': # audio only
527 if self._downloader.params.get('prefer_free_formats'):
528 ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
529 else:
530 ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
531 ext_preference = 0
532 try:
533 audio_ext_preference = ORDER.index(f['ext'])
534 except ValueError:
535 audio_ext_preference = -1
536 else:
537 if self._downloader.params.get('prefer_free_formats'):
538 ORDER = [u'flv', u'mp4', u'webm']
539 else:
540 ORDER = [u'webm', u'flv', u'mp4']
541 try:
542 ext_preference = ORDER.index(f['ext'])
543 except ValueError:
544 ext_preference = -1
545 audio_ext_preference = 0
546
547 return (
548 preference,
5d73273f 549 f.get('quality') if f.get('quality') is not None else -1,
4bcc7bd1
PH
550 f.get('height') if f.get('height') is not None else -1,
551 f.get('width') if f.get('width') is not None else -1,
552 ext_preference,
9933b574 553 f.get('tbr') if f.get('tbr') is not None else -1,
4bcc7bd1
PH
554 f.get('vbr') if f.get('vbr') is not None else -1,
555 f.get('abr') if f.get('abr') is not None else -1,
556 audio_ext_preference,
557 f.get('filesize') if f.get('filesize') is not None else -1,
558 f.get('format_id'),
559 )
560 formats.sort(key=_formats_key)
59040888 561
20991253
PH
def http_scheme(self):
    """ Either "http:" or "https:", depending on the user's preferences

    "http:" is returned only when the downloader param 'prefer_insecure'
    is set.
    """
    if self._downloader.params.get('prefer_insecure', False):
        return 'http:'
    return 'https:'
568
57c7411f
PH
569 def _proto_relative_url(self, url, scheme=None):
570 if url is None:
571 return url
572 if url.startswith('//'):
573 if scheme is None:
574 scheme = self.http_scheme()
575 return scheme + url
576 else:
577 return url
578
4094b6e3
PH
579 def _sleep(self, timeout, video_id, msg_template=None):
580 if msg_template is None:
581 msg_template = u'%(video_id)s: Waiting for %(timeout)s seconds'
582 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
583 self.to_screen(msg)
584 time.sleep(timeout)
585
8dbe9899 586
d6983cb4
PH
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex that search queries of this class must match."""
        prefix_and_query = r'(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return r'%s' % cls._SEARCH_KEY + prefix_and_query

    @classmethod
    def suitable(cls, url):
        """Return True if url is a search query for this extractor."""
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Parse the search query and fetch the requested number of results.

        An empty prefix asks for one result, 'all' for _MAX_RESULTS, and a
        number for that many results (capped at _MAX_RESULTS, with a
        warning).
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix, query = mobj.group('prefix'), mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        """Read-only access to _SEARCH_KEY."""
        return self._SEARCH_KEY