]> jfr.im git - yt-dlp.git/blob - youtube_dl/extractor/common.py
Merge branch 'vlive' of https://github.com/ping/youtube-dl into ping-vlive
[yt-dlp.git] / youtube_dl / extractor / common.py
1 from __future__ import unicode_literals
2
3 import base64
4 import datetime
5 import hashlib
6 import json
7 import netrc
8 import os
9 import re
10 import socket
11 import sys
12 import time
13 import xml.etree.ElementTree
14
15 from ..compat import (
16 compat_cookiejar,
17 compat_cookies,
18 compat_getpass,
19 compat_HTTPError,
20 compat_http_client,
21 compat_urllib_error,
22 compat_urllib_parse,
23 compat_urllib_parse_urlparse,
24 compat_urllib_request,
25 compat_urlparse,
26 compat_str,
27 )
28 from ..utils import (
29 NO_DEFAULT,
30 age_restricted,
31 bug_reports_message,
32 clean_html,
33 compiled_regex_type,
34 determine_ext,
35 ExtractorError,
36 fix_xml_ampersands,
37 float_or_none,
38 int_or_none,
39 RegexNotFoundError,
40 sanitize_filename,
41 unescapeHTML,
42 url_basename,
43 xpath_text,
44 xpath_with_ns,
45 )
46
47
48 class InfoExtractor(object):
49 """Information Extractor class.
50
51 Information extractors are the classes that, given a URL, extract
52 information about the video (or videos) the URL refers to. This
53 information includes the real video URL, the video title, author and
54 others. The information is stored in a dictionary which is then
55 passed to the YoutubeDL. The YoutubeDL processes this
56 information possibly downloading the video to the file system, among
57 other possible outcomes.
58
59 The type field determines the type of the result.
60 By far the most common value (and the default if _type is missing) is
61 "video", which indicates a single video.
62
63 For a video, the dictionaries must include the following fields:
64
65 id: Video identifier.
66 title: Video title, unescaped.
67
68 Additionally, it must contain either a formats entry or a url one:
69
70 formats: A list of dictionaries for each format available, ordered
71 from worst to best quality.
72
73 Potential fields:
74 * url Mandatory. The URL of the video file
75 * ext Will be calculated from URL if missing
76 * format A human-readable description of the format
77 ("mp4 container with h264/opus").
78 Calculated from the format_id, width, height.
79 and format_note fields if missing.
80 * format_id A short description of the format
81 ("mp4_h264_opus" or "19").
82 Technically optional, but strongly recommended.
83 * format_note Additional info about the format
84 ("3D" or "DASH video")
85 * width Width of the video, if known
86 * height Height of the video, if known
87 * resolution Textual description of width and height
88 * tbr Average bitrate of audio and video in KBit/s
89 * abr Average audio bitrate in KBit/s
90 * acodec Name of the audio codec in use
91 * asr Audio sampling rate in Hertz
92 * vbr Average video bitrate in KBit/s
93 * fps Frame rate
94 * vcodec Name of the video codec in use
95 * container Name of the container format
96 * filesize The number of bytes, if known in advance
97 * filesize_approx An estimate for the number of bytes
98 * player_url SWF Player URL (used for rtmpdump).
99 * protocol The protocol that will be used for the actual
100 download, lower-case.
101 "http", "https", "rtsp", "rtmp", "rtmpe",
102 "m3u8", or "m3u8_native".
103 * preference Order number of this format. If this field is
104 present and not None, the formats get sorted
105 by this field, regardless of all other values.
106 -1 for default (order by other properties),
107 -2 or smaller for less than default.
108 < -1000 to hide the format (if there is
109 another one which is strictly better)
110 * language_preference Is this in the correct requested
111 language?
112 10 if it's what the URL is about,
113 -1 for default (don't know),
114 -10 otherwise, other values reserved for now.
115 * quality Order number of the video quality of this
116 format, irrespective of the file format.
117 -1 for default (order by other properties),
118 -2 or smaller for less than default.
119 * source_preference Order number for this video source
120 (quality takes higher priority)
121 -1 for default (order by other properties),
122 -2 or smaller for less than default.
123 * http_headers A dictionary of additional HTTP headers
124 to add to the request.
125 * stretched_ratio If given and not 1, indicates that the
126 video's pixels are not square.
127 width : height ratio as float.
128 * no_resume The server does not support resuming the
129 (HTTP or RTMP) download. Boolean.
130
131 url: Final video URL.
132 ext: Video filename extension.
133 format: The video format, defaults to ext (used for --get-format)
134 player_url: SWF Player URL (used for rtmpdump).
135
136 The following fields are optional:
137
138 alt_title: A secondary title of the video.
139 display_id An alternative identifier for the video, not necessarily
140 unique, but available before title. Typically, id is
141 something like "4234987", title "Dancing naked mole rats",
142 and display_id "dancing-naked-mole-rats"
143 thumbnails: A list of dictionaries, with the following entries:
144 * "id" (optional, string) - Thumbnail format ID
145 * "url"
146 * "preference" (optional, int) - quality of the image
147 * "width" (optional, int)
148 * "height" (optional, int)
149 * "resolution" (optional, string "{width}x{height}",
150 deprecated)
151 thumbnail: Full URL to a video thumbnail image.
152 description: Full video description.
153 uploader: Full name of the video uploader.
154 creator: The main artist who created the video.
155 timestamp: UNIX timestamp of the moment the video became available.
156 upload_date: Video upload date (YYYYMMDD).
157 If not explicitly set, calculated from timestamp.
158 uploader_id: Nickname or id of the video uploader.
159 location: Physical location where the video was filmed.
160 subtitles: The available subtitles as a dictionary in the format
161 {language: subformats}. "subformats" is a list sorted from
162 lower to higher preference, each element is a dictionary
163 with the "ext" entry and one of:
164 * "data": The subtitles file contents
165 * "url": A URL pointing to the subtitles file
166 automatic_captions: Like 'subtitles', used by the YoutubeIE for
167 automatically generated captions
168 duration: Length of the video in seconds, as an integer.
169 view_count: How many users have watched the video on the platform.
170 like_count: Number of positive ratings of the video
171 dislike_count: Number of negative ratings of the video
172 average_rating: Average rating given by users, the scale used depends on the webpage
173 comment_count: Number of comments on the video
174 comments: A list of comments, each with one or more of the following
175 properties (all but one of text or html optional):
176 * "author" - human-readable name of the comment author
177 * "author_id" - user ID of the comment author
178 * "id" - Comment ID
179 * "html" - Comment as HTML
180 * "text" - Plain text of the comment
181 * "timestamp" - UNIX timestamp of comment
182 * "parent" - ID of the comment this one is replying to.
183 Set to "root" to indicate that this is a
184 comment to the original video.
185 age_limit: Age restriction for the video, as an integer (years)
186 webpage_url: The URL to the video webpage, if given to youtube-dl it
187 should allow you to get the same result again. (It will be set
188 by YoutubeDL if it's missing)
189 categories: A list of categories that the video falls in, for example
190 ["Sports", "Berlin"]
191 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
192 is_live: True, False, or None (=unknown). Whether this video is a
193 live stream that goes on instead of a fixed-length video.
194 start_time: Time in seconds where the reproduction should start, as
195 specified in the URL.
196 end_time: Time in seconds where the reproduction should end, as
197 specified in the URL.
198
199 Unless mentioned otherwise, the fields should be Unicode strings.
200
201 Unless mentioned otherwise, None is equivalent to absence of information.
202
203
204 _type "playlist" indicates multiple videos.
205 There must be a key "entries", which is a list, an iterable, or a PagedList
206 object, each element of which is a valid dictionary by this specification.
207
208 Additionally, playlists can have "title", "description" and "id" attributes
209 with the same semantics as videos (see above).
210
211
212 _type "multi_video" indicates that there are multiple videos that
213 form a single show, for example, multiple acts of an opera or TV episode.
214 It must have an entries key like a playlist and contain all the keys
215 required for a video at the same time.
216
217
218 _type "url" indicates that the video must be extracted from another
219 location, possibly by a different extractor. Its only required key is:
220 "url" - the next URL to extract.
221 The key "ie_key" can be set to the class name (minus the trailing "IE",
222 e.g. "Youtube") if the extractor class is known in advance.
223 Additionally, the dictionary may have any properties of the resolved entity
224 known in advance, for example "title" if the title of the referred video is
225 known ahead of time.
226
227
228 _type "url_transparent" entities have the same specification as "url", but
229 indicate that the given additional information is more precise than the one
230 associated with the resolved URL.
231 This is useful when a site employs a video service that hosts the video and
232 its technical metadata, but that video service does not embed a useful
233 title, description etc.
234
235
236 Subclasses of this one should re-define the _real_initialize() and
237 _real_extract() methods and define a _VALID_URL regexp.
238 Probably, they should also be added to the list of extractors.
239
240 Finally, the _WORKING attribute should be set to False for broken IEs
241 in order to warn the users and skip the tests.
242 """
243
    # Whether initialize() has already run for this instance.
    _ready = False
    # The owning YoutubeDL instance (set via set_downloader()); may be None.
    _downloader = None
    # Set to False in subclasses to mark a broken extractor
    # (warns users and skips the tests).
    _WORKING = True
247
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # Lazy setup: the real work happens in initialize()/_real_initialize().
        self._ready = False
        self.set_downloader(downloader)
252
253 @classmethod
254 def suitable(cls, url):
255 """Receives a URL and returns True if suitable for this IE."""
256
257 # This does not use has/getattr intentionally - we want to know whether
258 # we have cached the regexp for *this* class, whereas getattr would also
259 # match the superclass
260 if '_VALID_URL_RE' not in cls.__dict__:
261 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
262 return cls._VALID_URL_RE.match(url) is not None
263
264 @classmethod
265 def _match_id(cls, url):
266 if '_VALID_URL_RE' not in cls.__dict__:
267 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
268 m = cls._VALID_URL_RE.match(url)
269 assert m
270 return m.group('id')
271
    @classmethod
    def working(cls):
        """Getter method for _WORKING (False marks a known-broken extractor)."""
        return cls._WORKING
276
277 def initialize(self):
278 """Initializes an instance (authentication, etc)."""
279 if not self._ready:
280 self._real_initialize()
281 self._ready = True
282
283 def extract(self, url):
284 """Extracts URL information and returns it in list of dicts."""
285 try:
286 self.initialize()
287 return self._real_extract(url)
288 except ExtractorError:
289 raise
290 except compat_http_client.IncompleteRead as e:
291 raise ExtractorError('A network error has occured.', cause=e, expected=True)
292 except (KeyError, StopIteration) as e:
293 raise ExtractorError('An extractor error has occured.', cause=e)
294
    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        # downloader is the YoutubeDL instance used for all output and network
        # access; may be None (several helpers guard against that case).
        self._downloader = downloader
298
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # Default: no setup (e.g. login) needed.
        pass
302
    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # Default: extracts nothing.
        pass
306
    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        # Extractor class names end in "IE" by convention; strip that suffix
        # (e.g. "YoutubeIE" -> "Youtube").
        return cls.__name__[:-2]
311
    @property
    def IE_NAME(self):
        # Human-readable extractor name; same derivation as ie_key(), but
        # available on instances for log prefixes.
        return type(self).__name__[:-2]
315
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the response handle.

        On network errors: raises ExtractorError when fatal, otherwise warns
        and returns False; errnote=False suppresses even the warning.
        """
        if note is None:
            # Default progress message.
            self.report_download_webpage(video_id)
        elif note is not False:
            # note=False silences the status line entirely.
            if video_id is None:
                self.to_screen('%s' % (note,))
            else:
                self.to_screen('%s: %s' % (video_id, note))
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'
            errmsg = '%s: %s' % (errnote, compat_str(err))
            if fatal:
                # Pass the original traceback so the network error is debuggable.
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self._downloader.report_warning(errmsg)
                return False
338
    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
        """ Returns a tuple (page content as string, URL handle),
        or False on non-fatal download failure. """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
        if urlh is False:
            # _request_webpage only returns False when fatal is disabled.
            assert not fatal
            return False
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
        return (content, urlh)
351
352 @staticmethod
353 def _guess_encoding_from_content(content_type, webpage_bytes):
354 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
355 if m:
356 encoding = m.group(1)
357 else:
358 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
359 webpage_bytes[:1024])
360 if m:
361 encoding = m.group(1).decode('ascii')
362 elif webpage_bytes.startswith(b'\xff\xfe'):
363 encoding = 'utf-16'
364 else:
365 encoding = 'utf-8'
366
367 return encoding
368
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        """Read and decode the body of an open response handle.

        Honours the dump_intermediate_pages/write_pages options, guesses the
        text encoding when none is given, and raises ExtractorError for
        known filtering/censorship block pages.
        """
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            # Caller-supplied bytes to prepend (e.g. stripped XML prologue).
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen('Dumping request to ' + url)
            # base64 so binary bodies survive the terminal.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            basen = '%s_%s' % (video_id, url)
            if len(basen) > 240:
                # Keep the filename short; disambiguate with an MD5 of the full name.
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if os.name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # Unknown codec name: fall back to UTF-8 rather than crash.
            content = webpage_bytes.decode('utf-8', 'replace')

        # Detect the Websense filtering block page.
        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in content[:512]):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        # Detect the Indian censorship block page.
        if '<title>The URL you requested has been blocked</title>' in content[:512]:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)

        return content
431
    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
        """ Returns the data of the page as a string.

        Retries up to `tries` times on truncated (IncompleteRead) responses,
        sleeping `timeout` seconds between attempts.
        """
        success = False
        try_count = 0
        while success is False:
            try:
                res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
                success = True
            except compat_http_client.IncompleteRead as e:
                try_count += 1
                if try_count >= tries:
                    # Out of retries: propagate the truncation error.
                    raise e
                self._sleep(timeout, video_id)
        if res is False:
            # Non-fatal download failure from the handle helper.
            return res
        else:
            content, _ = res
            return content
450
    def _download_xml(self, url_or_request, video_id,
                      note='Downloading XML', errnote='Unable to download XML',
                      transform_source=None, fatal=True, encoding=None):
        """Return the xml as an xml.etree.ElementTree.Element
        (or False on non-fatal download failure)."""
        xml_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
        if xml_string is False:
            return xml_string
        if transform_source:
            # Pre-parse cleanup hook, e.g. fix_xml_ampersands for broken feeds.
            xml_string = transform_source(xml_string)
        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
462
    def _download_json(self, url_or_request, video_id,
                       note='Downloading JSON metadata',
                       errnote='Unable to download JSON metadata',
                       transform_source=None,
                       fatal=True, encoding=None):
        """Download and parse a JSON document; returns None on a
        non-fatal download failure."""
        json_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding)
        if (not fatal) and json_string is False:
            return None
        return self._parse_json(
            json_string, video_id, transform_source=transform_source, fatal=fatal)
475
476 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
477 if transform_source:
478 json_string = transform_source(json_string)
479 try:
480 return json.loads(json_string)
481 except ValueError as ve:
482 errmsg = '%s: Failed to parse JSON ' % video_id
483 if fatal:
484 raise ExtractorError(errmsg, cause=ve)
485 else:
486 self.report_warning(errmsg + str(ve))
487
    def report_warning(self, msg, video_id=None):
        # Prefix warnings with the extractor name and, when known, the video id.
        idstr = '' if video_id is None else '%s: ' % video_id
        self._downloader.report_warning(
            '[%s] %s%s' % (self.IE_NAME, idstr, msg))
492
    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
496
    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen('%s: Extracting information' % id_or_name)
500
    def report_download_webpage(self, video_id):
        """Report webpage download (default note of _request_webpage)."""
        self.to_screen('%s: Downloading webpage' % video_id)
504
    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')
508
    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')
512
    @staticmethod
    def raise_login_required(msg='This video is only available for registered users'):
        # expected=True: missing credentials are a user-actionable condition,
        # not an extractor bug.
        raise ExtractorError(
            '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
            expected=True)
518
519 # Methods for following #608
520 @staticmethod
521 def url_result(url, ie=None, video_id=None, video_title=None):
522 """Returns a URL that points to a page that should be processed"""
523 # TODO: ie should be the class used for getting the info
524 video_info = {'_type': 'url',
525 'url': url,
526 'ie_key': ie}
527 if video_id is not None:
528 video_info['id'] = video_id
529 if video_title is not None:
530 video_info['title'] = video_title
531 return video_info
532
533 @staticmethod
534 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
535 """Returns a playlist"""
536 video_info = {'_type': 'playlist',
537 'entries': entries}
538 if playlist_id:
539 video_info['id'] = playlist_id
540 if playlist_title:
541 video_info['title'] = playlist_title
542 if playlist_description:
543 video_info['description'] = playlist_description
544 return video_info
545
    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            # Try each pattern in turn, stopping at the first match.
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        # Colour the field name blue when stderr is a colour-capable tty.
        if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
            _name = '\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            if group is None:
                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
            else:
                return mobj.group(group)
        elif default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract %s' % _name)
        else:
            self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
            return None
579
    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
        if res:
            return clean_html(res).strip()
        else:
            # None or empty match — return as-is (may be the caller's default).
            return res
589
    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # Best effort: a missing/broken .netrc should not abort extraction.
                self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))

        return (username, password)
619
    def _get_tfa_info(self, note='two-factor verification code'):
        """
        Get the two-factor authentication info
        TODO - asking the user will be required for sms/phone verify
        currently just uses the command line option
        If there's no info available, return None
        """
        if self._downloader is None:
            return None
        downloader_params = self._downloader.params

        if downloader_params.get('twofactor', None) is not None:
            return downloader_params['twofactor']

        # No --twofactor given: prompt interactively.
        return compat_getpass('Type %s and press [Return]: ' % note)
635
636 # Helper functions for extracting OpenGraph info
637 @staticmethod
638 def _og_regexes(prop):
639 content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
640 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
641 template = r'<meta[^>]+?%s[^>]+?%s'
642 return [
643 template % (property_re, content_re),
644 template % (content_re, property_re),
645 ]
646
    @staticmethod
    def _meta_regex(prop):
        # Case-insensitive, dotall, verbose pattern matching a <meta> tag whose
        # identifying attribute (itemprop/name/property/id/http-equiv) equals
        # prop; the value ends up in the named group 'content'.
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
652
    def _og_search_property(self, prop, html, name=None, **kargs):
        """Search html for OpenGraph property prop and return its unescaped
        content value (None when a default of None applied)."""
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)
660
    def _og_search_thumbnail(self, html, **kargs):
        # Non-fatal: a thumbnail is optional metadata.
        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
663
    def _og_search_description(self, html, **kargs):
        # Non-fatal: a description is optional metadata.
        return self._og_search_property('description', html, fatal=False, **kargs)
666
    def _og_search_title(self, html, **kargs):
        # Fatal by default: the title is a mandatory field.
        return self._og_search_property('title', html, **kargs)
669
    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        # og:video and og:video:url, preferring og:video:secure_url when
        # secure is requested.
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)
675
    def _og_search_url(self, html, **kargs):
        # og:url — the canonical URL of the page.
        return self._og_search_property('url', html, **kargs)
678
    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        """Extract the content attribute of the <meta> tag identified by name."""
        if display_name is None:
            display_name = name
        return self._html_search_regex(
            self._meta_regex(name),
            html, display_name, fatal=fatal, group='content', **kwargs)
685
    def _dc_search_uploader(self, html):
        # Dublin Core creator metadata used as the uploader name.
        return self._html_search_meta('dc.creator', html, 'uploader')
688
689 def _rta_search(self, html):
690 # See http://www.rtalabel.org/index.php?content=howtofaq#single
691 if re.search(r'(?ix)<meta\s+name="rating"\s+'
692 r' content="RTA-5042-1996-1400-1577-RTA"',
693 html):
694 return 18
695 return 0
696
    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        # Map the textual rating to a minimum age; unknown values yield None.
        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower(), None)
712
    def _family_friendly_search(self, html):
        # See http://schema.org/VideoObject
        family_friendly = self._html_search_meta('isFamilyFriendly', html)

        if not family_friendly:
            return None

        # isFamilyFriendly true/1 -> no age limit; false/0 -> adults only.
        RATING_TABLE = {
            '1': 0,
            'true': 0,
            '0': 18,
            'false': 18,
        }
        return RATING_TABLE.get(family_friendly.lower(), None)
727
    def _twitter_search_player(self, html):
        # Twitter Card player URL (the twitter:player <meta> tag).
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')
731
732 @staticmethod
733 def _hidden_inputs(html):
734 hidden_inputs = {}
735 for input in re.findall(r'<input([^>]+)>', html):
736 if not re.search(r'type=(["\'])hidden\1', input):
737 continue
738 name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
739 if not name:
740 continue
741 value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
742 if not value:
743 continue
744 hidden_inputs[name.group('value')] = value.group('value')
745 return hidden_inputs
746
    def _form_hidden_inputs(self, form_id, html):
        """Return the hidden inputs of the <form> with the given id attribute."""
        form = self._search_regex(
            r'(?s)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
            html, '%s form' % form_id, group='form')
        return self._hidden_inputs(form)
752
    def _sort_formats(self, formats, field_preference=None):
        """Sort formats in place from worst to best quality.

        When field_preference (list/tuple of field names) is given it fully
        determines the order; otherwise a built-in heuristic key is used.
        """
        if not formats:
            raise ExtractorError('No video formats found')

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            if isinstance(field_preference, (list, tuple)):
                # Caller-specified ordering overrides all built-in heuristics;
                # missing fields count as -1.
                return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                proto = f.get('protocol')
                if proto is None:
                    proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

                # Plain HTTP(S) is slightly preferred over other protocols.
                preference = 0 if proto in ['http', 'https'] else -0.1
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            if f.get('vcodec') == 'none':  # audio only
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # Missing numeric fields sort as -1 (i.e. worst).
            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
815
    def _check_formats(self, formats, video_id):
        """Drop formats whose URLs fail a quick availability probe
        (modifies the formats list in place)."""
        if formats:
            formats[:] = filter(
                lambda f: self._is_valid_url(
                    f['url'], video_id,
                    item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
                formats)
823
    def _is_valid_url(self, url, video_id, item='video'):
        """Probe url with a request; returns False only on HTTP errors."""
        url = self._proto_relative_url(url, scheme='http:')
        # For now assume non HTTP(S) URLs always valid
        if not (url.startswith('http://') or url.startswith('https://')):
            return True
        try:
            self._request_webpage(url, video_id, 'Checking %s URL' % item)
            return True
        except ExtractorError as e:
            if isinstance(e.cause, compat_HTTPError):
                self.to_screen(
                    '%s: %s URL is invalid, skipping' % (video_id, item))
                return False
            # Non-HTTP failures are unexpected here: propagate them.
            raise
838
839 def http_scheme(self):
840 """ Either "http:" or "https:", depending on the user's preferences """
841 return (
842 'http:'
843 if self._downloader.params.get('prefer_insecure', False)
844 else 'https:')
845
846 def _proto_relative_url(self, url, scheme=None):
847 if url is None:
848 return url
849 if url.startswith('//'):
850 if scheme is None:
851 scheme = self.http_scheme()
852 return scheme + url
853 else:
854 return url
855
    def _sleep(self, timeout, video_id, msg_template=None):
        """Sleep for timeout seconds, telling the user why (used e.g. for
        download retries)."""
        if msg_template is None:
            msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
        msg = msg_template % {'video_id': video_id, 'timeout': timeout}
        self.to_screen(msg)
        time.sleep(timeout)
862
    def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
                             transform_source=lambda s: fix_xml_ampersands(s).strip()):
        """Parse an Adobe HDS (f4m) manifest into a sorted list of format dicts."""
        manifest = self._download_xml(
            manifest_url, video_id, 'Downloading f4m manifest',
            'Unable to download f4m manifest',
            # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
            # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
            transform_source=transform_source)

        formats = []
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        if not media_nodes:
            # Fall back to the 2.0 namespace when no 1.0 media nodes exist.
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        for i, media_el in enumerate(media_nodes):
            if manifest_version == '2.0':
                # v2 manifests reference per-bitrate URLs, possibly relative.
                media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
                if not media_url:
                    continue
                manifest_url = (
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                if determine_ext(manifest_url) == 'f4m':
                    formats.extend(self._extract_f4m_formats(manifest_url, video_id, preference, f4m_id))
                    continue
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            formats.append({
                # Use the bitrate as the id suffix when known, else the index.
                'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
                'url': manifest_url,
                'ext': 'flv',
                'tbr': tbr,
                'width': int_or_none(media_el.attrib.get('width')),
                'height': int_or_none(media_el.attrib.get('height')),
                'preference': preference,
            })
        self._sort_formats(formats)

        return formats
906
    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None,
                              m3u8_id=None, note=None, errnote=None,
                              fatal=True):
        """Download an HLS (m3u8) master playlist and return a sorted list of
        format dicts.

        A synthetic 'meta' entry for the playlist URL itself is always
        included (with lowered preference) so quality selection can be
        deferred to the downloader.  Returns False when the download fails
        and fatal is False.
        """

        # NOTE(review): 'preference - 1 if preference else -1' treats a
        # preference of 0 the same as None — confirm that is intended.
        formats = [{
            'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
            'url': m3u8_url,
            'ext': ext,
            'protocol': 'm3u8',
            'preference': preference - 1 if preference else -1,
            'resolution': 'multiple',
            'format_note': 'Quality selection URL',
        }]

        # Resolve variant URIs relative to the playlist URL.
        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        m3u8_doc = self._download_webpage(
            m3u8_url, video_id,
            note=note or 'Downloading m3u8 information',
            errnote=errnote or 'Failed to download m3u8 information',
            fatal=fatal)
        if m3u8_doc is False:
            # Non-fatal download failure: propagate the False sentinel.
            return m3u8_doc
        last_info = None
        last_media = None
        # KEY=VALUE attribute pairs; values may be quoted (e.g. CODECS="...").
        kv_rex = re.compile(
            r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                # Attributes of the variant stream named on the next URI line.
                last_info = {}
                for m in kv_rex.finditer(line):
                    v = m.group('val')
                    if v.startswith('"'):
                        v = v[1:-1]
                    last_info[m.group('key')] = v
            elif line.startswith('#EXT-X-MEDIA:'):
                last_media = {}
                for m in kv_rex.finditer(line):
                    v = m.group('val')
                    if v.startswith('"'):
                        v = v[1:-1]
                    last_media[m.group('key')] = v
            elif line.startswith('#') or not line.strip():
                # Other tags and blank lines carry no format information.
                continue
            else:
                # A URI line: pair it with the preceding #EXT-X-STREAM-INF.
                if last_info is None:
                    formats.append({'url': format_url(line)})
                    continue
                tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                # Prefer the rendition NAME (unless it's a subtitles track),
                # then the bitrate, then a running index.
                last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
                format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
                f = {
                    'format_id': '-'.join(format_id),
                    'url': format_url(line.strip()),
                    'tbr': tbr,
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                codecs = last_info.get('CODECS')
                if codecs:
                    # TODO: looks like video codec is not always necessarily goes first
                    va_codecs = codecs.split(',')
                    if va_codecs[0]:
                        f['vcodec'] = va_codecs[0].partition('.')[0]
                    if len(va_codecs) > 1 and va_codecs[1]:
                        f['acodec'] = va_codecs[1].partition('.')[0]
                resolution = last_info.get('RESOLUTION')
                if resolution:
                    width_str, height_str = resolution.split('x')
                    f['width'] = int(width_str)
                    f['height'] = int(height_str)
                # Attach the #EXT-X-MEDIA attributes once, to the next format.
                if last_media is not None:
                    f['m3u8_media'] = last_media
                    last_media = None
                formats.append(f)
                last_info = {}
        self._sort_formats(formats)
        return formats
993
994 @staticmethod
995 def _xpath_ns(path, namespace=None):
996 if not namespace:
997 return path
998 out = []
999 for c in path.split('/'):
1000 if not c or c == '.':
1001 out.append(c)
1002 else:
1003 out.append('{%s}%s' % (namespace, c))
1004 return '/'.join(out)
1005
1006 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
1007 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1008
1009 if smil is False:
1010 assert not fatal
1011 return []
1012
1013 namespace = self._parse_smil_namespace(smil)
1014
1015 return self._parse_smil_formats(
1016 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1017
1018 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1019 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1020 if smil is False:
1021 return {}
1022 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1023
    def _download_smil(self, smil_url, video_id, fatal=True):
        # Fetch and XML-parse the SMIL document.  Callers treat a False
        # return value as a failed non-fatal download.
        return self._download_xml(
            smil_url, video_id, 'Downloading SMIL file',
            'Unable to download SMIL file', fatal=fatal)
1028
1029 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1030 namespace = self._parse_smil_namespace(smil)
1031
1032 formats = self._parse_smil_formats(
1033 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1034 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1035
1036 video_id = os.path.splitext(url_basename(smil_url))[0]
1037 title = None
1038 description = None
1039 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1040 name = meta.attrib.get('name')
1041 content = meta.attrib.get('content')
1042 if not name or not content:
1043 continue
1044 if not title and name == 'title':
1045 title = content
1046 elif not description and name in ('description', 'abstract'):
1047 description = content
1048
1049 return {
1050 'id': video_id,
1051 'title': title or video_id,
1052 'description': description,
1053 'formats': formats,
1054 'subtitles': subtitles,
1055 }
1056
    def _parse_smil_namespace(self, smil):
        # Extract the XML namespace URI from the root tag (e.g.
        # '{http://www.w3.org/ns/SMIL}smil' -> 'http://www.w3.org/ns/SMIL');
        # None when the document is unnamespaced.
        return self._search_regex(
            r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1060
    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        """Collect format dicts from the <video> nodes of a parsed SMIL doc.

        Dispatches on the transport: RTMP streams; m3u8 (HLS) and f4m (HDS)
        sub-manifests, which are extracted recursively; and plain HTTP
        downloads.  transform_rtmp_url, when given, rewrites the
        (streamer, play_path) pair of RTMP formats.
        """
        # Base URL for relative sources: a <meta base=.../httpBase=...>
        # entry if present, otherwise the SMIL URL itself.
        base = smil_url
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
            if b:
                base = b
                break

        formats = []
        rtmp_count = 0
        http_count = 0

        videos = smil.findall(self._xpath_ns('.//video', namespace))
        for video in videos:
            src = video.get('src')
            if not src:
                continue

            bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
            filesize = int_or_none(video.get('size') or video.get('fileSize'))
            width = int_or_none(video.get('width'))
            height = int_or_none(video.get('height'))
            proto = video.get('proto')
            ext = video.get('ext')
            src_ext = determine_ext(src)
            streamer = video.get('streamer') or base

            # RTMP: the streamer is the connection URL, src the play path.
            if proto == 'rtmp' or streamer.startswith('rtmp'):
                rtmp_count += 1
                formats.append({
                    'url': streamer,
                    'play_path': src,
                    'ext': 'flv',
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                if transform_rtmp_url:
                    streamer, src = transform_rtmp_url(streamer, src)
                    formats[-1].update({
                        'url': streamer,
                        'play_path': src,
                    })
                continue

            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)

            # HLS sub-manifest: delegate to the m3u8 extractor.
            if proto == 'm3u8' or src_ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls'))
                continue

            # HDS sub-manifest: append the (default Flowplayer) hdcore query
            # parameters and delegate to the f4m extractor.
            if src_ext == 'f4m':
                f4m_url = src_url
                if not f4m_params:
                    f4m_params = {
                        'hdcore': '3.2.0',
                        'plugin': 'flowplayer-3.2.0.1',
                    }
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse.urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds'))
                continue

            # Plain progressive HTTP download.
            if src_url.startswith('http'):
                http_count += 1
                formats.append({
                    'url': src_url,
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                continue

        self._sort_formats(formats)

        return formats
1143
1144 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1145 subtitles = {}
1146 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1147 src = textstream.get('src')
1148 if not src:
1149 continue
1150 ext = textstream.get('ext') or determine_ext(src)
1151 if not ext:
1152 type_ = textstream.get('type')
1153 SUBTITLES_TYPES = {
1154 'text/vtt': 'vtt',
1155 'text/srt': 'srt',
1156 'application/smptett+xml': 'tt',
1157 }
1158 if type_ in SUBTITLES_TYPES:
1159 ext = SUBTITLES_TYPES[type_]
1160 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1161 subtitles.setdefault(lang, []).append({
1162 'url': src,
1163 'ext': ext,
1164 })
1165 return subtitles
1166
1167 def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1168 xspf = self._download_xml(
1169 playlist_url, playlist_id, 'Downloading xpsf playlist',
1170 'Unable to download xspf manifest', fatal=fatal)
1171 if xspf is False:
1172 return []
1173 return self._parse_xspf(xspf, playlist_id)
1174
1175 def _parse_xspf(self, playlist, playlist_id):
1176 NS_MAP = {
1177 'xspf': 'http://xspf.org/ns/0/',
1178 's1': 'http://static.streamone.nl/player/ns/0',
1179 }
1180
1181 entries = []
1182 for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1183 title = xpath_text(
1184 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1185 description = xpath_text(
1186 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1187 thumbnail = xpath_text(
1188 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1189 duration = float_or_none(
1190 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1191
1192 formats = [{
1193 'url': location.text,
1194 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1195 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1196 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1197 } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1198 self._sort_formats(formats)
1199
1200 entries.append({
1201 'id': playlist_id,
1202 'title': title,
1203 'description': description,
1204 'thumbnail': thumbnail,
1205 'duration': duration,
1206 'formats': formats,
1207 })
1208 return entries
1209
1210 def _live_title(self, name):
1211 """ Generate the title for a live video """
1212 now = datetime.datetime.now()
1213 now_str = now.strftime("%Y-%m-%d %H:%M")
1214 return name + ' ' + now_str
1215
1216 def _int(self, v, name, fatal=False, **kwargs):
1217 res = int_or_none(v, **kwargs)
1218 if 'get_attr' in kwargs:
1219 print(getattr(v, kwargs['get_attr']))
1220 if res is None:
1221 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1222 if fatal:
1223 raise ExtractorError(msg)
1224 else:
1225 self._downloader.report_warning(msg)
1226 return res
1227
1228 def _float(self, v, name, fatal=False, **kwargs):
1229 res = float_or_none(v, **kwargs)
1230 if res is None:
1231 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1232 if fatal:
1233 raise ExtractorError(msg)
1234 else:
1235 self._downloader.report_warning(msg)
1236 return res
1237
    def _set_cookie(self, domain, name, value, expire_time=None):
        # Install a cookie for *domain* into the downloader's cookiejar;
        # with expire_time unset it behaves as a session cookie.  The
        # positional Cookie() arguments are: version, name, value, port,
        # port_specified, domain, domain_specified, domain_initial_dot,
        # path, path_specified, secure, expires, discard, comment,
        # comment_url, rest (per the http.cookiejar.Cookie constructor).
        cookie = compat_cookiejar.Cookie(
            0, name, value, None, None, domain, None,
            None, '/', True, False, expire_time, '', None, None, None)
        self._downloader.cookiejar.set_cookie(cookie)
1243
    def _get_cookies(self, url):
        """ Return a compat_cookies.SimpleCookie with the cookies for the url """
        # Build a throwaway request and let the cookiejar compute the Cookie
        # header it would send for this URL, then parse that header back
        # into a SimpleCookie.
        req = compat_urllib_request.Request(url)
        self._downloader.cookiejar.add_cookie_header(req)
        return compat_cookies.SimpleCookie(req.get_header('Cookie'))
1249
1250 def get_testcases(self, include_onlymatching=False):
1251 t = getattr(self, '_TEST', None)
1252 if t:
1253 assert not hasattr(self, '_TESTS'), \
1254 '%s has _TEST and _TESTS' % type(self).__name__
1255 tests = [t]
1256 else:
1257 tests = getattr(self, '_TESTS', [])
1258 for t in tests:
1259 if not include_onlymatching and t.get('only_matching', False):
1260 continue
1261 t['name'] = type(self).__name__[:-len('IE')]
1262 yield t
1263
1264 def is_suitable(self, age_limit):
1265 """ Test whether the extractor is generally suitable for the given
1266 age limit (i.e. pornographic sites are not, all others usually are) """
1267
1268 any_restricted = False
1269 for tc in self.get_testcases(include_onlymatching=False):
1270 if 'playlist' in tc:
1271 tc = tc['playlist'][0]
1272 is_restricted = age_restricted(
1273 tc.get('info_dict', {}).get('age_limit'), age_limit)
1274 if not is_restricted:
1275 return True
1276 any_restricted = any_restricted or is_restricted
1277 return not any_restricted
1278
1279 def extract_subtitles(self, *args, **kwargs):
1280 if (self._downloader.params.get('writesubtitles', False) or
1281 self._downloader.params.get('listsubtitles')):
1282 return self._get_subtitles(*args, **kwargs)
1283 return {}
1284
    def _get_subtitles(self, *args, **kwargs):
        # Subclass hook performing the actual subtitle extraction; called by
        # extract_subtitles() only when the user requested subtitles.
        raise NotImplementedError("This method must be implemented by subclasses")
1287
1288 @staticmethod
1289 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1290 """ Merge subtitle items for one language. Items with duplicated URLs
1291 will be dropped. """
1292 list1_urls = set([item['url'] for item in subtitle_list1])
1293 ret = list(subtitle_list1)
1294 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
1295 return ret
1296
1297 @classmethod
1298 def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
1299 """ Merge two subtitle dictionaries, language by language. """
1300 ret = dict(subtitle_dict1)
1301 for lang in subtitle_dict2:
1302 ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
1303 return ret
1304
1305 def extract_automatic_captions(self, *args, **kwargs):
1306 if (self._downloader.params.get('writeautomaticsub', False) or
1307 self._downloader.params.get('listsubtitles')):
1308 return self._get_automatic_captions(*args, **kwargs)
1309 return {}
1310
    def _get_automatic_captions(self, *args, **kwargs):
        # Subclass hook performing the actual automatic-caption extraction;
        # called by extract_automatic_captions() only when requested.
        raise NotImplementedError("This method must be implemented by subclasses")
1313
1314
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # '<key>:<query>', '<key><n>:<query>' or '<key>all:<query>'
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        # Empty prefix: first hit only; 'all': the extractor's maximum;
        # otherwise an explicit result count, clamped to _MAX_RESULTS.
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY