]> jfr.im git - yt-dlp.git/blob - youtube_dl/extractor/common.py
fix increment operator
[yt-dlp.git] / youtube_dl / extractor / common.py
1 from __future__ import unicode_literals
2
3 import base64
4 import hashlib
5 import json
6 import netrc
7 import os
8 import re
9 import socket
10 import sys
11 import time
12 import xml.etree.ElementTree
13
14 from ..utils import (
15 compat_http_client,
16 compat_urllib_error,
17 compat_urllib_parse_urlparse,
18 compat_urlparse,
19 compat_str,
20
21 clean_html,
22 compiled_regex_type,
23 ExtractorError,
24 int_or_none,
25 RegexNotFoundError,
26 sanitize_filename,
27 unescapeHTML,
28 )
# Module-private sentinel: lets _search_regex() distinguish "caller supplied
# no default" from an explicit default of None (which is a legal default).
_NO_DEFAULT = object()
30
31
32 class InfoExtractor(object):
33 """Information Extractor class.
34
35 Information extractors are the classes that, given a URL, extract
36 information about the video (or videos) the URL refers to. This
37 information includes the real video URL, the video title, author and
38 others. The information is stored in a dictionary which is then
39 passed to the FileDownloader. The FileDownloader processes this
40 information possibly downloading the video to the file system, among
41 other possible outcomes.
42
43 The dictionaries must include the following fields:
44
45 id: Video identifier.
46 title: Video title, unescaped.
47
48 Additionally, it must contain either a formats entry or a url one:
49
50 formats: A list of dictionaries for each format available, ordered
51 from worst to best quality.
52
53 Potential fields:
54 * url Mandatory. The URL of the video file
55 * ext Will be calculated from url if missing
56 * format A human-readable description of the format
57 ("mp4 container with h264/opus").
58 Calculated from the format_id, width, height.
59 and format_note fields if missing.
60 * format_id A short description of the format
61 ("mp4_h264_opus" or "19").
62 Technically optional, but strongly recommended.
63 * format_note Additional info about the format
64 ("3D" or "DASH video")
65 * width Width of the video, if known
66 * height Height of the video, if known
67 * resolution Textual description of width and height
68 * tbr Average bitrate of audio and video in KBit/s
69 * abr Average audio bitrate in KBit/s
70 * acodec Name of the audio codec in use
71 * asr Audio sampling rate in Hertz
72 * vbr Average video bitrate in KBit/s
73 * vcodec Name of the video codec in use
74 * container Name of the container format
75 * filesize The number of bytes, if known in advance
76 * filesize_approx An estimate for the number of bytes
77 * player_url SWF Player URL (used for rtmpdump).
78 * protocol The protocol that will be used for the actual
79 download, lower-case.
80 "http", "https", "rtsp", "rtmp", "m3u8" or so.
81 * preference Order number of this format. If this field is
82 present and not None, the formats get sorted
83 by this field, regardless of all other values.
84 -1 for default (order by other properties),
85 -2 or smaller for less than default.
86 * quality Order number of the video quality of this
87 format, irrespective of the file format.
88 -1 for default (order by other properties),
89 -2 or smaller for less than default.
90 * http_referer HTTP Referer header value to set.
91 * http_method HTTP method to use for the download.
92 * http_headers A dictionary of additional HTTP headers
93 to add to the request.
94 * http_post_data Additional data to send with a POST
95 request.
96 url: Final video URL.
97 ext: Video filename extension.
98 format: The video format, defaults to ext (used for --get-format)
99 player_url: SWF Player URL (used for rtmpdump).
100
101 The following fields are optional:
102
103 display_id An alternative identifier for the video, not necessarily
104 unique, but available before title. Typically, id is
105 something like "4234987", title "Dancing naked mole rats",
106 and display_id "dancing-naked-mole-rats"
107 thumbnails: A list of dictionaries, with the following entries:
108 * "url"
109 * "width" (optional, int)
110 * "height" (optional, int)
111 * "resolution" (optional, string "{width}x{height"},
112 deprecated)
113 thumbnail: Full URL to a video thumbnail image.
114 description: One-line video description.
115 uploader: Full name of the video uploader.
116 timestamp: UNIX timestamp of the moment the video became available.
117 upload_date: Video upload date (YYYYMMDD).
118 If not explicitly set, calculated from timestamp.
119 uploader_id: Nickname or id of the video uploader.
120 location: Physical location where the video was filmed.
121 subtitles: The subtitle file contents as a dictionary in the format
122 {language: subtitles}.
123 duration: Length of the video in seconds, as an integer.
124 view_count: How many users have watched the video on the platform.
125 like_count: Number of positive ratings of the video
126 dislike_count: Number of negative ratings of the video
127 comment_count: Number of comments on the video
128 age_limit: Age restriction for the video, as an integer (years)
129 webpage_url: The url to the video webpage, if given to youtube-dl it
130 should allow to get the same result again. (It will be set
131 by YoutubeDL if it's missing)
132 categories: A list of categories that the video falls in, for example
133 ["Sports", "Berlin"]
134 is_live: True, False, or None (=unknown). Whether this video is a
135 live stream that goes on instead of a fixed-length video.
136
137 Unless mentioned otherwise, the fields should be Unicode strings.
138
139 Subclasses of this one should re-define the _real_initialize() and
140 _real_extract() methods and define a _VALID_URL regexp.
141 Probably, they should also be added to the list of extractors.
142
143 Finally, the _WORKING attribute should be set to False for broken IEs
144 in order to warn the users and skip the tests.
145 """
146
147 _ready = False
148 _downloader = None
149 _WORKING = True
150
151 def __init__(self, downloader=None):
152 """Constructor. Receives an optional downloader."""
153 self._ready = False
154 self.set_downloader(downloader)
155
156 @classmethod
157 def suitable(cls, url):
158 """Receives a URL and returns True if suitable for this IE."""
159
160 # This does not use has/getattr intentionally - we want to know whether
161 # we have cached the regexp for *this* class, whereas getattr would also
162 # match the superclass
163 if '_VALID_URL_RE' not in cls.__dict__:
164 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
165 return cls._VALID_URL_RE.match(url) is not None
166
167 @classmethod
168 def working(cls):
169 """Getter method for _WORKING."""
170 return cls._WORKING
171
172 def initialize(self):
173 """Initializes an instance (authentication, etc)."""
174 if not self._ready:
175 self._real_initialize()
176 self._ready = True
177
178 def extract(self, url):
179 """Extracts URL information and returns it in list of dicts."""
180 self.initialize()
181 return self._real_extract(url)
182
183 def set_downloader(self, downloader):
184 """Sets the downloader for this IE."""
185 self._downloader = downloader
186
187 def _real_initialize(self):
188 """Real initialization process. Redefine in subclasses."""
189 pass
190
191 def _real_extract(self, url):
192 """Real extraction process. Redefine in subclasses."""
193 pass
194
195 @classmethod
196 def ie_key(cls):
197 """A string for getting the InfoExtractor with get_info_extractor"""
198 return cls.__name__[:-2]
199
200 @property
201 def IE_NAME(self):
202 return type(self).__name__[:-2]
203
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the response handle """
        # note semantics: None -> default "Downloading webpage" message,
        # False -> print nothing, anything else -> printed verbatim
        # (prefixed with video_id when one is given).
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen('%s' % (note,))
            else:
                self.to_screen('%s: %s' % (video_id, note))
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # errnote semantics mirror note: False -> fail silently,
            # None -> generic message, otherwise used as the message prefix.
            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'
            errmsg = '%s: %s' % (errnote, compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                # Non-fatal failure: warn and signal with the False sentinel.
                self._downloader.report_warning(errmsg)
                return False
226
    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns a tuple (page content as string, URL handle) """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
        if urlh is False:
            # _request_webpage only returns False when the error is non-fatal.
            assert not fatal
            return False
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        # Charset detection cascade: Content-Type header, then a <meta>
        # charset declaration within the first KiB of the body, then a
        # UTF-16 LE BOM, and finally a UTF-8 fallback.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                # Byte-order mark for little-endian UTF-16.
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                # url_or_request is a plain string, not a Request object.
                url = url_or_request
            self.to_screen('Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            basen = '%s_%s' % (video_id, url)
            if len(basen) > 240:
                # Keep the dump filename within filesystem limits while
                # keeping distinct URLs distinct via an md5 suffix.
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # The detected codec name is unknown to Python; fall back to UTF-8.
            content = webpage_bytes.decode('utf-8', 'replace')

        # Surface a helpful error when the response is a Websense block page.
        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in content[:512]):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)

        return (content, urlh)
291
292 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
293 """ Returns the data of the page as a string """
294 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
295 if res is False:
296 return res
297 else:
298 content, _ = res
299 return content
300
301 def _download_xml(self, url_or_request, video_id,
302 note='Downloading XML', errnote='Unable to download XML',
303 transform_source=None, fatal=True):
304 """Return the xml as an xml.etree.ElementTree.Element"""
305 xml_string = self._download_webpage(
306 url_or_request, video_id, note, errnote, fatal=fatal)
307 if xml_string is False:
308 return xml_string
309 if transform_source:
310 xml_string = transform_source(xml_string)
311 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
312
313 def _download_json(self, url_or_request, video_id,
314 note='Downloading JSON metadata',
315 errnote='Unable to download JSON metadata',
316 transform_source=None,
317 fatal=True):
318 json_string = self._download_webpage(
319 url_or_request, video_id, note, errnote, fatal=fatal)
320 if (not fatal) and json_string is False:
321 return None
322 if transform_source:
323 json_string = transform_source(json_string)
324 try:
325 return json.loads(json_string)
326 except ValueError as ve:
327 raise ExtractorError('Failed to download JSON', cause=ve)
328
329 def report_warning(self, msg, video_id=None):
330 idstr = '' if video_id is None else '%s: ' % video_id
331 self._downloader.report_warning(
332 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
333
334 def to_screen(self, msg):
335 """Print msg to screen, prefixing it with '[ie_name]'"""
336 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
337
338 def report_extraction(self, id_or_name):
339 """Report information extraction."""
340 self.to_screen('%s: Extracting information' % id_or_name)
341
342 def report_download_webpage(self, video_id):
343 """Report webpage download."""
344 self.to_screen('%s: Downloading webpage' % video_id)
345
346 def report_age_confirmation(self):
347 """Report attempt to confirm age."""
348 self.to_screen('Confirming age')
349
350 def report_login(self):
351 """Report attempt to log in."""
352 self.to_screen('Logging in')
353
354 #Methods for following #608
355 @staticmethod
356 def url_result(url, ie=None, video_id=None):
357 """Returns a url that points to a page that should be processed"""
358 #TODO: ie should be the class used for getting the info
359 video_info = {'_type': 'url',
360 'url': url,
361 'ie_key': ie}
362 if video_id is not None:
363 video_info['id'] = video_id
364 return video_info
365 @staticmethod
366 def playlist_result(entries, playlist_id=None, playlist_title=None):
367 """Returns a playlist"""
368 video_info = {'_type': 'playlist',
369 'entries': entries}
370 if playlist_id:
371 video_info['id'] = playlist_id
372 if playlist_title:
373 video_info['title'] = playlist_title
374 return video_info
375
376 def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
377 """
378 Perform a regex search on the given string, using a single or a list of
379 patterns returning the first matching group.
380 In case of failure return a default value or raise a WARNING or a
381 RegexNotFoundError, depending on fatal, specifying the field name.
382 """
383 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
384 mobj = re.search(pattern, string, flags)
385 else:
386 for p in pattern:
387 mobj = re.search(p, string, flags)
388 if mobj:
389 break
390
391 if os.name != 'nt' and sys.stderr.isatty():
392 _name = '\033[0;34m%s\033[0m' % name
393 else:
394 _name = name
395
396 if mobj:
397 # return the first matching group
398 return next(g for g in mobj.groups() if g is not None)
399 elif default is not _NO_DEFAULT:
400 return default
401 elif fatal:
402 raise RegexNotFoundError('Unable to extract %s' % _name)
403 else:
404 self._downloader.report_warning('unable to extract %s; '
405 'please report this issue on http://yt-dl.org/bug' % _name)
406 return None
407
408 def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
409 """
410 Like _search_regex, but strips HTML tags and unescapes entities.
411 """
412 res = self._search_regex(pattern, string, name, default, fatal, flags)
413 if res:
414 return clean_html(res).strip()
415 else:
416 return res
417
    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # .netrc problems are non-fatal: warn and fall through with
                # whatever credentials (possibly none) were gathered.
                self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))

        return (username, password)
447
448 def _get_tfa_info(self):
449 """
450 Get the two-factor authentication info
451 TODO - asking the user will be required for sms/phone verify
452 currently just uses the command line option
453 If there's no info available, return None
454 """
455 if self._downloader is None:
456 return None
457 downloader_params = self._downloader.params
458
459 if downloader_params.get('twofactor', None) is not None:
460 return downloader_params['twofactor']
461
462 return None
463
464 # Helper functions for extracting OpenGraph info
465 @staticmethod
466 def _og_regexes(prop):
467 content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
468 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
469 template = r'<meta[^>]+?%s[^>]+?%s'
470 return [
471 template % (property_re, content_re),
472 template % (content_re, property_re),
473 ]
474
475 def _og_search_property(self, prop, html, name=None, **kargs):
476 if name is None:
477 name = 'OpenGraph %s' % prop
478 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
479 if escaped is None:
480 return None
481 return unescapeHTML(escaped)
482
483 def _og_search_thumbnail(self, html, **kargs):
484 return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
485
486 def _og_search_description(self, html, **kargs):
487 return self._og_search_property('description', html, fatal=False, **kargs)
488
489 def _og_search_title(self, html, **kargs):
490 return self._og_search_property('title', html, **kargs)
491
492 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
493 regexes = self._og_regexes('video') + self._og_regexes('video:url')
494 if secure:
495 regexes = self._og_regexes('video:secure_url') + regexes
496 return self._html_search_regex(regexes, html, name, **kargs)
497
498 def _og_search_url(self, html, **kargs):
499 return self._og_search_property('url', html, **kargs)
500
501 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
502 if display_name is None:
503 display_name = name
504 return self._html_search_regex(
505 r'''(?ix)<meta
506 (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
507 [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
508 html, display_name, fatal=fatal, **kwargs)
509
510 def _dc_search_uploader(self, html):
511 return self._html_search_meta('dc.creator', html, 'uploader')
512
513 def _rta_search(self, html):
514 # See http://www.rtalabel.org/index.php?content=howtofaq#single
515 if re.search(r'(?ix)<meta\s+name="rating"\s+'
516 r' content="RTA-5042-1996-1400-1577-RTA"',
517 html):
518 return 18
519 return 0
520
521 def _media_rating_search(self, html):
522 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
523 rating = self._html_search_meta('rating', html)
524
525 if not rating:
526 return None
527
528 RATING_TABLE = {
529 'safe for kids': 0,
530 'general': 8,
531 '14 years': 14,
532 'mature': 17,
533 'restricted': 19,
534 }
535 return RATING_TABLE.get(rating.lower(), None)
536
537 def _twitter_search_player(self, html):
538 return self._html_search_meta('twitter:player', html,
539 'twitter card player')
540
    def _sort_formats(self, formats):
        """Sort *formats* in place from worst to best quality."""
        if not formats:
            raise ExtractorError('No video formats found')

        def _formats_key(f):
            # Build a tuple compared lexicographically; each slot uses -1 for
            # "unknown" so known values always rank above missing ones.
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            # Explicit 'preference' trumps everything else; otherwise derive
            # a small default from the protocol.
            preference = f.get('preference')
            if preference is None:
                proto = f.get('protocol')
                if proto is None:
                    proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

                preference = 0 if proto in ['http', 'https'] else -0.1
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            if f.get('vcodec') == 'none':  # audio only
                # Audio-only entries rank by audio container preference.
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            return (
                preference,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                ext_preference,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('format_id'),
            )
        formats.sort(key=_formats_key)
597
    def http_scheme(self):
        """ Either "http:" or "https:", depending on the user's preferences """
        return (
            'http:'
            if self._downloader.params.get('prefer_insecure', False)
            else 'https:')
604
605 def _proto_relative_url(self, url, scheme=None):
606 if url is None:
607 return url
608 if url.startswith('//'):
609 if scheme is None:
610 scheme = self.http_scheme()
611 return scheme + url
612 else:
613 return url
614
615 def _sleep(self, timeout, video_id, msg_template=None):
616 if msg_template is None:
617 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
618 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
619 self.to_screen(msg)
620 time.sleep(timeout)
621
622 def _extract_f4m_formats(self, manifest_url, video_id):
623 manifest = self._download_xml(
624 manifest_url, video_id, 'Downloading f4m manifest',
625 'Unable to download f4m manifest')
626
627 formats = []
628 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
629 for i, media_el in enumerate(media_nodes):
630 tbr = int_or_none(media_el.attrib.get('bitrate'))
631 format_id = 'f4m-%d' % (i if tbr is None else tbr)
632 formats.append({
633 'format_id': format_id,
634 'url': manifest_url,
635 'ext': 'flv',
636 'tbr': tbr,
637 'width': int_or_none(media_el.attrib.get('width')),
638 'height': int_or_none(media_el.attrib.get('height')),
639 })
640 self._sort_formats(formats)
641
642 return formats
643
    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None):
        """Parse an HLS master playlist into a sorted formats list."""

        # Always expose the master playlist itself as a low-preference "meta"
        # format so quality selection can be deferred to the player.
        formats = [{
            'format_id': 'm3u8-meta',
            'url': m3u8_url,
            'ext': ext,
            'protocol': 'm3u8',
            'preference': -1,
            'resolution': 'multiple',
            'format_note': 'Quality selection URL',
        }]

        # Resolve playlist-relative entries against the playlist URL.
        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        m3u8_doc = self._download_webpage(m3u8_url, video_id)
        last_info = None
        # KEY=value pairs; values may be quoted (and then may contain commas).
        kv_rex = re.compile(
            r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                # Attribute line describing the variant on the next URI line.
                last_info = {}
                for m in kv_rex.finditer(line):
                    v = m.group('val')
                    if v.startswith('"'):
                        v = v[1:-1]
                    last_info[m.group('key')] = v
            elif line.startswith('#') or not line.strip():
                # Other tags and blank lines carry no format information.
                continue
            else:
                # URI line: belongs to the preceding #EXT-X-STREAM-INF, if any.
                if last_info is None:
                    formats.append({'url': format_url(line)})
                    continue
                tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)

                f = {
                    'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
                    'url': format_url(line.strip()),
                    'tbr': tbr,
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                codecs = last_info.get('CODECS')
                if codecs:
                    # TODO: looks like video codec is not always necessarily goes first
                    va_codecs = codecs.split(',')
                    if va_codecs[0]:
                        f['vcodec'] = va_codecs[0].partition('.')[0]
                    if len(va_codecs) > 1 and va_codecs[1]:
                        f['acodec'] = va_codecs[1].partition('.')[0]
                resolution = last_info.get('RESOLUTION')
                if resolution:
                    width_str, height_str = resolution.split('x')
                    f['width'] = int(width_str)
                    f['height'] = int(height_str)
                formats.append(f)
                last_info = {}
        self._sort_formats(formats)
        return formats
707
708
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # '' -> one result; 'all' -> everything; digits -> that many results.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            # Clamp to the extractor's limit, but tell the user.
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY