youtube_dl/extractor/common.py
1 from __future__ import unicode_literals
2
3 import base64
4 import datetime
5 import hashlib
6 import json
7 import netrc
8 import os
9 import re
10 import socket
11 import sys
12 import time
13 import xml.etree.ElementTree
14
15 from ..utils import (
16 compat_http_client,
17 compat_urllib_error,
18 compat_urllib_parse_urlparse,
19 compat_urlparse,
20 compat_str,
21
22 clean_html,
23 compiled_regex_type,
24 ExtractorError,
25 float_or_none,
26 int_or_none,
27 RegexNotFoundError,
28 sanitize_filename,
29 unescapeHTML,
30 )
31 _NO_DEFAULT = object()
32
33
34 class InfoExtractor(object):
35 """Information Extractor class.
36
37 Information extractors are the classes that, given a URL, extract
38 information about the video (or videos) the URL refers to. This
39 information includes the real video URL, the video title, author and
40 others. The information is stored in a dictionary which is then
41 passed to the FileDownloader. The FileDownloader processes this
42 information, possibly downloading the video to the file system, among
43 other possible outcomes.
44
45 The dictionaries must include the following fields:
46
47 id: Video identifier.
48 title: Video title, unescaped.
49
50 Additionally, it must contain either a formats entry or a url one:
51
52 formats: A list of dictionaries for each format available, ordered
53 from worst to best quality.
54
55 Potential fields:
56 * url Mandatory. The URL of the video file
57 * ext Will be calculated from url if missing
58 * format A human-readable description of the format
59 ("mp4 container with h264/opus").
60 Calculated from the format_id, width, height,
61 and format_note fields if missing.
62 * format_id A short description of the format
63 ("mp4_h264_opus" or "19").
64 Technically optional, but strongly recommended.
65 * format_note Additional info about the format
66 ("3D" or "DASH video")
67 * width Width of the video, if known
68 * height Height of the video, if known
69 * resolution Textual description of width and height
70 * tbr Average bitrate of audio and video in KBit/s
71 * abr Average audio bitrate in KBit/s
72 * acodec Name of the audio codec in use
73 * asr Audio sampling rate in Hertz
74 * vbr Average video bitrate in KBit/s
75 * vcodec Name of the video codec in use
76 * container Name of the container format
77 * filesize The number of bytes, if known in advance
78 * filesize_approx An estimate for the number of bytes
79 * player_url SWF Player URL (used for rtmpdump).
80 * protocol The protocol that will be used for the actual
81 download, lower-case.
82 "http", "https", "rtsp", "rtmp", "m3u8" or so.
83 * preference Order number of this format. If this field is
84 present and not None, the formats get sorted
85 by this field, regardless of all other values.
86 -1 for default (order by other properties),
87 -2 or smaller for less than default.
88 * quality Order number of the video quality of this
89 format, irrespective of the file format.
90 -1 for default (order by other properties),
91 -2 or smaller for less than default.
92 * http_referer HTTP Referer header value to set.
93 * http_method HTTP method to use for the download.
94 * http_headers A dictionary of additional HTTP headers
95 to add to the request.
96 * http_post_data Additional data to send with a POST
97 request.
98 url: Final video URL.
99 ext: Video filename extension.
100 format: The video format, defaults to ext (used for --get-format)
101 player_url: SWF Player URL (used for rtmpdump).
102
103 The following fields are optional:
104
105 display_id An alternative identifier for the video, not necessarily
106 unique, but available before title. Typically, id is
107 something like "4234987", title "Dancing naked mole rats",
108 and display_id "dancing-naked-mole-rats"
109 thumbnails: A list of dictionaries, with the following entries:
110 * "url"
111 * "width" (optional, int)
112 * "height" (optional, int)
113 * "resolution" (optional, string "{width}x{height}",
114 deprecated)
115 thumbnail: Full URL to a video thumbnail image.
116 description: One-line video description.
117 uploader: Full name of the video uploader.
118 timestamp: UNIX timestamp of the moment the video became available.
119 upload_date: Video upload date (YYYYMMDD).
120 If not explicitly set, calculated from timestamp.
121 uploader_id: Nickname or id of the video uploader.
122 location: Physical location where the video was filmed.
123 subtitles: The subtitle file contents as a dictionary in the format
124 {language: subtitles}.
125 duration: Length of the video in seconds, as an integer.
126 view_count: How many users have watched the video on the platform.
127 like_count: Number of positive ratings of the video
128 dislike_count: Number of negative ratings of the video
129 comment_count: Number of comments on the video
130 age_limit: Age restriction for the video, as an integer (years)
131 webpage_url: The URL to the video webpage; if given to youtube-dl it
132 should yield the same result again. (It will be set
133 by YoutubeDL if it's missing)
134 categories: A list of categories that the video falls in, for example
135 ["Sports", "Berlin"]
136 is_live: True, False, or None (=unknown). Whether this video is a
137 live stream rather than a fixed-length video.
138
139 Unless mentioned otherwise, the fields should be Unicode strings.
140
141 Subclasses of this one should re-define the _real_initialize() and
142 _real_extract() methods and define a _VALID_URL regexp.
143 Probably, they should also be added to the list of extractors.
144
145 Finally, the _WORKING attribute should be set to False for broken IEs
146 in order to warn the users and skip the tests.
147 """
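# Illustrative sketch (not from the original source; IDs and URLs are
# hypothetical): a minimal dictionary a subclass's _real_extract() might
# return. Only 'id', 'title' and a 'url' or 'formats' entry are required;
# everything else is optional metadata described above.
#
#     def _real_extract(self, url):
#         video_id = self._match_id(url)
#         return {
#             'id': video_id,
#             'title': 'Example video',
#             'formats': [{
#                 'format_id': 'mp4-360p',
#                 'url': 'https://cdn.example.com/%s_360p.mp4' % video_id,
#                 'ext': 'mp4',
#                 'height': 360,
#             }, {
#                 'format_id': 'mp4-720p',
#                 'url': 'https://cdn.example.com/%s_720p.mp4' % video_id,
#                 'ext': 'mp4',
#                 'height': 720,
#             }],
#         }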
148
149 _ready = False
150 _downloader = None
151 _WORKING = True
152
153 def __init__(self, downloader=None):
154 """Constructor. Receives an optional downloader."""
155 self._ready = False
156 self.set_downloader(downloader)
157
158 @classmethod
159 def suitable(cls, url):
160 """Receives a URL and returns True if suitable for this IE."""
161
162 # This does not use has/getattr intentionally - we want to know whether
163 # we have cached the regexp for *this* class, whereas getattr would also
164 # match the superclass
165 if '_VALID_URL_RE' not in cls.__dict__:
166 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
167 return cls._VALID_URL_RE.match(url) is not None
168
169 @classmethod
170 def _match_id(cls, url):
171 if '_VALID_URL_RE' not in cls.__dict__:
172 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
173 m = cls._VALID_URL_RE.match(url)
174 assert m
175 return m.group('id')
176
177 @classmethod
178 def working(cls):
179 """Getter method for _WORKING."""
180 return cls._WORKING
181
182 def initialize(self):
183 """Initializes an instance (authentication, etc)."""
184 if not self._ready:
185 self._real_initialize()
186 self._ready = True
187
188 def extract(self, url):
189 """Extracts URL information and returns it as a list of dicts."""
190 self.initialize()
191 return self._real_extract(url)
192
193 def set_downloader(self, downloader):
194 """Sets the downloader for this IE."""
195 self._downloader = downloader
196
197 def _real_initialize(self):
198 """Real initialization process. Redefine in subclasses."""
199 pass
200
201 def _real_extract(self, url):
202 """Real extraction process. Redefine in subclasses."""
203 pass
204
205 @classmethod
206 def ie_key(cls):
207 """A string for getting the InfoExtractor with get_info_extractor"""
208 return cls.__name__[:-2]
209
210 @property
211 def IE_NAME(self):
212 return type(self).__name__[:-2]
213
214 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
215 """ Returns the response handle """
216 if note is None:
217 self.report_download_webpage(video_id)
218 elif note is not False:
219 if video_id is None:
220 self.to_screen('%s' % (note,))
221 else:
222 self.to_screen('%s: %s' % (video_id, note))
223 try:
224 return self._downloader.urlopen(url_or_request)
225 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
226 if errnote is False:
227 return False
228 if errnote is None:
229 errnote = 'Unable to download webpage'
230 errmsg = '%s: %s' % (errnote, compat_str(err))
231 if fatal:
232 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
233 else:
234 self._downloader.report_warning(errmsg)
235 return False
236
237 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
238 """ Returns a tuple (page content as string, URL handle) """
239
240 # Strip hashes from the URL (#1038)
241 if isinstance(url_or_request, (compat_str, str)):
242 url_or_request = url_or_request.partition('#')[0]
243
244 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
245 if urlh is False:
246 assert not fatal
247 return False
248 content_type = urlh.headers.get('Content-Type', '')
249 webpage_bytes = urlh.read()
250 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
251 if m:
252 encoding = m.group(1)
253 else:
254 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
255 webpage_bytes[:1024])
256 if m:
257 encoding = m.group(1).decode('ascii')
258 elif webpage_bytes.startswith(b'\xff\xfe'):
259 encoding = 'utf-16'
260 else:
261 encoding = 'utf-8'
262 if self._downloader.params.get('dump_intermediate_pages', False):
263 try:
264 url = url_or_request.get_full_url()
265 except AttributeError:
266 url = url_or_request
267 self.to_screen('Dumping request to ' + url)
268 dump = base64.b64encode(webpage_bytes).decode('ascii')
269 self._downloader.to_screen(dump)
270 if self._downloader.params.get('write_pages', False):
271 try:
272 url = url_or_request.get_full_url()
273 except AttributeError:
274 url = url_or_request
275 basen = '%s_%s' % (video_id, url)
276 if len(basen) > 240:
277 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
278 basen = basen[:240 - len(h)] + h
279 raw_filename = basen + '.dump'
280 filename = sanitize_filename(raw_filename, restricted=True)
281 self.to_screen('Saving request to ' + filename)
282 with open(filename, 'wb') as outf:
283 outf.write(webpage_bytes)
284
285 try:
286 content = webpage_bytes.decode(encoding, 'replace')
287 except LookupError:
288 content = webpage_bytes.decode('utf-8', 'replace')
289
290 if ('<title>Access to this site is blocked</title>' in content and
291 'Websense' in content[:512]):
292 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
293 blocked_iframe = self._html_search_regex(
294 r'<iframe src="([^"]+)"', content,
295 'Websense information URL', default=None)
296 if blocked_iframe:
297 msg += ' Visit %s for more details' % blocked_iframe
298 raise ExtractorError(msg, expected=True)
299
300 return (content, urlh)
301
302 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
303 """ Returns the data of the page as a string """
304 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
305 if res is False:
306 return res
307 else:
308 content, _ = res
309 return content
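# Usage sketch (illustrative; the regex and variable names are hypothetical):
# a typical extractor downloads the page once and then scrapes it, e.g.:
#
#     webpage = self._download_webpage(url, video_id)
#     title = self._html_search_regex(
#         r'<h1[^>]*>(.+?)</h1>', webpage, 'title')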
310
311 def _download_xml(self, url_or_request, video_id,
312 note='Downloading XML', errnote='Unable to download XML',
313 transform_source=None, fatal=True):
314 """Return the xml as an xml.etree.ElementTree.Element"""
315 xml_string = self._download_webpage(
316 url_or_request, video_id, note, errnote, fatal=fatal)
317 if xml_string is False:
318 return xml_string
319 if transform_source:
320 xml_string = transform_source(xml_string)
321 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
322
323 def _download_json(self, url_or_request, video_id,
324 note='Downloading JSON metadata',
325 errnote='Unable to download JSON metadata',
326 transform_source=None,
327 fatal=True):
328 json_string = self._download_webpage(
329 url_or_request, video_id, note, errnote, fatal=fatal)
330 if (not fatal) and json_string is False:
331 return None
332 if transform_source:
333 json_string = transform_source(json_string)
334 try:
335 return json.loads(json_string)
336 except ValueError as ve:
337 raise ExtractorError('Failed to download JSON', cause=ve)
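# Usage sketch (illustrative; the endpoint is hypothetical): _download_json()
# is _download_webpage() plus json.loads(), so metadata APIs are fetched as:
#
#     data = self._download_json(
#         'https://api.example.com/videos/%s' % video_id, video_id,
#         note='Downloading video metadata')
#     title = data['title']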
338
339 def report_warning(self, msg, video_id=None):
340 idstr = '' if video_id is None else '%s: ' % video_id
341 self._downloader.report_warning(
342 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
343
344 def to_screen(self, msg):
345 """Print msg to screen, prefixing it with '[ie_name]'"""
346 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
347
348 def report_extraction(self, id_or_name):
349 """Report information extraction."""
350 self.to_screen('%s: Extracting information' % id_or_name)
351
352 def report_download_webpage(self, video_id):
353 """Report webpage download."""
354 self.to_screen('%s: Downloading webpage' % video_id)
355
356 def report_age_confirmation(self):
357 """Report attempt to confirm age."""
358 self.to_screen('Confirming age')
359
360 def report_login(self):
361 """Report attempt to log in."""
362 self.to_screen('Logging in')
363
364 # Methods for following #608
365 @staticmethod
366 def url_result(url, ie=None, video_id=None):
367 """Returns a url that points to a page that should be processed"""
368 # TODO: ie should be the class used for getting the info
369 video_info = {'_type': 'url',
370 'url': url,
371 'ie_key': ie}
372 if video_id is not None:
373 video_info['id'] = video_id
374 return video_info
375 @staticmethod
376 def playlist_result(entries, playlist_id=None, playlist_title=None):
377 """Returns a playlist"""
378 video_info = {'_type': 'playlist',
379 'entries': entries}
380 if playlist_id:
381 video_info['id'] = playlist_id
382 if playlist_title:
383 video_info['title'] = playlist_title
384 return video_info
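# Usage sketch (illustrative; IDs and URLs are hypothetical): these helpers
# build the special dicts that tell the downloader to process further URLs:
#
#     # Hand an embedded video over to another extractor:
#     return self.url_result(embed_url, ie='Youtube')
#
#     # Wrap several entries into a single playlist result:
#     entries = [self.url_result(u) for u in video_urls]
#     return self.playlist_result(entries, playlist_id, playlist_title)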
385
386 def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
387 """
388 Perform a regex search on the given string, using a single pattern or a
389 list of patterns, and return the first matching group.
390 On failure, return the default value, report a warning, or raise a
391 RegexNotFoundError (depending on default and fatal), naming the field.
392 """
393 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
394 mobj = re.search(pattern, string, flags)
395 else:
396 for p in pattern:
397 mobj = re.search(p, string, flags)
398 if mobj:
399 break
400
401 if os.name != 'nt' and sys.stderr.isatty():
402 _name = '\033[0;34m%s\033[0m' % name
403 else:
404 _name = name
405
406 if mobj:
407 # return the first matching group
408 return next(g for g in mobj.groups() if g is not None)
409 elif default is not _NO_DEFAULT:
410 return default
411 elif fatal:
412 raise RegexNotFoundError('Unable to extract %s' % _name)
413 else:
414 self._downloader.report_warning('unable to extract %s; '
415 'please report this issue on http://yt-dl.org/bug' % _name)
416 return None
417
418 def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
419 """
420 Like _search_regex, but strips HTML tags and unescapes entities.
421 """
422 res = self._search_regex(pattern, string, name, default, fatal, flags)
423 if res:
424 return clean_html(res).strip()
425 else:
426 return res
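# Usage sketch (illustrative; the patterns and page markup are hypothetical):
# both helpers return the first matching group, so the value to extract goes
# in parentheses:
#
#     uploader = self._search_regex(
#         r'"uploader"\s*:\s*"([^"]+)"', webpage, 'uploader', fatal=False)
#     title = self._html_search_regex(
#         r'<title>(.+?)</title>', webpage, 'title')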
427
428 def _get_login_info(self):
429 """
430 Get the login info as (username, password).
431 It will look in the netrc file using the _NETRC_MACHINE value.
432 If there's no info available, return (None, None).
433 """
434 if self._downloader is None:
435 return (None, None)
436
437 username = None
438 password = None
439 downloader_params = self._downloader.params
440
441 # Attempt to use provided username and password or .netrc data
442 if downloader_params.get('username', None) is not None:
443 username = downloader_params['username']
444 password = downloader_params['password']
445 elif downloader_params.get('usenetrc', False):
446 try:
447 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
448 if info is not None:
449 username = info[0]
450 password = info[2]
451 else:
452 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
453 except (IOError, netrc.NetrcParseError) as err:
454 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
455
456 return (username, password)
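# Configuration sketch (illustrative; machine name and credentials are
# hypothetical): with --netrc, credentials are looked up in ~/.netrc under
# the extractor's _NETRC_MACHINE, e.g. for _NETRC_MACHINE = 'example':
#
#     machine example login user@example.com password hunter2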
457
458 def _get_tfa_info(self):
459 """
460 Get the two-factor authentication info.
461 TODO: asking the user will be required for SMS/phone verification;
462 currently this just uses the command-line option.
463 If there's no info available, return None.
464 """
465 if self._downloader is None:
466 return None
467 downloader_params = self._downloader.params
468
469 if downloader_params.get('twofactor', None) is not None:
470 return downloader_params['twofactor']
471
472 return None
473
474 # Helper functions for extracting OpenGraph info
475 @staticmethod
476 def _og_regexes(prop):
477 content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
478 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
479 template = r'<meta[^>]+?%s[^>]+?%s'
480 return [
481 template % (property_re, content_re),
482 template % (content_re, property_re),
483 ]
484
485 def _og_search_property(self, prop, html, name=None, **kargs):
486 if name is None:
487 name = 'OpenGraph %s' % prop
488 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
489 if escaped is None:
490 return None
491 return unescapeHTML(escaped)
492
493 def _og_search_thumbnail(self, html, **kargs):
494 return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
495
496 def _og_search_description(self, html, **kargs):
497 return self._og_search_property('description', html, fatal=False, **kargs)
498
499 def _og_search_title(self, html, **kargs):
500 return self._og_search_property('title', html, **kargs)
501
502 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
503 regexes = self._og_regexes('video') + self._og_regexes('video:url')
504 if secure:
505 regexes = self._og_regexes('video:secure_url') + regexes
506 return self._html_search_regex(regexes, html, name, **kargs)
507
508 def _og_search_url(self, html, **kargs):
509 return self._og_search_property('url', html, **kargs)
510
511 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
512 if display_name is None:
513 display_name = name
514 return self._html_search_regex(
515 r'''(?ix)<meta
516 (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
517 [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
518 html, display_name, fatal=fatal, **kwargs)
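# Usage sketch (illustrative markup): the OpenGraph and meta helpers read
# values from tags such as
#
#     <meta property="og:title" content="Example title" />
#     <meta name="description" content="Example description" />
#
# for instance:
#
#     title = self._og_search_title(webpage)
#     description = self._html_search_meta('description', webpage)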
519
520 def _dc_search_uploader(self, html):
521 return self._html_search_meta('dc.creator', html, 'uploader')
522
523 def _rta_search(self, html):
524 # See http://www.rtalabel.org/index.php?content=howtofaq#single
525 if re.search(r'(?ix)<meta\s+name="rating"\s+'
526 r' content="RTA-5042-1996-1400-1577-RTA"',
527 html):
528 return 18
529 return 0
530
531 def _media_rating_search(self, html):
532 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
533 rating = self._html_search_meta('rating', html)
534
535 if not rating:
536 return None
537
538 RATING_TABLE = {
539 'safe for kids': 0,
540 'general': 8,
541 '14 years': 14,
542 'mature': 17,
543 'restricted': 19,
544 }
545 return RATING_TABLE.get(rating.lower(), None)
546
547 def _twitter_search_player(self, html):
548 return self._html_search_meta('twitter:player', html,
549 'twitter card player')
550
551 def _sort_formats(self, formats):
552 if not formats:
553 raise ExtractorError('No video formats found')
554
555 def _formats_key(f):
556 # TODO remove the following workaround
557 from ..utils import determine_ext
558 if not f.get('ext') and 'url' in f:
559 f['ext'] = determine_ext(f['url'])
560
561 preference = f.get('preference')
562 if preference is None:
563 proto = f.get('protocol')
564 if proto is None:
565 proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
566
567 preference = 0 if proto in ['http', 'https'] else -0.1
568 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
569 preference -= 0.5
570
571 if f.get('vcodec') == 'none': # audio only
572 if self._downloader.params.get('prefer_free_formats'):
573 ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
574 else:
575 ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
576 ext_preference = 0
577 try:
578 audio_ext_preference = ORDER.index(f['ext'])
579 except ValueError:
580 audio_ext_preference = -1
581 else:
582 if self._downloader.params.get('prefer_free_formats'):
583 ORDER = ['flv', 'mp4', 'webm']
584 else:
585 ORDER = ['webm', 'flv', 'mp4']
586 try:
587 ext_preference = ORDER.index(f['ext'])
588 except ValueError:
589 ext_preference = -1
590 audio_ext_preference = 0
591
592 return (
593 preference,
594 f.get('quality') if f.get('quality') is not None else -1,
595 f.get('height') if f.get('height') is not None else -1,
596 f.get('width') if f.get('width') is not None else -1,
597 ext_preference,
598 f.get('tbr') if f.get('tbr') is not None else -1,
599 f.get('vbr') if f.get('vbr') is not None else -1,
600 f.get('abr') if f.get('abr') is not None else -1,
601 audio_ext_preference,
602 f.get('filesize') if f.get('filesize') is not None else -1,
603 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
604 f.get('format_id'),
605 )
606 formats.sort(key=_formats_key)
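# Illustrative note: _formats_key() maps every format to a tuple (preference,
# quality, height, width, ext preference, bitrates, ...) and the list is
# sorted ascending, i.e. from worst to best. For two hypothetical HTTP
# formats that differ only in height,
#
#     {'format_id': 'http-480', 'url': 'https://cdn.example.com/480.mp4', 'height': 480}
#     {'format_id': 'http-720', 'url': 'https://cdn.example.com/720.mp4', 'height': 720}
#
# the 720p entry sorts last and is therefore treated as the best format.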
607
608 def http_scheme(self):
609 """ Either "http:" or "https:", depending on the user's preferences """
610 return (
611 'http:'
612 if self._downloader.params.get('prefer_insecure', False)
613 else 'https:')
614
615 def _proto_relative_url(self, url, scheme=None):
616 if url is None:
617 return url
618 if url.startswith('//'):
619 if scheme is None:
620 scheme = self.http_scheme()
621 return scheme + url
622 else:
623 return url
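# Illustrative example: only scheme-relative URLs are touched, so
# '//cdn.example.com/v.mp4' becomes 'https://cdn.example.com/v.mp4' (or the
# http: variant with --prefer-insecure), while absolute URLs pass through
# unchanged. The hostname here is hypothetical.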
624
625 def _sleep(self, timeout, video_id, msg_template=None):
626 if msg_template is None:
627 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
628 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
629 self.to_screen(msg)
630 time.sleep(timeout)
631
632 def _extract_f4m_formats(self, manifest_url, video_id):
633 manifest = self._download_xml(
634 manifest_url, video_id, 'Downloading f4m manifest',
635 'Unable to download f4m manifest')
636
637 formats = []
638 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
639 for i, media_el in enumerate(media_nodes):
640 tbr = int_or_none(media_el.attrib.get('bitrate'))
641 format_id = 'f4m-%d' % (i if tbr is None else tbr)
642 formats.append({
643 'format_id': format_id,
644 'url': manifest_url,
645 'ext': 'flv',
646 'tbr': tbr,
647 'width': int_or_none(media_el.attrib.get('width')),
648 'height': int_or_none(media_el.attrib.get('height')),
649 })
650 self._sort_formats(formats)
651
652 return formats
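# Illustrative sketch of the f4m manifest structure this parses (values are
# hypothetical): every <media> node becomes one flv format keyed on bitrate:
#
#     <manifest xmlns="http://ns.adobe.com/f4m/1.0">
#       <media bitrate="800" width="640" height="360" url="360p"/>
#       <media bitrate="2000" width="1280" height="720" url="720p"/>
#     </manifest>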
653
654 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
655 entry_protocol='m3u8', preference=None):
656
657 formats = [{
658 'format_id': 'm3u8-meta',
659 'url': m3u8_url,
660 'ext': ext,
661 'protocol': 'm3u8',
662 'preference': -1,
663 'resolution': 'multiple',
664 'format_note': 'Quality selection URL',
665 }]
666
667 format_url = lambda u: (
668 u
669 if re.match(r'^https?://', u)
670 else compat_urlparse.urljoin(m3u8_url, u))
671
672 m3u8_doc = self._download_webpage(m3u8_url, video_id)
673 last_info = None
674 kv_rex = re.compile(
675 r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
676 for line in m3u8_doc.splitlines():
677 if line.startswith('#EXT-X-STREAM-INF:'):
678 last_info = {}
679 for m in kv_rex.finditer(line):
680 v = m.group('val')
681 if v.startswith('"'):
682 v = v[1:-1]
683 last_info[m.group('key')] = v
684 elif line.startswith('#') or not line.strip():
685 continue
686 else:
687 if last_info is None:
688 formats.append({'url': format_url(line)})
689 continue
690 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
691
692 f = {
693 'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
694 'url': format_url(line.strip()),
695 'tbr': tbr,
696 'ext': ext,
697 'protocol': entry_protocol,
698 'preference': preference,
699 }
700 codecs = last_info.get('CODECS')
701 if codecs:
702 # TODO: it looks like the video codec does not always go first
703 va_codecs = codecs.split(',')
704 if va_codecs[0]:
705 f['vcodec'] = va_codecs[0].partition('.')[0]
706 if len(va_codecs) > 1 and va_codecs[1]:
707 f['acodec'] = va_codecs[1].partition('.')[0]
708 resolution = last_info.get('RESOLUTION')
709 if resolution:
710 width_str, height_str = resolution.split('x')
711 f['width'] = int(width_str)
712 f['height'] = int(height_str)
713 formats.append(f)
714 last_info = {}
715 self._sort_formats(formats)
716 return formats
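# Illustrative sketch of the master-playlist lines this parses (values are
# hypothetical): each #EXT-X-STREAM-INF line describes the variant URL on the
# following line, from which tbr, codecs and resolution are derived:
#
#     #EXT-X-STREAM-INF:BANDWIDTH=1280000,CODECS="avc1.42e01e,mp4a.40.2",RESOLUTION=640x360
#     low/index.m3u8
#     #EXT-X-STREAM-INF:BANDWIDTH=2560000,CODECS="avc1.4d401f,mp4a.40.2",RESOLUTION=1280x720
#     hi/index.m3u8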
717
718 def _live_title(self, name):
719 """ Generate the title for a live video """
720 now = datetime.datetime.now()
721 now_str = now.strftime("%Y-%m-%d %H:%M")
722 return name + ' ' + now_str
723
724 def _int(self, v, name, fatal=False, **kwargs):
725 res = int_or_none(v, **kwargs)
726 if 'get_attr' in kwargs:
727 print(getattr(v, kwargs['get_attr']))
728 if res is None:
729 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
730 if fatal:
731 raise ExtractorError(msg)
732 else:
733 self._downloader.report_warning(msg)
734 return res
735
736 def _float(self, v, name, fatal=False, **kwargs):
737 res = float_or_none(v, **kwargs)
738 if res is None:
739 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
740 if fatal:
741 raise ExtractorError(msg)
742 else:
743 self._downloader.report_warning(msg)
744 return res
745
746
747 class SearchInfoExtractor(InfoExtractor):
748 """
749 Base class for paged search queries extractors.
750 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
751 Instances should define _SEARCH_KEY and _MAX_RESULTS.
752 """
753
754 @classmethod
755 def _make_valid_url(cls):
756 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
757
758 @classmethod
759 def suitable(cls, url):
760 return re.match(cls._make_valid_url(), url) is not None
761
762 def _real_extract(self, query):
763 mobj = re.match(self._make_valid_url(), query)
764 if mobj is None:
765 raise ExtractorError('Invalid search query "%s"' % query)
766
767 prefix = mobj.group('prefix')
768 query = mobj.group('query')
769 if prefix == '':
770 return self._get_n_results(query, 1)
771 elif prefix == 'all':
772 return self._get_n_results(query, self._MAX_RESULTS)
773 else:
774 n = int(prefix)
775 if n <= 0:
776 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
777 elif n > self._MAX_RESULTS:
778 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
779 n = self._MAX_RESULTS
780 return self._get_n_results(query, n)
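# Illustrative example (the search key 'examplesearch' is hypothetical):
# queries are pseudo-URLs whose optional prefix selects the result count:
#
#     examplesearch:cute cats        -> first result only
#     examplesearch5:cute cats       -> first 5 results
#     examplesearchall:cute cats     -> up to _MAX_RESULTS results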
781
782 def _get_n_results(self, query, n):
783 """Get a specified number of results for a query"""
784 raise NotImplementedError("This method must be implemented by subclasses")
785
786 @property
787 def SEARCH_KEY(self):
788 return self._SEARCH_KEY