]> jfr.im git - yt-dlp.git/blob - youtube_dl/extractor/common.py
[youtube] Extract average rating (closes #2362)
[yt-dlp.git] / youtube_dl / extractor / common.py
1 from __future__ import unicode_literals
2
3 import base64
4 import datetime
5 import hashlib
6 import json
7 import netrc
8 import os
9 import re
10 import socket
11 import sys
12 import time
13 import xml.etree.ElementTree
14
15 from ..compat import (
16 compat_cookiejar,
17 compat_HTTPError,
18 compat_http_client,
19 compat_urllib_error,
20 compat_urllib_parse_urlparse,
21 compat_urlparse,
22 compat_str,
23 )
24 from ..utils import (
25 age_restricted,
26 clean_html,
27 compiled_regex_type,
28 ExtractorError,
29 float_or_none,
30 HEADRequest,
31 int_or_none,
32 RegexNotFoundError,
33 sanitize_filename,
34 unescapeHTML,
35 )
# Sentinel for "no default supplied" in _search_regex and friends, so that
# an explicit default of None can be distinguished from no default at all.
_NO_DEFAULT = object()
37
38
39 class InfoExtractor(object):
40 """Information Extractor class.
41
42 Information extractors are the classes that, given a URL, extract
43 information about the video (or videos) the URL refers to. This
44 information includes the real video URL, the video title, author and
45 others. The information is stored in a dictionary which is then
46 passed to the YoutubeDL. The YoutubeDL processes this
47 information possibly downloading the video to the file system, among
48 other possible outcomes.
49
50 The type field determines the type of the result.
51 By far the most common value (and the default if _type is missing) is
52 "video", which indicates a single video.
53
54 For a video, the dictionaries must include the following fields:
55
56 id: Video identifier.
57 title: Video title, unescaped.
58
59 Additionally, it must contain either a formats entry or a url one:
60
61 formats: A list of dictionaries for each format available, ordered
62 from worst to best quality.
63
64 Potential fields:
65 * url Mandatory. The URL of the video file
66 * ext Will be calculated from url if missing
67 * format A human-readable description of the format
68 ("mp4 container with h264/opus").
69 Calculated from the format_id, width, height.
70 and format_note fields if missing.
71 * format_id A short description of the format
72 ("mp4_h264_opus" or "19").
73 Technically optional, but strongly recommended.
74 * format_note Additional info about the format
75 ("3D" or "DASH video")
76 * width Width of the video, if known
77 * height Height of the video, if known
78 * resolution Textual description of width and height
79 * tbr Average bitrate of audio and video in KBit/s
80 * abr Average audio bitrate in KBit/s
81 * acodec Name of the audio codec in use
82 * asr Audio sampling rate in Hertz
83 * vbr Average video bitrate in KBit/s
84 * fps Frame rate
85 * vcodec Name of the video codec in use
86 * container Name of the container format
87 * filesize The number of bytes, if known in advance
88 * filesize_approx An estimate for the number of bytes
89 * player_url SWF Player URL (used for rtmpdump).
90 * protocol The protocol that will be used for the actual
91 download, lower-case.
92 "http", "https", "rtsp", "rtmp", "rtmpe",
93 "m3u8", or "m3u8_native".
94 * preference Order number of this format. If this field is
95 present and not None, the formats get sorted
96 by this field, regardless of all other values.
97 -1 for default (order by other properties),
98 -2 or smaller for less than default.
99 < -1000 to hide the format (if there is
100 another one which is strictly better)
101 * language_preference Is this in the correct requested
102 language?
103 10 if it's what the URL is about,
104 -1 for default (don't know),
105 -10 otherwise, other values reserved for now.
106 * quality Order number of the video quality of this
107 format, irrespective of the file format.
108 -1 for default (order by other properties),
109 -2 or smaller for less than default.
110 * source_preference Order number for this video source
111 (quality takes higher priority)
112 -1 for default (order by other properties),
113 -2 or smaller for less than default.
114 * http_method HTTP method to use for the download.
115 * http_headers A dictionary of additional HTTP headers
116 to add to the request.
117 * http_post_data Additional data to send with a POST
118 request.
119 * stretched_ratio If given and not 1, indicates that the
120 video's pixels are not square.
121 width : height ratio as float.
122 * no_resume The server does not support resuming the
123 (HTTP or RTMP) download. Boolean.
124
125 url: Final video URL.
126 ext: Video filename extension.
127 format: The video format, defaults to ext (used for --get-format)
128 player_url: SWF Player URL (used for rtmpdump).
129
130 The following fields are optional:
131
132 alt_title: A secondary title of the video.
133 display_id An alternative identifier for the video, not necessarily
134 unique, but available before title. Typically, id is
135 something like "4234987", title "Dancing naked mole rats",
136 and display_id "dancing-naked-mole-rats"
137 thumbnails: A list of dictionaries, with the following entries:
138 * "id" (optional, string) - Thumbnail format ID
139 * "url"
140 * "preference" (optional, int) - quality of the image
141 * "width" (optional, int)
142 * "height" (optional, int)
143 * "resolution" (optional, string "{width}x{height"},
144 deprecated)
145 thumbnail: Full URL to a video thumbnail image.
146 description: Full video description.
147 uploader: Full name of the video uploader.
148 creator: The main artist who created the video.
149 timestamp: UNIX timestamp of the moment the video became available.
150 upload_date: Video upload date (YYYYMMDD).
151 If not explicitly set, calculated from timestamp.
152 uploader_id: Nickname or id of the video uploader.
153 location: Physical location where the video was filmed.
154 subtitles: The subtitle file contents as a dictionary in the format
155 {language: subtitles}.
156 duration: Length of the video in seconds, as an integer.
157 view_count: How many users have watched the video on the platform.
158 like_count: Number of positive ratings of the video
159 dislike_count: Number of negative ratings of the video
160 average_rating: Average rating given by users, the scale used depends on the webpage
161 comment_count: Number of comments on the video
162 comments: A list of comments, each with one or more of the following
163 properties (all but one of text or html optional):
164 * "author" - human-readable name of the comment author
165 * "author_id" - user ID of the comment author
166 * "id" - Comment ID
167 * "html" - Comment as HTML
168 * "text" - Plain text of the comment
169 * "timestamp" - UNIX timestamp of comment
170 * "parent" - ID of the comment this one is replying to.
171 Set to "root" to indicate that this is a
172 comment to the original video.
173 age_limit: Age restriction for the video, as an integer (years)
174 webpage_url: The url to the video webpage, if given to youtube-dl it
175 should allow to get the same result again. (It will be set
176 by YoutubeDL if it's missing)
177 categories: A list of categories that the video falls in, for example
178 ["Sports", "Berlin"]
179 is_live: True, False, or None (=unknown). Whether this video is a
180 live stream that goes on instead of a fixed-length video.
181
182 Unless mentioned otherwise, the fields should be Unicode strings.
183
184 Unless mentioned otherwise, None is equivalent to absence of information.
185
186
187 _type "playlist" indicates multiple videos.
188 There must be a key "entries", which is a list, an iterable, or a PagedList
189 object, each element of which is a valid dictionary by this specification.
190
191 Additionally, playlists can have "title" and "id" attributes with the same
192 semantics as videos (see above).
193
194
195 _type "multi_video" indicates that there are multiple videos that
196 form a single show, for example multiple acts of an opera or TV episode.
197 It must have an entries key like a playlist and contain all the keys
198 required for a video at the same time.
199
200
201 _type "url" indicates that the video must be extracted from another
202 location, possibly by a different extractor. Its only required key is:
203 "url" - the next URL to extract.
204 The key "ie_key" can be set to the class name (minus the trailing "IE",
205 e.g. "Youtube") if the extractor class is known in advance.
206 Additionally, the dictionary may have any properties of the resolved entity
207 known in advance, for example "title" if the title of the referred video is
208 known ahead of time.
209
210
211 _type "url_transparent" entities have the same specification as "url", but
212 indicate that the given additional information is more precise than the one
213 associated with the resolved URL.
214 This is useful when a site employs a video service that hosts the video and
215 its technical metadata, but that video service does not embed a useful
216 title, description etc.
217
218
219 Subclasses of this one should re-define the _real_initialize() and
220 _real_extract() methods and define a _VALID_URL regexp.
221 Probably, they should also be added to the list of extractors.
222
223 Finally, the _WORKING attribute should be set to False for broken IEs
224 in order to warn the users and skip the tests.
225 """
226
# Whether _real_initialize() has already run for this instance (see initialize()).
_ready = False
# The YoutubeDL instance driving this extractor; set via set_downloader().
_downloader = None
# Subclasses set this to False to mark a broken extractor (warns users, skips tests).
_WORKING = True
230
def __init__(self, downloader=None):
    """Constructor. An optional downloader (YoutubeDL) may be attached."""
    self.set_downloader(downloader)
    self._ready = False
235
@classmethod
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""
    # Deliberately avoids has/getattr: the compiled pattern must be cached
    # on *this* class, not inherited from a superclass whose _VALID_URL
    # differs.
    cached = cls.__dict__.get('_VALID_URL_RE')
    if cached is None:
        cached = cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    return cached.match(url) is not None
246
@classmethod
def _match_id(cls, url):
    """Return the 'id' group of _VALID_URL matched against url."""
    # Same per-class regex caching as in suitable().
    cached = cls.__dict__.get('_VALID_URL_RE')
    if cached is None:
        cached = cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    mobj = cached.match(url)
    assert mobj
    return mobj.group('id')
254
@classmethod
def working(cls):
    """Return whether this extractor is marked as working (_WORKING)."""
    return cls._WORKING
259
def initialize(self):
    """Initializes an instance (authentication, etc).

    _real_initialize() runs at most once per instance.
    """
    if not self._ready:
        self._real_initialize()
        self._ready = True
265
266 def extract(self, url):
267 """Extracts URL information and returns it in list of dicts."""
268 try:
269 self.initialize()
270 return self._real_extract(url)
271 except ExtractorError:
272 raise
273 except compat_http_client.IncompleteRead as e:
274 raise ExtractorError('A network error has occured.', cause=e, expected=True)
275 except (KeyError, StopIteration) as e:
276 raise ExtractorError('An extractor error has occured.', cause=e)
277
def set_downloader(self, downloader):
    """Sets the downloader (a YoutubeDL instance) for this IE."""
    self._downloader = downloader
281
def _real_initialize(self):
    """Real initialization process. Redefine in subclasses."""
    # Base implementation is a no-op; subclasses log in, fetch tokens, etc.
    pass
285
def _real_extract(self, url):
    """Real extraction process. Redefine in subclasses."""
    # Base implementation is a no-op; subclasses return an info dict here.
    pass
289
@classmethod
def ie_key(cls):
    """A string for getting the InfoExtractor with get_info_extractor"""
    # Class name minus the conventional trailing "IE" suffix.
    return cls.__name__[:-2]
294
@property
def IE_NAME(self):
    # Human-readable extractor name: class name minus the trailing "IE".
    return type(self).__name__[:-2]
298
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """ Returns the response handle """
    # note=None -> default "Downloading webpage" message; note=False -> silent.
    if note is None:
        self.report_download_webpage(video_id)
    elif note is not False:
        if video_id is None:
            self.to_screen('%s' % (note,))
        else:
            self.to_screen('%s: %s' % (video_id, note))
    try:
        return self._downloader.urlopen(url_or_request)
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        # errnote=False suppresses even the warning; otherwise fatal
        # decides between raising and warn-and-return-False.
        if errnote is False:
            return False
        if errnote is None:
            errnote = 'Unable to download webpage'
        errmsg = '%s: %s' % (errnote, compat_str(err))
        if fatal:
            raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
        else:
            self._downloader.report_warning(errmsg)
            return False
321
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """ Returns a tuple (page content as string, URL handle) """
    # Strip hashes from the URL (#1038)
    if isinstance(url_or_request, (compat_str, str)):
        url_or_request = url_or_request.partition('#')[0]

    urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
    if urlh is False:
        # _request_webpage only returns False in non-fatal mode.
        assert not fatal
        return False
    content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)
    return (content, urlh)
334
def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None):
    """Read and decode the body of an open URL handle.

    The encoding is taken from the Content-Type header, then from a
    <meta charset> declaration, then from a UTF-16 BOM, defaulting to
    UTF-8. Honours the dump_intermediate_pages and write_pages options
    and detects Websense-blocked pages.
    """
    content_type = urlh.headers.get('Content-Type', '')
    webpage_bytes = urlh.read()
    if prefix is not None:
        webpage_bytes = prefix + webpage_bytes
    m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
    if m:
        encoding = m.group(1)
    else:
        # No charset in the header: look inside the first KiB of the body.
        m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                      webpage_bytes[:1024])
        if m:
            encoding = m.group(1).decode('ascii')
        elif webpage_bytes.startswith(b'\xff\xfe'):
            # UTF-16 little-endian BOM
            encoding = 'utf-16'
        else:
            encoding = 'utf-8'
    if self._downloader.params.get('dump_intermediate_pages', False):
        try:
            url = url_or_request.get_full_url()
        except AttributeError:
            url = url_or_request
        self.to_screen('Dumping request to ' + url)
        dump = base64.b64encode(webpage_bytes).decode('ascii')
        self._downloader.to_screen(dump)
    if self._downloader.params.get('write_pages', False):
        try:
            url = url_or_request.get_full_url()
        except AttributeError:
            url = url_or_request
        basen = '%s_%s' % (video_id, url)
        if len(basen) > 240:
            # Keep the dump filename under filesystem limits; disambiguate
            # truncated names with an md5 of the full name.
            h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
            basen = basen[:240 - len(h)] + h
        raw_filename = basen + '.dump'
        filename = sanitize_filename(raw_filename, restricted=True)
        self.to_screen('Saving request to ' + filename)
        # Working around MAX_PATH limitation on Windows (see
        # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
        if os.name == 'nt':
            absfilepath = os.path.abspath(filename)
            if len(absfilepath) > 259:
                filename = '\\\\?\\' + absfilepath
        with open(filename, 'wb') as outf:
            outf.write(webpage_bytes)

    try:
        content = webpage_bytes.decode(encoding, 'replace')
    except LookupError:
        # Page declared an encoding Python does not know; fall back to UTF-8.
        content = webpage_bytes.decode('utf-8', 'replace')

    if ('<title>Access to this site is blocked</title>' in content and
            'Websense' in content[:512]):
        msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
        blocked_iframe = self._html_search_regex(
            r'<iframe src="([^"]+)"', content,
            'Websense information URL', default=None)
        if blocked_iframe:
            msg += ' Visit %s for more details' % blocked_iframe
        raise ExtractorError(msg, expected=True)

    return content
397
def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5):
    """ Returns the data of the page as a string """
    # Retries on IncompleteRead up to `tries` times, sleeping `timeout`
    # seconds between attempts; re-raises once the attempts are exhausted.
    success = False
    try_count = 0
    while success is False:
        try:
            res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
            success = True
        except compat_http_client.IncompleteRead as e:
            try_count += 1
            if try_count >= tries:
                raise e
            self._sleep(timeout, video_id)
    if res is False:
        # Non-fatal download failure: propagate False to the caller.
        return res
    else:
        content, _ = res
        return content
416
def _download_xml(self, url_or_request, video_id,
                  note='Downloading XML', errnote='Unable to download XML',
                  transform_source=None, fatal=True):
    """Download a URL and parse the body as XML.

    Returns an xml.etree.ElementTree.Element, or False when fatal is
    False and the download failed.
    """
    xml_data = self._download_webpage(
        url_or_request, video_id, note, errnote, fatal=fatal)
    if xml_data is False:
        return xml_data
    if transform_source:
        # Optional hook to repair broken XML before parsing.
        xml_data = transform_source(xml_data)
    return xml.etree.ElementTree.fromstring(xml_data.encode('utf-8'))
428
def _download_json(self, url_or_request, video_id,
                   note='Downloading JSON metadata',
                   errnote='Unable to download JSON metadata',
                   transform_source=None,
                   fatal=True):
    """Download a URL and parse the body as JSON.

    Returns None when fatal is False and the download failed.
    """
    json_data = self._download_webpage(
        url_or_request, video_id, note, errnote, fatal=fatal)
    if json_data is False and not fatal:
        return None
    return self._parse_json(
        json_data, video_id, transform_source=transform_source, fatal=fatal)
440
441 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
442 if transform_source:
443 json_string = transform_source(json_string)
444 try:
445 return json.loads(json_string)
446 except ValueError as ve:
447 errmsg = '%s: Failed to parse JSON ' % video_id
448 if fatal:
449 raise ExtractorError(errmsg, cause=ve)
450 else:
451 self.report_warning(errmsg + str(ve))
452
def report_warning(self, msg, video_id=None):
    """Warn via the downloader, prefixed with the IE name and optional video id."""
    idstr = '' if video_id is None else '%s: ' % video_id
    self._downloader.report_warning(
        '[%s] %s%s' % (self.IE_NAME, idstr, msg))
457
def to_screen(self, msg):
    """Print msg to screen, prefixing it with '[ie_name]'"""
    self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
461
def report_extraction(self, id_or_name):
    """Report that information extraction has started."""
    self.to_screen('%s: Extracting information' % id_or_name)
465
def report_download_webpage(self, video_id):
    """Report that the webpage download has started."""
    self.to_screen('%s: Downloading webpage' % video_id)
469
def report_age_confirmation(self):
    """Report an attempt to confirm age on an age-gated site."""
    self.to_screen('Confirming age')
473
def report_login(self):
    """Report an attempt to log in."""
    self.to_screen('Logging in')
477
478 # Methods for following #608
479 @staticmethod
480 def url_result(url, ie=None, video_id=None):
481 """Returns a url that points to a page that should be processed"""
482 # TODO: ie should be the class used for getting the info
483 video_info = {'_type': 'url',
484 'url': url,
485 'ie_key': ie}
486 if video_id is not None:
487 video_info['id'] = video_id
488 return video_info
489
490 @staticmethod
491 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
492 """Returns a playlist"""
493 video_info = {'_type': 'playlist',
494 'entries': entries}
495 if playlist_id:
496 video_info['id'] = playlist_id
497 if playlist_title:
498 video_info['title'] = playlist_title
499 if playlist_description:
500 video_info['description'] = playlist_description
501 return video_info
502
def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.
    """
    # Fix: initialize mobj so an empty pattern list results in a clean
    # "not found" path instead of a NameError.
    mobj = None
    if isinstance(pattern, (str, compat_str, compiled_regex_type)):
        mobj = re.search(pattern, string, flags)
    else:
        for p in pattern:
            mobj = re.search(p, string, flags)
            if mobj:
                break

    # Highlight the field name in blue on capable terminals.
    if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
        _name = '\033[0;34m%s\033[0m' % name
    else:
        _name = name

    if mobj:
        if group is None:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        else:
            return mobj.group(group)
    elif default is not _NO_DEFAULT:
        return default
    elif fatal:
        raise RegexNotFoundError('Unable to extract %s' % _name)
    else:
        self._downloader.report_warning('unable to extract %s; '
                                        'please report this issue on http://yt-dl.org/bug' % _name)
        return None
537
def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Like _search_regex, but strips HTML tags and unescapes entities.
    """
    res = self._search_regex(pattern, string, name, default, fatal, flags, group)
    if not res:
        # Pass through None / empty results untouched.
        return res
    return clean_html(res).strip()
547
def _get_login_info(self):
    """
    Get the login info as (username, password)
    It will look in the netrc file using the _NETRC_MACHINE value
    If there's no info available, return (None, None)
    """
    if self._downloader is None:
        return (None, None)

    username = None
    password = None
    downloader_params = self._downloader.params

    # Attempt to use provided username and password or .netrc data
    if downloader_params.get('username', None) is not None:
        username = downloader_params['username']
        password = downloader_params['password']
    elif downloader_params.get('usenetrc', False):
        try:
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                username = info[0]
                password = info[2]
            else:
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
        except (IOError, netrc.NetrcParseError) as err:
            # .netrc problems are not fatal; warn and fall through to (None, None).
            self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))

    return (username, password)
577
578 def _get_tfa_info(self):
579 """
580 Get the two-factor authentication info
581 TODO - asking the user will be required for sms/phone verify
582 currently just uses the command line option
583 If there's no info available, return None
584 """
585 if self._downloader is None:
586 return None
587 downloader_params = self._downloader.params
588
589 if downloader_params.get('twofactor', None) is not None:
590 return downloader_params['twofactor']
591
592 return None
593
594 # Helper functions for extracting OpenGraph info
595 @staticmethod
596 def _og_regexes(prop):
597 content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
598 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
599 template = r'<meta[^>]+?%s[^>]+?%s'
600 return [
601 template % (property_re, content_re),
602 template % (content_re, property_re),
603 ]
604
def _og_search_property(self, prop, html, name=None, **kargs):
    """Search html for OpenGraph property prop and unescape the result."""
    if name is None:
        name = 'OpenGraph %s' % prop
    escaped = self._search_regex(
        self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
    return None if escaped is None else unescapeHTML(escaped)
612
def _og_search_thumbnail(self, html, **kargs):
    # og:image; non-fatal since thumbnails are optional metadata.
    return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
615
def _og_search_description(self, html, **kargs):
    # og:description; non-fatal since descriptions are optional metadata.
    return self._og_search_property('description', html, fatal=False, **kargs)
618
def _og_search_title(self, html, **kargs):
    # og:title; fatal by default — every info dict requires a title.
    return self._og_search_property('title', html, **kargs)
621
def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
    """Find the og:video URL, preferring og:video:secure_url when secure."""
    regexes = []
    if secure:
        regexes.extend(self._og_regexes('video:secure_url'))
    regexes.extend(self._og_regexes('video'))
    regexes.extend(self._og_regexes('video:url'))
    return self._html_search_regex(regexes, html, name, **kargs)
627
def _og_search_url(self, html, **kargs):
    # og:url, the canonical URL of the page.
    return self._og_search_property('url', html, **kargs)
630
def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
    """Return the content attribute of a <meta> tag whose itemprop/name/
    property attribute equals `name`."""
    if display_name is None:
        display_name = name
    # Verbose (?x) regex: a lookahead finds the naming attribute anywhere
    # in the tag, so the attribute order does not matter; \1 and \2 match
    # whichever quote style (or none) the attribute used.
    return self._html_search_regex(
        r'''(?isx)<meta
                (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
                [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(name),
        html, display_name, fatal=fatal, group='content', **kwargs)
639
def _dc_search_uploader(self, html):
    # Dublin Core creator metadata.
    return self._html_search_meta('dc.creator', html, 'uploader')
642
643 def _rta_search(self, html):
644 # See http://www.rtalabel.org/index.php?content=howtofaq#single
645 if re.search(r'(?ix)<meta\s+name="rating"\s+'
646 r' content="RTA-5042-1996-1400-1577-RTA"',
647 html):
648 return 18
649 return 0
650
def _media_rating_search(self, html):
    """Translate a <meta name="rating"> value into an age limit, if known.

    See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
    """
    rating = self._html_search_meta('rating', html)
    if not rating:
        return None
    # Unknown rating strings map to None.
    return {
        'safe for kids': 0,
        'general': 8,
        '14 years': 14,
        'mature': 17,
        'restricted': 19,
    }.get(rating.lower(), None)
666
def _family_friendly_search(self, html):
    """Map schema.org isFamilyFriendly metadata to an age limit.

    See http://schema.org/VideoObject
    """
    family_friendly = self._html_search_meta('isFamilyFriendly', html)
    if not family_friendly:
        return None
    # schema.org booleans: friendly -> unrestricted, otherwise adults only.
    return {
        '1': 0,
        'true': 0,
        '0': 18,
        'false': 18,
    }.get(family_friendly.lower(), None)
681
def _twitter_search_player(self, html):
    """Return the Twitter card player URL, if present."""
    return self._html_search_meta('twitter:player', html,
                                  'twitter card player')
685
def _sort_formats(self, formats):
    """Sort formats in place from worst to best quality."""
    if not formats:
        raise ExtractorError('No video formats found')

    def _formats_key(f):
        # TODO remove the following workaround
        from ..utils import determine_ext
        if not f.get('ext') and 'url' in f:
            f['ext'] = determine_ext(f['url'])

        preference = f.get('preference')
        if preference is None:
            proto = f.get('protocol')
            if proto is None:
                proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

            # Plain HTTP(S) downloads are slightly preferred over other protocols.
            preference = 0 if proto in ['http', 'https'] else -0.1
            if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                preference -= 0.5

        if f.get('vcodec') == 'none':  # audio only
            if self._downloader.params.get('prefer_free_formats'):
                ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
            else:
                ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
            ext_preference = 0
            try:
                audio_ext_preference = ORDER.index(f['ext'])
            except ValueError:
                audio_ext_preference = -1
        else:
            if self._downloader.params.get('prefer_free_formats'):
                ORDER = ['flv', 'mp4', 'webm']
            else:
                ORDER = ['webm', 'flv', 'mp4']
            try:
                ext_preference = ORDER.index(f['ext'])
            except ValueError:
                ext_preference = -1
            audio_ext_preference = 0

        # Missing values become -1 so unknown fields rank below known ones;
        # tuple order defines the precedence of the criteria.
        return (
            preference,
            f.get('language_preference') if f.get('language_preference') is not None else -1,
            f.get('quality') if f.get('quality') is not None else -1,
            f.get('tbr') if f.get('tbr') is not None else -1,
            f.get('vbr') if f.get('vbr') is not None else -1,
            f.get('height') if f.get('height') is not None else -1,
            f.get('width') if f.get('width') is not None else -1,
            ext_preference,
            f.get('abr') if f.get('abr') is not None else -1,
            audio_ext_preference,
            f.get('fps') if f.get('fps') is not None else -1,
            f.get('filesize') if f.get('filesize') is not None else -1,
            f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
            f.get('source_preference') if f.get('source_preference') is not None else -1,
            f.get('format_id'),
        )
    formats.sort(key=_formats_key)
745
def _check_formats(self, formats, video_id):
    """Drop formats whose URLs do not respond; mutates formats in place."""
    if not formats:
        return
    formats[:] = [
        f for f in formats
        if self._is_valid_url(
            f['url'], video_id,
            item='%s video format' % f.get('format_id') if f.get('format_id') else 'video')
    ]
753
def _is_valid_url(self, url, video_id, item='video'):
    """Check that url responds to a HEAD request.

    Returns False (after a warning) on HTTP errors; any other
    ExtractorError is re-raised.
    """
    try:
        self._request_webpage(
            HEADRequest(url), video_id,
            'Checking %s URL' % item)
        return True
    except ExtractorError as e:
        if isinstance(e.cause, compat_HTTPError):
            self.report_warning(
                '%s URL is invalid, skipping' % item, video_id)
            return False
        raise
766
767 def http_scheme(self):
768 """ Either "http:" or "https:", depending on the user's preferences """
769 return (
770 'http:'
771 if self._downloader.params.get('prefer_insecure', False)
772 else 'https:')
773
774 def _proto_relative_url(self, url, scheme=None):
775 if url is None:
776 return url
777 if url.startswith('//'):
778 if scheme is None:
779 scheme = self.http_scheme()
780 return scheme + url
781 else:
782 return url
783
def _sleep(self, timeout, video_id, msg_template=None):
    """Sleep `timeout` seconds after printing a progress message."""
    if msg_template is None:
        msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
    msg = msg_template % {'video_id': video_id, 'timeout': timeout}
    self.to_screen(msg)
    time.sleep(timeout)
790
def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None):
    """Download an Adobe f4m manifest and build a sorted format list from
    its <media> nodes (manifest versions 1.0 and 2.0)."""
    manifest = self._download_xml(
        manifest_url, video_id, 'Downloading f4m manifest',
        'Unable to download f4m manifest')

    formats = []
    manifest_version = '1.0'
    media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
    if not media_nodes:
        # No 1.0 nodes: retry under the 2.0 namespace.
        manifest_version = '2.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
    for i, media_el in enumerate(media_nodes):
        if manifest_version == '2.0':
            # v2.0 media entries are href/url references relative to the
            # manifest's own directory.
            manifest_url = ('/'.join(manifest_url.split('/')[:-1]) + '/'
                            + (media_el.attrib.get('href') or media_el.attrib.get('url')))
        tbr = int_or_none(media_el.attrib.get('bitrate'))
        formats.append({
            'format_id': '-'.join(filter(None, [f4m_id, 'f4m-%d' % (i if tbr is None else tbr)])),
            'url': manifest_url,
            'ext': 'flv',
            'tbr': tbr,
            'width': int_or_none(media_el.attrib.get('width')),
            'height': int_or_none(media_el.attrib.get('height')),
            'preference': preference,
        })
    self._sort_formats(formats)

    return formats
819
def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                          entry_protocol='m3u8', preference=None,
                          m3u8_id=None):
    """Download an m3u8 master playlist and build formats from its
    #EXT-X-STREAM-INF variant entries."""

    # A meta format pointing at the master playlist itself, so quality
    # selection can be delegated to the downloader.
    formats = [{
        'format_id': '-'.join(filter(None, [m3u8_id, 'm3u8-meta'])),
        'url': m3u8_url,
        'ext': ext,
        'protocol': 'm3u8',
        'preference': -1,
        'resolution': 'multiple',
        'format_note': 'Quality selection URL',
    }]

    # Resolve playlist-relative entries against the playlist URL.
    format_url = lambda u: (
        u
        if re.match(r'^https?://', u)
        else compat_urlparse.urljoin(m3u8_url, u))

    m3u8_doc = self._download_webpage(
        m3u8_url, video_id,
        note='Downloading m3u8 information',
        errnote='Failed to download m3u8 information')
    last_info = None
    # KEY=VALUE attribute pairs of an #EXT-X-STREAM-INF line (values may
    # be double-quoted).
    kv_rex = re.compile(
        r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
    for line in m3u8_doc.splitlines():
        if line.startswith('#EXT-X-STREAM-INF:'):
            last_info = {}
            for m in kv_rex.finditer(line):
                v = m.group('val')
                if v.startswith('"'):
                    v = v[1:-1]
                last_info[m.group('key')] = v
        elif line.startswith('#') or not line.strip():
            # Other tags and blank lines carry no format information.
            continue
        else:
            if last_info is None:
                # Bare URL with no preceding stream-info tag.
                formats.append({'url': format_url(line)})
                continue
            tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
            f = {
                'format_id': '-'.join(filter(None, [m3u8_id, 'm3u8-%d' % (tbr if tbr else len(formats))])),
                'url': format_url(line.strip()),
                'tbr': tbr,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }
            codecs = last_info.get('CODECS')
            if codecs:
                # TODO: looks like video codec is not always necessarily goes first
                va_codecs = codecs.split(',')
                if va_codecs[0]:
                    f['vcodec'] = va_codecs[0].partition('.')[0]
                if len(va_codecs) > 1 and va_codecs[1]:
                    f['acodec'] = va_codecs[1].partition('.')[0]
            resolution = last_info.get('RESOLUTION')
            if resolution:
                width_str, height_str = resolution.split('x')
                f['width'] = int(width_str)
                f['height'] = int(height_str)
            formats.append(f)
            last_info = {}
    self._sort_formats(formats)
    return formats
886
# TODO: improve extraction
def _extract_smil_formats(self, smil_url, video_id, fatal=True):
    """Download a SMIL file and build a sorted format list from its
    <video> switch entries (m3u8 and RTMP variants)."""
    smil = self._download_xml(
        smil_url, video_id, 'Downloading SMIL file',
        'Unable to download SMIL file', fatal=fatal)
    if smil is False:
        assert not fatal
        return []

    base = smil.find('./head/meta').get('base')

    formats = []
    rtmp_count = 0
    for video in smil.findall('./body/switch/video'):
        src = video.get('src')
        if not src:
            continue
        bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
        width = int_or_none(video.get('width'))
        height = int_or_none(video.get('height'))
        proto = video.get('proto')
        if not proto:
            # Infer the protocol from the base URL when not explicit.
            if base:
                if base.startswith('rtmp'):
                    proto = 'rtmp'
                elif base.startswith('http'):
                    proto = 'http'
        ext = video.get('ext')
        if proto == 'm3u8':
            formats.extend(self._extract_m3u8_formats(src, video_id, ext))
        elif proto == 'rtmp':
            rtmp_count += 1
            streamer = video.get('streamer') or base
            formats.append({
                'url': streamer,
                'play_path': src,
                'ext': 'flv',
                'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                'tbr': bitrate,
                'width': width,
                'height': height,
            })
    self._sort_formats(formats)

    return formats
932
933 def _live_title(self, name):
934 """ Generate the title for a live video """
935 now = datetime.datetime.now()
936 now_str = now.strftime("%Y-%m-%d %H:%M")
937 return name + ' ' + now_str
938
939 def _int(self, v, name, fatal=False, **kwargs):
940 res = int_or_none(v, **kwargs)
941 if 'get_attr' in kwargs:
942 print(getattr(v, kwargs['get_attr']))
943 if res is None:
944 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
945 if fatal:
946 raise ExtractorError(msg)
947 else:
948 self._downloader.report_warning(msg)
949 return res
950
951 def _float(self, v, name, fatal=False, **kwargs):
952 res = float_or_none(v, **kwargs)
953 if res is None:
954 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
955 if fatal:
956 raise ExtractorError(msg)
957 else:
958 self._downloader.report_warning(msg)
959 return res
960
961 def _set_cookie(self, domain, name, value, expire_time=None):
962 cookie = compat_cookiejar.Cookie(
963 0, name, value, None, None, domain, None,
964 None, '/', True, False, expire_time, '', None, None, None)
965 self._downloader.cookiejar.set_cookie(cookie)
966
967 def get_testcases(self, include_onlymatching=False):
968 t = getattr(self, '_TEST', None)
969 if t:
970 assert not hasattr(self, '_TESTS'), \
971 '%s has _TEST and _TESTS' % type(self).__name__
972 tests = [t]
973 else:
974 tests = getattr(self, '_TESTS', [])
975 for t in tests:
976 if not include_onlymatching and t.get('only_matching', False):
977 continue
978 t['name'] = type(self).__name__[:-len('IE')]
979 yield t
980
981 def is_suitable(self, age_limit):
982 """ Test whether the extractor is generally suitable for the given
983 age limit (i.e. pornographic sites are not, all others usually are) """
984
985 any_restricted = False
986 for tc in self.get_testcases(include_onlymatching=False):
987 if 'playlist' in tc:
988 tc = tc['playlist'][0]
989 is_restricted = age_restricted(
990 tc.get('info_dict', {}).get('age_limit'), age_limit)
991 if not is_restricted:
992 return True
993 any_restricted = any_restricted or is_restricted
994 return not any_restricted
995
996
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Subclasses must define _SEARCH_KEY and _MAX_RESULTS and implement
    _get_n_results.
    """

    @classmethod
    def _make_valid_url(cls):
        # Prefix is empty (one result), a positive integer, or 'all'
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Whether url matches this extractor's search scheme."""
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        # No prefix: a single result; 'all': as many as allowed
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            # Clamp oversized requests to the extractor's maximum
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY