]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/common.py
[gamespot] Modernize
[yt-dlp.git] / youtube_dl / extractor / common.py
CommitLineData
f1a9d64e
PH
1from __future__ import unicode_literals
2
d6983cb4 3import base64
f4b1c7ad 4import datetime
3ec05685 5import hashlib
3d3538e4 6import json
4094b6e3 7import netrc
d6983cb4
PH
8import os
9import re
10import socket
11import sys
4094b6e3 12import time
267ed0c5 13import xml.etree.ElementTree
d6983cb4 14
8c25f81b 15from ..compat import (
d6983cb4
PH
16 compat_http_client,
17 compat_urllib_error,
c7deaa4c 18 compat_urllib_parse_urlparse,
f0b5d6af 19 compat_urlparse,
d6983cb4 20 compat_str,
8c25f81b
PH
21)
22from ..utils import (
d6983cb4
PH
23 clean_html,
24 compiled_regex_type,
25 ExtractorError,
b14f3a4c 26 float_or_none,
31bb8d3f 27 int_or_none,
55b3e45b 28 RegexNotFoundError,
d41e6efc 29 sanitize_filename,
f38de77f 30 unescapeHTML,
d6983cb4 31)
# Unique sentinel used as the default for ``default=`` parameters, so that
# an explicit ``default=None`` can be told apart from "no default given".
_NO_DEFAULT = object()
d6983cb4 33
dca08720 34
d6983cb4
PH
35class InfoExtractor(object):
36 """Information Extractor class.
37
38 Information extractors are the classes that, given a URL, extract
39 information about the video (or videos) the URL refers to. This
40 information includes the real video URL, the video title, author and
41 others. The information is stored in a dictionary which is then
42 passed to the FileDownloader. The FileDownloader processes this
43 information possibly downloading the video to the file system, among
44 other possible outcomes.
45
46 The dictionaries must include the following fields:
47
48 id: Video identifier.
d6983cb4 49 title: Video title, unescaped.
d67b0b15 50
f49d89ee 51 Additionally, it must contain either a formats entry or a url one:
d67b0b15 52
f49d89ee
PH
53 formats: A list of dictionaries for each format available, ordered
54 from worst to best quality.
55
56 Potential fields:
d67b0b15
PH
57 * url Mandatory. The URL of the video file
58 * ext Will be calculated from url if missing
59 * format A human-readable description of the format
60 ("mp4 container with h264/opus").
61 Calculated from the format_id, width, height,
62 and format_note fields if missing.
63 * format_id A short description of the format
5d4f3985
PH
64 ("mp4_h264_opus" or "19").
65 Technically optional, but strongly recommended.
d67b0b15
PH
66 * format_note Additional info about the format
67 ("3D" or "DASH video")
68 * width Width of the video, if known
69 * height Height of the video, if known
f49d89ee 70 * resolution Textual description of width and height
7217e148 71 * tbr Average bitrate of audio and video in KBit/s
d67b0b15
PH
72 * abr Average audio bitrate in KBit/s
73 * acodec Name of the audio codec in use
dd27fd17 74 * asr Audio sampling rate in Hertz
d67b0b15 75 * vbr Average video bitrate in KBit/s
fbb21cf5 76 * fps Frame rate
d67b0b15 77 * vcodec Name of the video codec in use
1394ce65 78 * container Name of the container format
d67b0b15 79 * filesize The number of bytes, if known in advance
9732d77e 80 * filesize_approx An estimate for the number of bytes
d67b0b15 81 * player_url SWF Player URL (used for rtmpdump).
c7deaa4c
PH
82 * protocol The protocol that will be used for the actual
83 download, lower-case.
db1f3888 84 "http", "https", "rtsp", "rtmp", "m3u8" or so.
f49d89ee 85 * preference Order number of this format. If this field is
08d13955 86 present and not None, the formats get sorted
38d63d84 87 by this field, regardless of all other values.
f49d89ee
PH
88 -1 for default (order by other properties),
89 -2 or smaller for less than default.
5d73273f
PH
90 * quality Order number of the video quality of this
91 format, irrespective of the file format.
92 -1 for default (order by other properties),
93 -2 or smaller for less than default.
c64ed2a3
PH
94 * source_preference Order number for this video source
95 (quality takes higher priority)
96 -1 for default (order by other properties),
97 -2 or smaller for less than default.
d769be6c
PH
98 * http_referer HTTP Referer header value to set.
99 * http_method HTTP method to use for the download.
100 * http_headers A dictionary of additional HTTP headers
101 to add to the request.
102 * http_post_data Additional data to send with a POST
103 request.
c0ba0f48 104 url: Final video URL.
d6983cb4 105 ext: Video filename extension.
d67b0b15
PH
106 format: The video format, defaults to ext (used for --get-format)
107 player_url: SWF Player URL (used for rtmpdump).
2f5865cc 108
d6983cb4
PH
109 The following fields are optional:
110
0afef30b
PH
111 display_id An alternative identifier for the video, not necessarily
112 unique, but available before title. Typically, id is
113 something like "4234987", title "Dancing naked mole rats",
114 and display_id "dancing-naked-mole-rats"
d5519808
PH
115 thumbnails: A list of dictionaries, with the following entries:
116 * "url"
117 * "width" (optional, int)
118 * "height" (optional, int)
119 * "resolution" (optional, string "{width}x{height}",
120 deprecated)
d6983cb4
PH
121 thumbnail: Full URL to a video thumbnail image.
122 description: One-line video description.
123 uploader: Full name of the video uploader.
955c4514 124 timestamp: UNIX timestamp of the moment the video became available.
d6983cb4 125 upload_date: Video upload date (YYYYMMDD).
955c4514 126 If not explicitly set, calculated from timestamp.
d6983cb4 127 uploader_id: Nickname or id of the video uploader.
da9ec3b9 128 location: Physical location where the video was filmed.
5d51a883
JMF
129 subtitles: The subtitle file contents as a dictionary in the format
130 {language: subtitles}.
c0ba0f48 131 duration: Length of the video in seconds, as an integer.
f3d29461 132 view_count: How many users have watched the video on the platform.
19e3dfc9
PH
133 like_count: Number of positive ratings of the video
134 dislike_count: Number of negative ratings of the video
135 comment_count: Number of comments on the video
8dbe9899 136 age_limit: Age restriction for the video, as an integer (years)
9103bbc5
JMF
137 webpage_url: The url to the video webpage, if given to youtube-dl it
138 should allow to get the same result again. (It will be set
139 by YoutubeDL if it's missing)
ad3bc6ac
PH
140 categories: A list of categories that the video falls in, for example
141 ["Sports", "Berlin"]
7267bd53
PH
142 is_live: True, False, or None (=unknown). Whether this video is a
143 live stream that goes on instead of a fixed-length video.
d6983cb4 144
deefc05b 145 Unless mentioned otherwise, the fields should be Unicode strings.
d6983cb4 146
d838b1bd
PH
147 Unless mentioned otherwise, None is equivalent to absence of information.
148
d6983cb4
PH
149 Subclasses of this one should re-define the _real_initialize() and
150 _real_extract() methods and define a _VALID_URL regexp.
151 Probably, they should also be added to the list of extractors.
152
d6983cb4
PH
153 Finally, the _WORKING attribute should be set to False for broken IEs
154 in order to warn the users and skip the tests.
155 """
156
157 _ready = False
158 _downloader = None
159 _WORKING = True
160
def __init__(self, downloader=None):
    """Create a not-yet-initialized extractor.

    downloader (optional) is handed to set_downloader(); the instance is
    marked as not ready so that initialize() runs the real setup later.
    """
    self._ready = False
    self.set_downloader(downloader)
165
@classmethod
def suitable(cls, url):
    """Return True if url matches this extractor's _VALID_URL pattern."""
    # The class' own namespace is inspected on purpose (no hasattr/getattr):
    # the compiled pattern must be cached per class, and an attribute lookup
    # would also find a pattern cached on a superclass.
    if '_VALID_URL_RE' not in vars(cls):
        cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    matched = cls._VALID_URL_RE.match(url)
    return matched is not None
d6983cb4 176
ed9266db
PH
@classmethod
def _match_id(cls, url):
    """Return the 'id' group obtained by matching url against _VALID_URL.

    The compiled pattern is cached on the class itself. The URL is
    asserted to match.
    """
    if '_VALID_URL_RE' not in vars(cls):
        cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    match = cls._VALID_URL_RE.match(url)
    assert match
    return match.group('id')
184
d6983cb4
PH
@classmethod
def working(cls):
    """Return the value of the class' _WORKING flag."""
    flag = cls._WORKING
    return flag
189
def initialize(self):
    """Run _real_initialize() (authentication, etc) once per instance."""
    if self._ready:
        return
    self._real_initialize()
    self._ready = True
195
def extract(self, url):
    """Initialize the extractor if needed, then return _real_extract(url)."""
    self.initialize()
    info = self._real_extract(url)
    return info
200
def set_downloader(self, downloader):
    """Attach downloader to this IE (stored as self._downloader)."""
    self._downloader = downloader
204
def _real_initialize(self):
    """Hook for the real initialization; does nothing here.

    Subclasses redefine it.
    """
208
def _real_extract(self, url):
    """Hook for the real extraction; does nothing here and returns None.

    Subclasses redefine it.
    """
212
56c73665
JMF
@classmethod
def ie_key(cls):
    """A string for getting the InfoExtractor with get_info_extractor.

    It is the class name with its last two characters removed.
    """
    class_name = cls.__name__
    return class_name[:-2]
217
d6983cb4
PH
@property
def IE_NAME(self):
    """Name of the instance's class with its last two characters removed."""
    class_name = type(self).__name__
    return class_name[:-2]
221
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """ Returns the response handle, or False on a non-raising failure.

    note: None prints the standard "Downloading webpage" message, False
        prints nothing, anything else is printed (prefixed with video_id
        unless video_id is None).
    errnote: False makes a failure return False silently; None selects
        the default error text.
    fatal: when true a failure raises ExtractorError, otherwise a warning
        is reported and False is returned.
    """
    if note is None:
        self.report_download_webpage(video_id)
    elif note is not False:
        if video_id is not None:
            message = '%s: %s' % (video_id, note)
        else:
            message = '%s' % (note,)
        self.to_screen(message)

    try:
        return self._downloader.urlopen(url_or_request)
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        if errnote is False:
            return False
        reason = 'Unable to download webpage' if errnote is None else errnote
        errmsg = '%s: %s' % (reason, compat_str(err))
        if not fatal:
            self._downloader.report_warning(errmsg)
            return False
        raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
d6983cb4 244
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """ Returns a tuple (page content as string, URL handle).

    Returns False instead when _request_webpage() reports a failure by
    returning False.
    """
    # Strip hashes from the URL (#1038)
    if isinstance(url_or_request, (compat_str, str)):
        url_or_request = url_or_request.partition('#')[0]

    urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
    if urlh is False:
        # _request_webpage() also returns False when errnote is False,
        # regardless of fatal, so the failure is simply propagated here
        # (asserting "not fatal" would blow up in that case, and would be
        # skipped altogether under python -O).
        return False
    content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)
    return (content, urlh)
257
def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """Read the body of urlh and return it decoded to text.

    The encoding is taken from the Content-Type header, else from a
    <meta ... charset=...> tag within the first 1024 bytes, else guessed
    from a UTF-16 byte order mark, else assumed to be UTF-8 (also used
    when the announced encoding is unknown to Python).

    Depending on the downloader params 'dump_intermediate_pages' and
    'write_pages', the raw bytes are printed base64-encoded and/or saved
    to a .dump file.

    Raises ExtractorError if the page is a Websense block page.
    """
    def _full_url():
        # url_or_request is either a request object or a plain URL string
        try:
            return url_or_request.get_full_url()
        except AttributeError:
            return url_or_request

    content_type = urlh.headers.get('Content-Type', '')
    webpage_bytes = urlh.read()

    header_match = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
    if header_match:
        encoding = header_match.group(1)
    else:
        meta_match = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                               webpage_bytes[:1024])
        if meta_match:
            encoding = meta_match.group(1).decode('ascii')
        elif webpage_bytes.startswith(b'\xff\xfe'):
            encoding = 'utf-16'
        else:
            encoding = 'utf-8'

    if self._downloader.params.get('dump_intermediate_pages', False):
        self.to_screen('Dumping request to ' + _full_url())
        self._downloader.to_screen(base64.b64encode(webpage_bytes).decode('ascii'))

    if self._downloader.params.get('write_pages', False):
        basen = '%s_%s' % (video_id, _full_url())
        if len(basen) > 240:
            # Keep the name short but unique by appending a digest of the full name
            h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
            basen = basen[:240 - len(h)] + h
        filename = sanitize_filename(basen + '.dump', restricted=True)
        self.to_screen('Saving request to ' + filename)
        # Working around MAX_PATH limitation on Windows (see
        # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
        if os.name == 'nt':
            absfilepath = os.path.abspath(filename)
            if len(absfilepath) > 259:
                filename = '\\\\?\\' + absfilepath
        with open(filename, 'wb') as outf:
            outf.write(webpage_bytes)

    try:
        content = webpage_bytes.decode(encoding, 'replace')
    except LookupError:
        # Unknown encoding name
        content = webpage_bytes.decode('utf-8', 'replace')

    blocked = ('<title>Access to this site is blocked</title>' in content and
               'Websense' in content[:512])
    if blocked:
        msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
        blocked_iframe = self._html_search_regex(
            r'<iframe src="([^"]+)"', content,
            'Websense information URL', default=None)
        if blocked_iframe:
            msg += ' Visit %s for more details' % blocked_iframe
        raise ExtractorError(msg, expected=True)

    return content
d6983cb4 318
def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """ Returns the data of the page as a string.

    A False result from _download_webpage_handle() is passed through.
    """
    handle_result = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
    if handle_result is not False:
        content, _ = handle_result
        return content
    return handle_result
d6983cb4 327
def _download_xml(self, url_or_request, video_id,
                  note='Downloading XML', errnote='Unable to download XML',
                  transform_source=None, fatal=True):
    """Return the xml as an xml.etree.ElementTree.Element

    If the download yields False, False is returned. transform_source,
    when given, is applied to the downloaded text before parsing.
    """
    document = self._download_webpage(
        url_or_request, video_id, note, errnote, fatal=fatal)
    if document is False:
        return document
    if transform_source:
        document = transform_source(document)
    encoded = document.encode('utf-8')
    return xml.etree.ElementTree.fromstring(encoded)
339
def _download_json(self, url_or_request, video_id,
                   note='Downloading JSON metadata',
                   errnote='Unable to download JSON metadata',
                   transform_source=None,
                   fatal=True):
    """Download a page and return it parsed as JSON.

    transform_source, when given, is applied to the downloaded text
    before parsing.

    Returns None if the download yielded False, or if parsing failed and
    fatal is false (a warning is reported then). Raises ExtractorError if
    parsing failed and fatal is true.
    """
    json_string = self._download_webpage(
        url_or_request, video_id, note, errnote, fatal=fatal)
    if json_string is False:
        # The download layer hands back False for a failure it did not
        # raise on; never feed that to the JSON parser (TypeError).
        return None
    if transform_source:
        json_string = transform_source(json_string)
    try:
        return json.loads(json_string)
    except ValueError as ve:
        errmsg = '%s: Failed to parse JSON ' % video_id
        if fatal:
            raise ExtractorError(errmsg, cause=ve)
        else:
            self.report_warning(errmsg + str(ve))
3d3538e4 359
def report_warning(self, msg, video_id=None):
    """Report msg through the downloader as a warning.

    The message is prefixed with '[ie_name]' and, if video_id is given,
    with 'video_id: '.
    """
    if video_id is None:
        idstr = ''
    else:
        idstr = '%s: ' % video_id
    warning = '[%s] %s%s' % (self.IE_NAME, idstr, msg)
    self._downloader.report_warning(warning)
f45f96f8 364
d6983cb4
PH
def to_screen(self, msg):
    """Print msg to screen, prefixing it with '[ie_name]'"""
    line = '[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(line)
d6983cb4
PH
368
def report_extraction(self, id_or_name):
    """Report information extraction."""
    message = '%s: Extracting information' % id_or_name
    self.to_screen(message)
d6983cb4
PH
372
def report_download_webpage(self, video_id):
    """Report webpage download."""
    message = '%s: Downloading webpage' % video_id
    self.to_screen(message)
d6983cb4
PH
376
def report_age_confirmation(self):
    """Report attempt to confirm age."""
    message = 'Confirming age'
    self.to_screen(message)
d6983cb4 380
fc79158d
JMF
def report_login(self):
    """Report attempt to log in."""
    message = 'Logging in'
    self.to_screen(message)
fc79158d 384
d6983cb4 385 #Methods for following #608
c0d0b01f
JMF
@staticmethod
def url_result(url, ie=None, video_id=None):
    """Returns a url that points to a page that should be processed

    The result is a dict of type 'url' carrying url and ie (as 'ie_key'),
    plus 'id' when video_id is not None.
    """
    # TODO: ie should be the class used for getting the info
    result = {
        '_type': 'url',
        'url': url,
        'ie_key': ie,
    }
    if video_id is not None:
        result['id'] = video_id
    return result
c0d0b01f
JMF
@staticmethod
def playlist_result(entries, playlist_id=None, playlist_title=None):
    """Returns a playlist

    The result is a dict of type 'playlist' holding entries; 'id' and
    'title' are only included when the respective argument is truthy.
    """
    result = {
        '_type': 'playlist',
        'entries': entries,
    }
    optional_fields = (('id', playlist_id), ('title', playlist_title))
    for key, value in optional_fields:
        if value:
            result[key] = value
    return result
406
def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.
    """
    if isinstance(pattern, (str, compat_str, compiled_regex_type)):
        mobj = re.search(pattern, string, flags)
    else:
        # Start from "no match" so that an empty list of patterns is
        # handled like any other failed search instead of leaving mobj
        # unbound.
        mobj = None
        for p in pattern:
            mobj = re.search(p, string, flags)
            if mobj:
                break

    # Highlight the field name when writing to a terminal (not on Windows)
    if os.name != 'nt' and sys.stderr.isatty():
        _name = '\033[0;34m%s\033[0m' % name
    else:
        _name = name

    if mobj:
        # return the first matching group
        return next(g for g in mobj.groups() if g is not None)
    elif default is not _NO_DEFAULT:
        return default
    elif fatal:
        raise RegexNotFoundError('Unable to extract %s' % _name)
    else:
        self._downloader.report_warning('unable to extract %s; '
            'please report this issue on http://yt-dl.org/bug' % _name)
        return None
438
def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
    """
    Like _search_regex, but strips HTML tags and unescapes entities.
    A falsy search result is returned unchanged.
    """
    found = self._search_regex(pattern, string, name, default, fatal, flags)
    if not found:
        return found
    return clean_html(found).strip()
448
fc79158d
JMF
def _get_login_info(self):
    """
    Get the login info as (username, password)
    It will look in the netrc file using the _NETRC_MACHINE value
    If there's no info available, return (None, None)
    """
    if self._downloader is None:
        return (None, None)

    username = None
    password = None
    downloader_params = self._downloader.params

    # Attempt to use provided username and password or .netrc data
    if downloader_params.get('username', None) is not None:
        username = downloader_params['username']
        # The params may carry a username without a 'password' entry;
        # report the password as None rather than failing with KeyError.
        password = downloader_params.get('password')
    elif downloader_params.get('usenetrc', False):
        try:
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                username = info[0]
                password = info[2]
            else:
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
        except (IOError, netrc.NetrcParseError) as err:
            self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))

    return (username, password)
478
def _get_tfa_info(self):
    """
    Get the two-factor authentication info
    TODO - asking the user will be required for sms/phone verify
    currently just uses the command line option
    If there's no info available, return None
    """
    downloader = self._downloader
    if downloader is None:
        return None

    params = downloader.params
    if params.get('twofactor', None) is None:
        return None
    return params['twofactor']
494
46720279
JMF
495 # Helper functions for extracting OpenGraph info
@staticmethod
def _og_regexes(prop):
    """Return two regexes locating the content of the og:<prop> meta tag.

    One covers the property attribute coming before content=, the other
    the reverse order. The content value is captured by group 1
    (double-quoted) or group 2 (single-quoted).
    """
    content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
    property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
    orderings = ((property_re, content_re), (content_re, property_re))
    return [r'<meta[^>]+?%s[^>]+?%s' % pair for pair in orderings]
46720279 505
def _og_search_property(self, prop, html, name=None, **kargs):
    """Search html for the OpenGraph property prop.

    name defaults to 'OpenGraph <prop>'; extra keyword arguments go to
    _search_regex. The found value is returned HTML-unescaped, None is
    returned as is.
    """
    label = 'OpenGraph %s' % prop if name is None else name
    escaped = self._search_regex(self._og_regexes(prop), html, label, flags=re.DOTALL, **kargs)
    return None if escaped is None else unescapeHTML(escaped)
46720279
JMF
513
def _og_search_thumbnail(self, html, **kargs):
    """Non-fatal lookup of the OpenGraph 'image' property in html."""
    thumbnail = self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
    return thumbnail
46720279
JMF
516
def _og_search_description(self, html, **kargs):
    """Non-fatal lookup of the OpenGraph 'description' property in html."""
    description = self._og_search_property('description', html, fatal=False, **kargs)
    return description
519
def _og_search_title(self, html, **kargs):
    """Look up the OpenGraph 'title' property in html."""
    title = self._og_search_property('title', html, **kargs)
    return title
522
def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
    """Search html for the OpenGraph video URL.

    The 'video' and 'video:url' properties are tried in that order; with
    secure set, 'video:secure_url' is tried before them.
    """
    candidates = self._og_regexes('video') + self._og_regexes('video:url')
    if secure:
        candidates = self._og_regexes('video:secure_url') + candidates
    return self._html_search_regex(candidates, html, name, **kargs)
46720279 528
78338f71
JMF
def _og_search_url(self, html, **kargs):
    """Look up the OpenGraph 'url' property in html."""
    og_url = self._og_search_property('url', html, **kargs)
    return og_url
531
def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
    """Search html for the content of the <meta> tag called name.

    The tag is identified by its itemprop, name or property attribute.
    display_name (defaulting to name) is what the search is reported as;
    fatal and any extra keyword arguments go to _html_search_regex.
    """
    label = name if display_name is None else display_name
    # The lookahead lets the identifying attribute appear before or after
    # the content attribute.
    meta_re = r'''(?ix)<meta
        (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
        [^>]+content=["\']([^"\']+)["\']''' % re.escape(name)
    return self._html_search_regex(meta_re, html, label, fatal=fatal, **kwargs)
59040888
PH
540
def _dc_search_uploader(self, html):
    """Look up the 'dc.creator' meta tag in html, reported as 'uploader'."""
    uploader = self._html_search_meta('dc.creator', html, 'uploader')
    return uploader
543
8dbe9899
PH
def _rta_search(self, html):
    """Return 18 if html carries the RTA rating meta tag, else 0."""
    # See http://www.rtalabel.org/index.php?content=howtofaq#single
    rta_label = re.search(
        r'(?ix)<meta\s+name="rating"\s+'
        r'     content="RTA-5042-1996-1400-1577-RTA"',
        html)
    return 18 if rta_label else 0
551
59040888
PH
def _media_rating_search(self, html):
    """Map the 'rating' meta tag of html to a number.

    Returns None when there is no rating or the rating is not one of the
    known labels (compared case-insensitively).
    """
    # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
    rating = self._html_search_meta('rating', html)
    if rating:
        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower(), None)
    return None
567
0c708f11
JMF
def _twitter_search_player(self, html):
    """Look up the 'twitter:player' meta tag in html."""
    player = self._html_search_meta(
        'twitter:player', html, 'twitter card player')
    return player
571
def _sort_formats(self, formats):
    """Sort the list formats in place, from worst to best.

    The sort key is built from, in order: preference, quality, height,
    width, extension preference, tbr, vbr, abr, audio extension
    preference, fps, filesize, filesize_approx, source_preference and
    format_id. Missing numeric fields count as -1 and a missing
    format_id as '', so that all keys stay comparable with each other.

    A format lacking 'ext' gets it filled in from its 'url'.

    Raises ExtractorError if formats is empty.
    """
    if not formats:
        raise ExtractorError('No video formats found')

    # The user preference does not change while sorting, so the extension
    # rankings are picked once instead of once per format.
    if self._downloader.params.get('prefer_free_formats'):
        audio_order = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
        video_order = ['flv', 'mp4', 'webm']
    else:
        audio_order = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
        video_order = ['webm', 'flv', 'mp4']

    def _rank(order, ext):
        # Position of ext in order; unknown (or missing) extensions rank lowest
        try:
            return order.index(ext)
        except ValueError:
            return -1

    def _known(value, fallback=-1):
        return fallback if value is None else value

    def _formats_key(f):
        # TODO remove the following workaround
        if not f.get('ext') and 'url' in f:
            from ..utils import determine_ext
            f['ext'] = determine_ext(f['url'])
        # A format with neither 'ext' nor 'url' has no extension at all
        ext = f.get('ext')

        preference = f.get('preference')
        if preference is None:
            proto = f.get('protocol')
            if proto is None:
                proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

            preference = 0 if proto in ['http', 'https'] else -0.1
            if ext in ['f4f', 'f4m']:  # Not yet supported
                preference -= 0.5

        if f.get('vcodec') == 'none':  # audio only
            ext_preference = 0
            audio_ext_preference = _rank(audio_order, ext)
        else:
            ext_preference = _rank(video_order, ext)
            audio_ext_preference = 0

        return (
            preference,
            _known(f.get('quality')),
            _known(f.get('height')),
            _known(f.get('width')),
            ext_preference,
            _known(f.get('tbr')),
            _known(f.get('vbr')),
            _known(f.get('abr')),
            audio_ext_preference,
            _known(f.get('fps')),
            _known(f.get('filesize')),
            _known(f.get('filesize_approx')),
            _known(f.get('source_preference')),
            # None cannot be ordered against a string on Python 3
            _known(f.get('format_id'), ''),
        )
    formats.sort(key=_formats_key)
59040888 630
def http_scheme(self):
    """ Either "http:" or "https:", depending on the user's preferences """
    if self._downloader.params.get('prefer_insecure', False):
        return 'http:'
    return 'https:'
637
57c7411f
PH
def _proto_relative_url(self, url, scheme=None):
    """Turn a protocol-relative URL ('//host/...') into an absolute one.

    scheme defaults to http_scheme(). Any other URL, as well as None, is
    returned unchanged.
    """
    if url is None or not url.startswith('//'):
        return url
    if scheme is None:
        scheme = self.http_scheme()
    return scheme + url
647
4094b6e3
PH
def _sleep(self, timeout, video_id, msg_template=None):
    """Announce a wait on screen, then sleep for timeout seconds.

    msg_template may use the %(video_id)s and %(timeout)s placeholders.
    """
    template = msg_template
    if template is None:
        template = '%(video_id)s: Waiting for %(timeout)s seconds'
    self.to_screen(template % {'video_id': video_id, 'timeout': timeout})
    time.sleep(timeout)
654
def _extract_f4m_formats(self, manifest_url, video_id):
    """Download the f4m manifest and return its formats, sorted.

    One format is produced per <media> element; each of them points at
    manifest_url itself and is named after its bitrate (or after its
    position when the bitrate is unknown).
    """
    manifest = self._download_xml(
        manifest_url, video_id, 'Downloading f4m manifest',
        'Unable to download f4m manifest')

    formats = []
    media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
    for index, media_el in enumerate(media_nodes):
        attrs = media_el.attrib
        tbr = int_or_none(attrs.get('bitrate'))
        entry = {
            'format_id': 'f4m-%d' % (index if tbr is None else tbr),
            'url': manifest_url,
            'ext': 'flv',
            'tbr': tbr,
            'width': int_or_none(attrs.get('width')),
            'height': int_or_none(attrs.get('height')),
        }
        formats.append(entry)
    self._sort_formats(formats)

    return formats
676
f0b5d6af
PH
def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                          entry_protocol='m3u8', preference=None):
    """Download the m3u8 playlist and return its formats, sorted.

    The list always holds an 'm3u8-meta' format pointing at m3u8_url
    itself. Every URI line of the playlist adds one more format; lines
    following an #EXT-X-STREAM-INF tag also get bitrate, codecs and
    resolution from that tag, together with ext, entry_protocol (as
    'protocol') and preference.
    """
    def _absolute(candidate):
        # http(s) URLs are used as they are, anything else is resolved
        # against the playlist URL
        if re.match(r'^https?://', candidate):
            return candidate
        return compat_urlparse.urljoin(m3u8_url, candidate)

    formats = [{
        'format_id': 'm3u8-meta',
        'url': m3u8_url,
        'ext': ext,
        'protocol': 'm3u8',
        'preference': -1,
        'resolution': 'multiple',
        'format_note': 'Quality selection URL',
    }]

    m3u8_doc = self._download_webpage(
        m3u8_url, video_id,
        note='Downloading m3u8 information',
        errnote='Failed to download m3u8 information')

    attr_re = re.compile(
        r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
    # Attributes of the most recent #EXT-X-STREAM-INF tag; None until the
    # first such tag has been seen
    stream_info = None
    for line in m3u8_doc.splitlines():
        if line.startswith('#EXT-X-STREAM-INF:'):
            stream_info = {}
            for attr in attr_re.finditer(line):
                value = attr.group('val')
                if value.startswith('"'):
                    value = value[1:-1]
                stream_info[attr.group('key')] = value
            continue
        if line.startswith('#') or not line.strip():
            continue
        if stream_info is None:
            formats.append({'url': _absolute(line)})
            continue

        tbr = int_or_none(stream_info.get('BANDWIDTH'), scale=1000)
        entry = {
            'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
            'url': _absolute(line.strip()),
            'tbr': tbr,
            'ext': ext,
            'protocol': entry_protocol,
            'preference': preference,
        }
        codecs = stream_info.get('CODECS')
        if codecs:
            # TODO: looks like video codec is not always necessarily goes first
            va_codecs = codecs.split(',')
            if va_codecs[0]:
                entry['vcodec'] = va_codecs[0].partition('.')[0]
            if len(va_codecs) > 1 and va_codecs[1]:
                entry['acodec'] = va_codecs[1].partition('.')[0]
        resolution = stream_info.get('RESOLUTION')
        if resolution:
            width_str, height_str = resolution.split('x')
            entry['width'] = int(width_str)
            entry['height'] = int(height_str)
        formats.append(entry)
        stream_info = {}
    self._sort_formats(formats)
    return formats
743
f4b1c7ad
PH
def _live_title(self, name):
    """ Generate the title for a live video

    It is name followed by the current local date and time (to the minute).
    """
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
    return name + ' ' + timestamp
749
b14f3a4c
PH
def _int(self, v, name, fatal=False, **kwargs):
    """Convert v with int_or_none(v, **kwargs), reporting failures.

    If the conversion yields None, ExtractorError is raised when fatal is
    true; otherwise a warning naming the field is reported and None is
    returned.
    """
    res = int_or_none(v, **kwargs)
    if res is None:
        msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
        if fatal:
            raise ExtractorError(msg)
        else:
            self._downloader.report_warning(msg)
    return res
761
def _float(self, v, name, fatal=False, **kwargs):
    """Convert v with float_or_none(v, **kwargs), reporting failures.

    If the conversion yields None, ExtractorError is raised when fatal is
    true; otherwise a warning naming the field is reported and None is
    returned.
    """
    res = float_or_none(v, **kwargs)
    if res is not None:
        return res
    msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
    if fatal:
        raise ExtractorError(msg)
    self._downloader.report_warning(msg)
    return res
771
8dbe9899 772
d6983cb4
PH
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Build the regex for '<_SEARCH_KEY><prefix>:<query>' pseudo URLs."""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Return True if url is a search query for this extractor."""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the search pseudo URL and fetch the requested results.

        An empty prefix asks for 1 result, 'all' for _MAX_RESULTS, and a
        number for that many (capped at _MAX_RESULTS, with a warning).
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = mobj.group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        """Read-only access to the class' _SEARCH_KEY."""
        return self._SEARCH_KEY