# coding: utf-8

import collections
import errno
import itertools
import io
import json
import operator
import os.path
import re
import socket
import string
import struct
import traceback
import xml.etree.ElementTree  # used by _get_available_automatic_caption below
import zlib

from .common import InfoExtractor, SearchInfoExtractor
from .subtitles import SubtitlesInfoExtractor
from ..utils import (
    compat_chr,
    compat_http_client,
    compat_parse_qs,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_request,
    compat_str,

    clean_html,
    get_element_by_id,
    ExtractorError,
    unescapeHTML,
    unified_strdate,
    orderedSet,
    write_json_file,
)

class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
39 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
40 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
41 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
42 _NETRC_MACHINE = 'youtube'
43 # If True it will raise an error if no login info is provided
44 _LOGIN_REQUIRED = False
45
46 def report_lang(self):
47 """Report attempt to set language."""
48 self.to_screen(u'Setting language')
49
50 def _set_language(self):
51 request = compat_urllib_request.Request(self._LANG_URL)
52 try:
53 self.report_lang()
54 compat_urllib_request.urlopen(request).read()
55 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
56 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
57 return False
58 return True
59
60 def _login(self):
61 (username, password) = self._get_login_info()
62 # No authentication to be performed
63 if username is None:
64 if self._LOGIN_REQUIRED:
65 raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
66 return False
67
68 request = compat_urllib_request.Request(self._LOGIN_URL)
69 try:
70 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
71 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
72 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
73 return False
74
75 galx = None
76 dsh = None
77 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
78 if match:
79 galx = match.group(1)
80 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
81 if match:
82 dsh = match.group(1)
c5e8d7af 83
b2e8bc1b
JMF
84 # Log in
85 login_form_strs = {
86 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
87 u'Email': username,
88 u'GALX': galx,
89 u'Passwd': password,
90 u'PersistentCookie': u'yes',
91 u'_utf8': u'霱',
92 u'bgresponse': u'js_disabled',
93 u'checkConnection': u'',
94 u'checkedDomains': u'youtube',
95 u'dnConn': u'',
96 u'dsh': dsh,
97 u'pstMsg': u'0',
98 u'rmShown': u'1',
99 u'secTok': u'',
100 u'signIn': u'Sign in',
101 u'timeStmp': u'',
102 u'service': u'youtube',
103 u'uilel': u'3',
104 u'hl': u'en_US',
105 }
106 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
107 # chokes on unicode
108 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
109 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
110 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
111 try:
112 self.report_login()
113 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
114 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
115 self._downloader.report_warning(u'unable to log in: bad username or password')
116 return False
117 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
118 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
119 return False
120 return True
121
122 def _confirm_age(self):
123 age_form = {
124 'next_url': '/',
125 'action_confirm': 'Confirm',
126 }
127 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
128 try:
129 self.report_age_confirmation()
130 compat_urllib_request.urlopen(request).read().decode('utf-8')
131 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
132 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
133 return True
134
135 def _real_initialize(self):
136 if self._downloader is None:
137 return
138 if not self._set_language():
139 return
140 if not self._login():
141 return
142 self._confirm_age()
c5e8d7af 143
8377574c 144
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)?    # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/                                          # just youtu.be/xxxx
                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                      # here it is! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
c5e8d7af 169 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
c5e8d7af 170 # Listed in order of quality
bdc6b3fc 171 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
96fb5605 172 # Apple HTTP Live Streaming
bdc6b3fc 173 '96', '95', '94', '93', '92', '132', '151',
939fbd26
JMF
174 # 3D
175 '85', '84', '102', '83', '101', '82', '100',
176 # Dash video
177 '138', '137', '248', '136', '247', '135', '246',
178 '245', '244', '134', '243', '133', '242', '160',
179 # Dash audio
180 '141', '172', '140', '171', '139',
1d043b93 181 ]
bdc6b3fc 182 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
96fb5605 183 # Apple HTTP Live Streaming
bdc6b3fc
AZ
184 '96', '95', '94', '93', '92', '132', '151',
185 # 3D
86fe61c8 186 '85', '102', '84', '101', '83', '100', '82',
939fbd26
JMF
187 # Dash video
188 '138', '248', '137', '247', '136', '246', '245',
189 '244', '135', '243', '134', '242', '133', '160',
190 # Dash audio
191 '172', '141', '171', '140', '139',
1d043b93 192 ]
bdc6b3fc
AZ
193 _video_formats_map = {
194 'flv': ['35', '34', '6', '5'],
195 '3gp': ['36', '17', '13'],
196 'mp4': ['38', '37', '22', '18'],
197 'webm': ['46', '45', '44', '43'],
198 }
c5e8d7af
PH
199 _video_extensions = {
200 '13': '3gp',
bdc6b3fc 201 '17': '3gp',
c5e8d7af
PH
202 '18': 'mp4',
203 '22': 'mp4',
bdc6b3fc 204 '36': '3gp',
c5e8d7af 205 '37': 'mp4',
d69cf69a 206 '38': 'mp4',
c5e8d7af
PH
207 '43': 'webm',
208 '44': 'webm',
209 '45': 'webm',
210 '46': 'webm',
1d043b93 211
86fe61c8
AZ
212 # 3d videos
213 '82': 'mp4',
214 '83': 'mp4',
215 '84': 'mp4',
216 '85': 'mp4',
217 '100': 'webm',
218 '101': 'webm',
219 '102': 'webm',
836a086c 220
96fb5605 221 # Apple HTTP Live Streaming
1d043b93
JMF
222 '92': 'mp4',
223 '93': 'mp4',
224 '94': 'mp4',
225 '95': 'mp4',
226 '96': 'mp4',
227 '132': 'mp4',
228 '151': 'mp4',
836a086c
AZ
229
230 # Dash mp4
231 '133': 'mp4',
232 '134': 'mp4',
233 '135': 'mp4',
234 '136': 'mp4',
235 '137': 'mp4',
236 '138': 'mp4',
237 '139': 'mp4',
238 '140': 'mp4',
239 '141': 'mp4',
240 '160': 'mp4',
241
242 # Dash webm
243 '171': 'webm',
244 '172': 'webm',
245 '242': 'webm',
246 '243': 'webm',
247 '244': 'webm',
248 '245': 'webm',
249 '246': 'webm',
250 '247': 'webm',
251 '248': 'webm',
c5e8d7af
PH
252 }
253 _video_dimensions = {
254 '5': '240x400',
255 '6': '???',
256 '13': '???',
257 '17': '144x176',
258 '18': '360x640',
259 '22': '720x1280',
260 '34': '360x640',
261 '35': '480x854',
bdc6b3fc 262 '36': '240x320',
c5e8d7af
PH
263 '37': '1080x1920',
264 '38': '3072x4096',
265 '43': '360x640',
266 '44': '480x854',
267 '45': '720x1280',
268 '46': '1080x1920',
86fe61c8
AZ
269 '82': '360p',
270 '83': '480p',
271 '84': '720p',
272 '85': '1080p',
1d043b93
JMF
273 '92': '240p',
274 '93': '360p',
275 '94': '480p',
276 '95': '720p',
277 '96': '1080p',
86fe61c8
AZ
278 '100': '360p',
279 '101': '480p',
836a086c 280 '102': '720p',
1d043b93
JMF
281 '132': '240p',
282 '151': '72p',
836a086c
AZ
283 '133': '240p',
284 '134': '360p',
285 '135': '480p',
286 '136': '720p',
287 '137': '1080p',
288 '138': '>1080p',
289 '139': '48k',
290 '140': '128k',
291 '141': '256k',
292 '160': '192p',
293 '171': '128k',
294 '172': '256k',
295 '242': '240p',
296 '243': '360p',
297 '244': '480p',
298 '245': '480p',
299 '246': '480p',
300 '247': '720p',
301 '248': '1080p',
c5e8d7af 302 }
836a086c
AZ
303 _special_itags = {
304 '82': '3D',
305 '83': '3D',
306 '84': '3D',
307 '85': '3D',
308 '100': '3D',
309 '101': '3D',
310 '102': '3D',
311 '133': 'DASH Video',
312 '134': 'DASH Video',
313 '135': 'DASH Video',
314 '136': 'DASH Video',
315 '137': 'DASH Video',
316 '138': 'DASH Video',
317 '139': 'DASH Audio',
318 '140': 'DASH Audio',
319 '141': 'DASH Audio',
320 '160': 'DASH Video',
321 '171': 'DASH Audio',
322 '172': 'DASH Audio',
323 '242': 'DASH Video',
324 '243': 'DASH Video',
325 '244': 'DASH Video',
326 '245': 'DASH Video',
327 '246': 'DASH Video',
328 '247': 'DASH Video',
329 '248': 'DASH Video',
c5e8d7af 330 }
836a086c 331
c5e8d7af 332 IE_NAME = u'youtube'
2eb88d95
PH
333 _TESTS = [
334 {
0e853ca4
PH
335 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
336 u"file": u"BaW_jenozKc.mp4",
337 u"info_dict": {
338 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
339 u"uploader": u"Philipp Hagemeister",
340 u"uploader_id": u"phihag",
341 u"upload_date": u"20121002",
342 u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
2eb88d95 343 }
0e853ca4
PH
344 },
345 {
346 u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
347 u"file": u"1ltcDfZMA3U.flv",
348 u"note": u"Test VEVO video (#897)",
349 u"info_dict": {
350 u"upload_date": u"20070518",
351 u"title": u"Maps - It Will Find You",
352 u"description": u"Music video by Maps performing It Will Find You.",
353 u"uploader": u"MuteUSA",
354 u"uploader_id": u"MuteUSA"
2eb88d95 355 }
0e853ca4
PH
356 },
357 {
358 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
359 u"file": u"UxxajLWwzqY.mp4",
360 u"note": u"Test generic use_cipher_signature video (#897)",
361 u"info_dict": {
362 u"upload_date": u"20120506",
363 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
c7bf7366 364 u"description": u"md5:3e2666e0a55044490499ea45fe9037b7",
45ed795c 365 u"uploader": u"Icona Pop",
0e853ca4 366 u"uploader_id": u"IconaPop"
2eb88d95 367 }
c108eb73
JMF
368 },
369 {
370 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
371 u"file": u"07FYdnEawAQ.mp4",
372 u"note": u"Test VEVO video with age protection (#956)",
373 u"info_dict": {
374 u"upload_date": u"20130703",
375 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
376 u"description": u"md5:64249768eec3bc4276236606ea996373",
377 u"uploader": u"justintimberlakeVEVO",
378 u"uploader_id": u"justintimberlakeVEVO"
379 }
380 },
1d043b93
JMF
381 {
382 u'url': u'https://www.youtube.com/watch?v=TGi3HqYrWHE',
383 u'file': u'TGi3HqYrWHE.mp4',
384 u'note': u'm3u8 video',
385 u'info_dict': {
386 u'title': u'Triathlon - Men - London 2012 Olympic Games',
387 u'description': u'- Men - TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games',
388 u'uploader': u'olympic',
389 u'upload_date': u'20120807',
390 u'uploader_id': u'olympic',
391 },
392 u'params': {
393 u'skip_download': True,
394 },
395 },
2eb88d95
PH
396 ]
397
c5e8d7af
PH
398
399 @classmethod
400 def suitable(cls, url):
401 """Receives a URL and returns True if suitable for this IE."""
e3ea4790 402 if YoutubePlaylistIE.suitable(url): return False
c5e8d7af
PH
403 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
404
e0df6211
PH
405 def __init__(self, *args, **kwargs):
406 super(YoutubeIE, self).__init__(*args, **kwargs)
83799698 407 self._player_cache = {}
e0df6211 408
c5e8d7af
PH
409 def report_video_webpage_download(self, video_id):
410 """Report attempt to download video webpage."""
411 self.to_screen(u'%s: Downloading video webpage' % video_id)
412
413 def report_video_info_webpage_download(self, video_id):
414 """Report attempt to download video info webpage."""
415 self.to_screen(u'%s: Downloading video info webpage' % video_id)
416
c5e8d7af
PH
417 def report_information_extraction(self, video_id):
418 """Report attempt to extract video information."""
419 self.to_screen(u'%s: Extracting video information' % video_id)
420
421 def report_unavailable_format(self, video_id, format):
422 """Report extracted video URL."""
423 self.to_screen(u'%s: Format %s not available' % (video_id, format))
424
425 def report_rtmp_download(self):
426 """Indicate the download will use the RTMP protocol."""
427 self.to_screen(u'RTMP download detected')
428
c4417ddb
PH
429 def _extract_signature_function(self, video_id, player_url, slen):
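        # Build a Python callable equivalent to the player's signature routine:
        # the player URL tells us whether it is the HTML5 (.js) or Flash (.swf)
        # player, and the result is cached per (player type, player id, signature length).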
430 id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
83799698 431 player_url)
e0df6211
PH
432 player_type = id_m.group('ext')
433 player_id = id_m.group('id')
434
c4417ddb
PH
435 # Read from filesystem cache
436 func_id = '%s_%s_%d' % (player_type, player_id, slen)
437 assert os.path.basename(func_id) == func_id
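        # The on-disk cache stores a flat permutation spec: a list of indices i
        # such that u''.join(s[i] for i in spec) reproduces func(s) for signatures
        # of this length; e.g. the spec [2, 0, 1] maps 'abc' to 'cab'.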
edf3e38e
PH
438 cache_dir = self._downloader.params.get('cachedir',
439 u'~/.youtube-dl/cache')
c4417ddb 440
edf3e38e 441 if cache_dir != u'NONE':
c4417ddb
PH
442 cache_fn = os.path.join(os.path.expanduser(cache_dir),
443 u'youtube-sigfuncs',
444 func_id + '.json')
445 try:
edf3e38e 446 with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
c4417ddb
PH
447 cache_spec = json.load(cachef)
448 return lambda s: u''.join(s[i] for i in cache_spec)
edf3e38e 449 except IOError:
c4417ddb 450 pass # No cache available
83799698 451
e0df6211
PH
452 if player_type == 'js':
453 code = self._download_webpage(
454 player_url, video_id,
83799698 455 note=u'Downloading %s player %s' % (player_type, player_id),
e0df6211 456 errnote=u'Download of %s failed' % player_url)
83799698 457 res = self._parse_sig_js(code)
c4417ddb 458 elif player_type == 'swf':
e0df6211
PH
459 urlh = self._request_webpage(
460 player_url, video_id,
83799698 461 note=u'Downloading %s player %s' % (player_type, player_id),
e0df6211
PH
462 errnote=u'Download of %s failed' % player_url)
463 code = urlh.read()
83799698 464 res = self._parse_sig_swf(code)
e0df6211
PH
465 else:
466 assert False, 'Invalid player type %r' % player_type
467
c4417ddb 468 if cache_dir is not False:
edf3e38e
PH
469 try:
470 cache_res = res(map(compat_chr, range(slen)))
471 cache_spec = [ord(c) for c in cache_res]
472 try:
473 os.makedirs(os.path.dirname(cache_fn))
474 except OSError as ose:
475 if ose.errno != errno.EEXIST:
476 raise
477 write_json_file(cache_spec, cache_fn)
478 except Exception as e:
479 tb = traceback.format_exc()
480 self._downloader.report_warning(
481 u'Writing cache to %r failed: %s' % (cache_fn, tb))
83799698
PH
482
483 return res
484
edf3e38e
PH
485 def _print_sig_code(self, func, slen):
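        # Probe the extracted function with a dummy signature of length slen and
        # print the equivalent Python expression (indexing and slicing only), as
        # driven by the 'youtube_print_sig_code' downloader parameter.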
486 def gen_sig_code(idxs):
487 def _genslice(start, end, step):
488 starts = u'' if start == 0 else str(start)
489 ends = u':%d' % (end+step)
490 steps = u'' if step == 1 else (':%d' % step)
491 return u's[%s%s%s]' % (starts, ends, steps)
492
493 step = None
494 for i, prev in zip(idxs[1:], idxs[:-1]):
495 if step is not None:
496 if i - prev == step:
497 continue
498 yield _genslice(start, prev, step)
499 step = None
500 continue
501 if i - prev in [-1, 1]:
502 step = i - prev
503 start = prev
504 continue
505 else:
506 yield u's[%d]' % prev
507 if step is None:
508 yield u's[%d]' % i
509 else:
510 yield _genslice(start, i, step)
511
512 cache_res = func(map(compat_chr, range(slen)))
513 cache_spec = [ord(c) for c in cache_res]
514 expr_code = u' + '.join(gen_sig_code(cache_spec))
515 code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
516 self.to_screen(u'Extracted signature:\n' + code)
517
e0df6211
PH
518 def _parse_sig_js(self, jscode):
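        # Locate the JS signature function by name, then interpret just enough
        # JavaScript (var/return statements, split/join/length/reverse/slice,
        # indexing, modulo and calls to helper functions) to execute it in Python.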
519 funcname = self._search_regex(
520 r'signature=([a-zA-Z]+)', jscode,
521 u'Initial JS player signature function name')
522
523 functions = {}
524
525 def argidx(varname):
526 return string.lowercase.index(varname)
527
        def interpret_statement(stmt, local_vars, allow_recursion=20):
            if allow_recursion < 0:
                raise ExtractorError(u'Recursion limit reached')
531
532 if stmt.startswith(u'var '):
533 stmt = stmt[len(u'var '):]
534 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
535 r'=(?P<expr>.*)$', stmt)
536 if ass_m:
537 if ass_m.groupdict().get('index'):
538 def assign(val):
539 lvar = local_vars[ass_m.group('out')]
540 idx = interpret_expression(ass_m.group('index'),
541 local_vars, allow_recursion)
542 assert isinstance(idx, int)
543 lvar[idx] = val
544 return val
545 expr = ass_m.group('expr')
546 else:
547 def assign(val):
548 local_vars[ass_m.group('out')] = val
549 return val
550 expr = ass_m.group('expr')
551 elif stmt.startswith(u'return '):
552 assign = lambda v: v
553 expr = stmt[len(u'return '):]
554 else:
555 raise ExtractorError(
556 u'Cannot determine left side of statement in %r' % stmt)
557
558 v = interpret_expression(expr, local_vars, allow_recursion)
559 return assign(v)
560
561 def interpret_expression(expr, local_vars, allow_recursion):
562 if expr.isdigit():
563 return int(expr)
564
565 if expr.isalpha():
566 return local_vars[expr]
567
568 m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
569 if m:
570 member = m.group('member')
571 val = local_vars[m.group('in')]
572 if member == 'split("")':
573 return list(val)
574 if member == 'join("")':
575 return u''.join(val)
576 if member == 'length':
577 return len(val)
578 if member == 'reverse()':
579 return val[::-1]
580 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
581 if slice_m:
582 idx = interpret_expression(
583 slice_m.group('idx'), local_vars, allow_recursion-1)
584 return val[idx:]
585
586 m = re.match(
587 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
588 if m:
589 val = local_vars[m.group('in')]
590 idx = interpret_expression(m.group('idx'), local_vars,
591 allow_recursion-1)
592 return val[idx]
593
594 m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
595 if m:
596 a = interpret_expression(m.group('a'),
597 local_vars, allow_recursion)
598 b = interpret_expression(m.group('b'),
599 local_vars, allow_recursion)
600 return a % b
601
602 m = re.match(
603 r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
604 if m:
605 fname = m.group('func')
606 if fname not in functions:
607 functions[fname] = extract_function(fname)
608 argvals = [int(v) if v.isdigit() else local_vars[v]
609 for v in m.group('args').split(',')]
610 return functions[fname](argvals)
611 raise ExtractorError(u'Unsupported JS expression %r' % expr)
612
613 def extract_function(funcname):
614 func_m = re.search(
615 r'function ' + re.escape(funcname) +
616 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
617 jscode)
618 argnames = func_m.group('args').split(',')
619
620 def resf(args):
621 local_vars = dict(zip(argnames, args))
622 for stmt in func_m.group('code').split(';'):
623 res = interpret_statement(stmt, local_vars)
624 return res
625 return resf
626
627 initial_function = extract_function(funcname)
628 return lambda s: initial_function([s])
629
630 def _parse_sig_swf(self, file_contents):
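        # Decompress the SWF, find the DoABC tag (code 82) and walk the AVM2
        # constant pool, class and method tables until we can interpret the
        # 'decipher' method of the SignatureDecipher class.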
631 if file_contents[1:3] != b'WS':
632 raise ExtractorError(
633 u'Not an SWF file; header is %r' % file_contents[:3])
634 if file_contents[:1] == b'C':
635 content = zlib.decompress(file_contents[8:])
636 else:
637 raise NotImplementedError(u'Unsupported compression format %r' %
638 file_contents[:1])
639
640 def extract_tags(content):
641 pos = 0
642 while pos < len(content):
643 header16 = struct.unpack('<H', content[pos:pos+2])[0]
644 pos += 2
645 tag_code = header16 >> 6
646 tag_len = header16 & 0x3f
647 if tag_len == 0x3f:
648 tag_len = struct.unpack('<I', content[pos:pos+4])[0]
649 pos += 4
650 assert pos+tag_len <= len(content)
651 yield (tag_code, content[pos:pos+tag_len])
652 pos += tag_len
653
654 code_tag = next(tag
655 for tag_code, tag in extract_tags(content)
656 if tag_code == 82)
657 p = code_tag.index(b'\0', 4) + 1
ba552f54 658 code_reader = io.BytesIO(code_tag[p:])
e0df6211
PH
659
660 # Parse ABC (AVM2 ByteCode)
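        # Integers use the AVM2 variable-length encoding: up to five bytes, seven
        # payload bits each, with the high bit signalling continuation;
        # e.g. the bytes 0x96 0x01 decode to 0x16 | (0x01 << 7) = 150.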
ba552f54
PH
661 def read_int(reader=None):
662 if reader is None:
663 reader = code_reader
e0df6211
PH
664 res = 0
665 shift = 0
666 for _ in range(5):
ba552f54
PH
667 buf = reader.read(1)
668 assert len(buf) == 1
669 b = struct.unpack('<B', buf)[0]
e0df6211
PH
670 res = res | ((b & 0x7f) << shift)
671 if b & 0x80 == 0:
672 break
673 shift += 7
ba552f54
PH
674 return res
675
676 def u30(reader=None):
677 res = read_int(reader)
678 assert res & 0xf0000000 == 0
e0df6211
PH
679 return res
680 u32 = read_int
681
ba552f54
PH
682 def s32(reader=None):
683 v = read_int(reader)
e0df6211
PH
684 if v & 0x80000000 != 0:
685 v = - ((v ^ 0xffffffff) + 1)
ba552f54
PH
686 return v
687
688 def string(reader=None):
689 if reader is None:
690 reader = code_reader
691 slen = u30(reader)
692 resb = reader.read(slen)
693 assert len(resb) == slen
694 return resb.decode('utf-8')
695
696 def read_bytes(count, reader=None):
697 if reader is None:
698 reader = code_reader
699 resb = reader.read(count)
700 assert len(resb) == count
701 return resb
702
703 def read_byte(reader=None):
704 resb = read_bytes(1, reader=reader)
705 res = struct.unpack('<B', resb)[0]
706 return res
e0df6211
PH
707
708 # minor_version + major_version
2f2ffea9 709 _ = read_bytes(2 + 2)
e0df6211
PH
710
711 # Constant pool
ba552f54 712 int_count = u30()
e0df6211 713 for _c in range(1, int_count):
ba552f54
PH
714 _ = s32()
715 uint_count = u30()
e0df6211 716 for _c in range(1, uint_count):
ba552f54
PH
717 _ = u32()
718 double_count = u30()
719 _ = read_bytes((double_count-1) * 8)
720 string_count = u30()
e0df6211
PH
721 constant_strings = [u'']
722 for _c in range(1, string_count):
ba552f54 723 s = string()
e0df6211 724 constant_strings.append(s)
ba552f54 725 namespace_count = u30()
e0df6211 726 for _c in range(1, namespace_count):
ba552f54
PH
727 _ = read_bytes(1) # kind
728 _ = u30() # name
729 ns_set_count = u30()
e0df6211 730 for _c in range(1, ns_set_count):
ba552f54 731 count = u30()
e0df6211 732 for _c2 in range(count):
ba552f54
PH
733 _ = u30()
734 multiname_count = u30()
e0df6211
PH
735 MULTINAME_SIZES = {
736 0x07: 2, # QName
737 0x0d: 2, # QNameA
738 0x0f: 1, # RTQName
739 0x10: 1, # RTQNameA
740 0x11: 0, # RTQNameL
741 0x12: 0, # RTQNameLA
742 0x09: 2, # Multiname
743 0x0e: 2, # MultinameA
744 0x1b: 1, # MultinameL
745 0x1c: 1, # MultinameLA
746 }
747 multinames = [u'']
748 for _c in range(1, multiname_count):
ba552f54 749 kind = u30()
e0df6211
PH
750 assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
751 if kind == 0x07:
ba552f54
PH
752 namespace_idx = u30()
753 name_idx = u30()
e0df6211
PH
754 multinames.append(constant_strings[name_idx])
755 else:
756 multinames.append('[MULTINAME kind: %d]' % kind)
757 for _c2 in range(MULTINAME_SIZES[kind]):
ba552f54 758 _ = u30()
e0df6211
PH
759
760 # Methods
ba552f54 761 method_count = u30()
e0df6211
PH
762 MethodInfo = collections.namedtuple(
763 'MethodInfo',
764 ['NEED_ARGUMENTS', 'NEED_REST'])
765 method_infos = []
766 for method_id in range(method_count):
ba552f54
PH
767 param_count = u30()
768 _ = u30() # return type
e0df6211 769 for _ in range(param_count):
ba552f54
PH
770 _ = u30() # param type
771 _ = u30() # name index (always 0 for youtube)
772 flags = read_byte()
e0df6211
PH
773 if flags & 0x08 != 0:
774 # Options present
ba552f54 775 option_count = u30()
e0df6211 776 for c in range(option_count):
ba552f54
PH
777 _ = u30() # val
778 _ = read_bytes(1) # kind
e0df6211
PH
779 if flags & 0x80 != 0:
780 # Param names present
781 for _ in range(param_count):
ba552f54 782 _ = u30() # param name
e0df6211
PH
783 mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
784 method_infos.append(mi)
785
786 # Metadata
ba552f54 787 metadata_count = u30()
e0df6211 788 for _c in range(metadata_count):
ba552f54
PH
789 _ = u30() # name
790 item_count = u30()
e0df6211 791 for _c2 in range(item_count):
ba552f54
PH
792 _ = u30() # key
793 _ = u30() # value
794
795 def parse_traits_info():
796 trait_name_idx = u30()
797 kind_full = read_byte()
e0df6211
PH
798 kind = kind_full & 0x0f
799 attrs = kind_full >> 4
800 methods = {}
801 if kind in [0x00, 0x06]: # Slot or Const
ba552f54
PH
802 _ = u30() # Slot id
803 type_name_idx = u30()
804 vindex = u30()
e0df6211 805 if vindex != 0:
ba552f54 806 _ = read_byte() # vkind
e0df6211 807 elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
ba552f54
PH
808 _ = u30() # disp_id
809 method_idx = u30()
e0df6211
PH
810 methods[multinames[trait_name_idx]] = method_idx
811 elif kind == 0x04: # Class
ba552f54
PH
812 _ = u30() # slot_id
813 _ = u30() # classi
e0df6211 814 elif kind == 0x05: # Function
ba552f54
PH
815 _ = u30() # slot_id
816 function_idx = u30()
e0df6211
PH
817 methods[function_idx] = multinames[trait_name_idx]
818 else:
819 raise ExtractorError(u'Unsupported trait kind %d' % kind)
820
821 if attrs & 0x4 != 0: # Metadata present
ba552f54 822 metadata_count = u30()
e0df6211 823 for _c3 in range(metadata_count):
ba552f54 824 _ = u30()
e0df6211 825
ba552f54 826 return methods
e0df6211
PH
827
828 # Classes
829 TARGET_CLASSNAME = u'SignatureDecipher'
830 searched_idx = multinames.index(TARGET_CLASSNAME)
831 searched_class_id = None
ba552f54 832 class_count = u30()
e0df6211 833 for class_id in range(class_count):
ba552f54 834 name_idx = u30()
e0df6211
PH
835 if name_idx == searched_idx:
836 # We found the class we're looking for!
837 searched_class_id = class_id
ba552f54
PH
838 _ = u30() # super_name idx
839 flags = read_byte()
e0df6211 840 if flags & 0x08 != 0: # Protected namespace is present
ba552f54
PH
841 protected_ns_idx = u30()
842 intrf_count = u30()
e0df6211 843 for _c2 in range(intrf_count):
ba552f54
PH
844 _ = u30()
845 _ = u30() # iinit
846 trait_count = u30()
e0df6211 847 for _c2 in range(trait_count):
ba552f54 848 _ = parse_traits_info()
e0df6211
PH
849
850 if searched_class_id is None:
851 raise ExtractorError(u'Target class %r not found' %
852 TARGET_CLASSNAME)
853
854 method_names = {}
855 method_idxs = {}
856 for class_id in range(class_count):
ba552f54
PH
857 _ = u30() # cinit
858 trait_count = u30()
e0df6211 859 for _c2 in range(trait_count):
ba552f54 860 trait_methods = parse_traits_info()
e0df6211
PH
861 if class_id == searched_class_id:
862 method_names.update(trait_methods.items())
863 method_idxs.update(dict(
864 (idx, name)
865 for name, idx in trait_methods.items()))
866
867 # Scripts
ba552f54 868 script_count = u30()
e0df6211 869 for _c in range(script_count):
ba552f54
PH
870 _ = u30() # init
871 trait_count = u30()
e0df6211 872 for _c2 in range(trait_count):
ba552f54 873 _ = parse_traits_info()
e0df6211
PH
874
875 # Method bodies
ba552f54 876 method_body_count = u30()
e0df6211
PH
877 Method = collections.namedtuple('Method', ['code', 'local_count'])
878 methods = {}
879 for _c in range(method_body_count):
ba552f54
PH
880 method_idx = u30()
881 max_stack = u30()
882 local_count = u30()
883 init_scope_depth = u30()
884 max_scope_depth = u30()
885 code_length = u30()
886 code = read_bytes(code_length)
e0df6211 887 if method_idx in method_idxs:
ba552f54 888 m = Method(code, local_count)
e0df6211 889 methods[method_idxs[method_idx]] = m
ba552f54 890 exception_count = u30()
e0df6211 891 for _c2 in range(exception_count):
ba552f54
PH
892 _ = u30() # from
893 _ = u30() # to
894 _ = u30() # target
895 _ = u30() # exc_type
896 _ = u30() # var_name
897 trait_count = u30()
e0df6211 898 for _c2 in range(trait_count):
ba552f54 899 _ = parse_traits_info()
e0df6211 900
ba552f54 901 assert p + code_reader.tell() == len(code_tag)
e0df6211
PH
902 assert len(methods) == len(method_idxs)
903
904 method_pyfunctions = {}
905
906 def extract_function(func_name):
907 if func_name in method_pyfunctions:
908 return method_pyfunctions[func_name]
909 if func_name not in methods:
910 raise ExtractorError(u'Cannot find function %r' % func_name)
911 m = methods[func_name]
912
913 def resfunc(args):
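                # Minimal AVM2 stack-machine interpreter: only the opcodes that
                # YouTube's decipher routine actually uses are implemented;
                # anything else raises NotImplementedError.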
e0df6211
PH
914 registers = ['(this)'] + list(args) + [None] * m.local_count
915 stack = []
916 coder = io.BytesIO(m.code)
917 while True:
918 opcode = struct.unpack('!B', coder.read(1))[0]
a7177865 919 if opcode == 36: # pushbyte
e0df6211
PH
920 v = struct.unpack('!B', coder.read(1))[0]
921 stack.append(v)
922 elif opcode == 44: # pushstring
923 idx = u30(coder)
924 stack.append(constant_strings[idx])
925 elif opcode == 48: # pushscope
926 # We don't implement the scope register, so we'll just
927 # ignore the popped value
928 stack.pop()
929 elif opcode == 70: # callproperty
930 index = u30(coder)
931 mname = multinames[index]
932 arg_count = u30(coder)
933 args = list(reversed(
934 [stack.pop() for _ in range(arg_count)]))
935 obj = stack.pop()
936 if mname == u'split':
937 assert len(args) == 1
938 assert isinstance(args[0], compat_str)
939 assert isinstance(obj, compat_str)
940 if args[0] == u'':
941 res = list(obj)
942 else:
943 res = obj.split(args[0])
944 stack.append(res)
a7177865
PH
945 elif mname == u'slice':
946 assert len(args) == 1
947 assert isinstance(args[0], int)
948 assert isinstance(obj, list)
949 res = obj[args[0]:]
950 stack.append(res)
951 elif mname == u'join':
952 assert len(args) == 1
953 assert isinstance(args[0], compat_str)
954 assert isinstance(obj, list)
955 res = args[0].join(obj)
956 stack.append(res)
e0df6211
PH
957 elif mname in method_pyfunctions:
958 stack.append(method_pyfunctions[mname](args))
959 else:
960 raise NotImplementedError(
961 u'Unsupported property %r on %r'
962 % (mname, obj))
a7177865
PH
963 elif opcode == 72: # returnvalue
964 res = stack.pop()
965 return res
966 elif opcode == 79: # callpropvoid
967 index = u30(coder)
968 mname = multinames[index]
969 arg_count = u30(coder)
970 args = list(reversed(
971 [stack.pop() for _ in range(arg_count)]))
972 obj = stack.pop()
973 if mname == u'reverse':
974 assert isinstance(obj, list)
975 obj.reverse()
976 else:
977 raise NotImplementedError(
978 u'Unsupported (void) property %r on %r'
979 % (mname, obj))
e0df6211
PH
980 elif opcode == 93: # findpropstrict
981 index = u30(coder)
982 mname = multinames[index]
983 res = extract_function(mname)
984 stack.append(res)
985 elif opcode == 97: # setproperty
986 index = u30(coder)
987 value = stack.pop()
988 idx = stack.pop()
989 obj = stack.pop()
990 assert isinstance(obj, list)
991 assert isinstance(idx, int)
992 obj[idx] = value
993 elif opcode == 98: # getlocal
994 index = u30(coder)
995 stack.append(registers[index])
996 elif opcode == 99: # setlocal
997 index = u30(coder)
998 value = stack.pop()
999 registers[index] = value
1000 elif opcode == 102: # getproperty
1001 index = u30(coder)
1002 pname = multinames[index]
1003 if pname == u'length':
1004 obj = stack.pop()
1005 assert isinstance(obj, list)
1006 stack.append(len(obj))
1007 else: # Assume attribute access
1008 idx = stack.pop()
1009 assert isinstance(idx, int)
1010 obj = stack.pop()
1011 assert isinstance(obj, list)
1012 stack.append(obj[idx])
1013 elif opcode == 128: # coerce
1014 _ = u30(coder)
1015 elif opcode == 133: # coerce_s
1016 assert isinstance(stack[-1], (type(None), compat_str))
1017 elif opcode == 164: # modulo
1018 value2 = stack.pop()
1019 value1 = stack.pop()
1020 res = value1 % value2
1021 stack.append(res)
a7177865
PH
1022 elif opcode == 208: # getlocal_0
1023 stack.append(registers[0])
1024 elif opcode == 209: # getlocal_1
1025 stack.append(registers[1])
1026 elif opcode == 210: # getlocal_2
1027 stack.append(registers[2])
1028 elif opcode == 211: # getlocal_3
1029 stack.append(registers[3])
e0df6211
PH
1030 elif opcode == 214: # setlocal_2
1031 registers[2] = stack.pop()
1032 elif opcode == 215: # setlocal_3
1033 registers[3] = stack.pop()
1034 else:
1035 raise NotImplementedError(
1036 u'Unsupported opcode %d' % opcode)
1037
1038 method_pyfunctions[func_name] = resfunc
1039 return resfunc
1040
1041 initial_function = extract_function(u'decipher')
1042 return lambda s: initial_function([s])
1043
83799698 1044 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 1045 """Turn the encrypted s field into a working signature"""
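        # Preferred path: derive the deciphering function from the actual player
        # (cached per player URL); if that fails for any reason, fall back to the
        # static, length-keyed algorithms below.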
6b37f0be 1046
83799698 1047 if player_url is not None:
e0df6211 1048 try:
83799698
PH
1049 if player_url not in self._player_cache:
1050 func = self._extract_signature_function(
c4417ddb 1051 video_id, player_url, len(s)
e0df6211 1052 )
83799698 1053 self._player_cache[player_url] = func
edf3e38e
PH
1054 func = self._player_cache[player_url]
1055 if self._downloader.params.get('youtube_print_sig_code'):
1056 self._print_sig_code(func, len(s))
1057 return func(s)
e0df6211
PH
1058 except Exception as e:
1059 tb = traceback.format_exc()
83799698
PH
1060 self._downloader.report_warning(
1061 u'Automatic signature extraction failed: ' + tb)
e0df6211 1062
83799698
PH
1063 self._downloader.report_warning(
1064 u'Warning: Falling back to static signature algorithm')
2f2ffea9
PH
1065 return self._static_decrypt_signature(
1066 s, video_id, player_url, age_gate)
e0df6211 1067
2f2ffea9 1068 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
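        # Hard-coded fallback permutations keyed by signature length; these go
        # stale whenever YouTube changes the player, hence the error suggesting
        # a retry when the length is unknown.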
e0df6211
PH
1069 if age_gate:
1070 # The videos with age protection use another player, so the
1071 # algorithms can be different.
1072 if len(s) == 86:
1073 return s[2:63] + s[82] + s[64:82] + s[63]
1074
1075 if len(s) == 92:
444b1165
JMF
1076 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
1077 elif len(s) == 90:
1078 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
8a9d86a2 1079 elif len(s) == 89:
1080 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
444b1165 1081 elif len(s) == 88:
3e223834 1082 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
be547e1d 1083 elif len(s) == 87:
3a725669 1084 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
be547e1d 1085 elif len(s) == 86:
1cf911bc 1086 return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53]
be547e1d 1087 elif len(s) == 85:
6ae8ee3f 1088 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
be547e1d 1089 elif len(s) == 84:
23b00bc0 1090 return s[81:36:-1] + s[0] + s[35:2:-1]
be547e1d 1091 elif len(s) == 83:
e1842025 1092 return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]
be547e1d 1093 elif len(s) == 82:
ce85f022 1094 return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54]
be547e1d 1095 elif len(s) == 81:
aedd6bb9 1096 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
066090dd
JMF
1097 elif len(s) == 80:
1098 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
5c468ca8
JMF
1099 elif len(s) == 79:
1100 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
be547e1d
PH
1101
1102 else:
1103 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
c5e8d7af 1104
75952c6e
JMF
    def _decrypt_signature_age_gate(self, s):
        # The videos with age protection use another player, so the algorithms
        # can be different.
        if len(s) == 86:
            return s[2:63] + s[82] + s[64:82] + s[63]
        else:
            # Fallback to the other algorithms
            return self._decrypt_signature(s)
c5e8d7af 1113
de7f3446 1114 def _get_available_subtitles(self, video_id):
de7f3446 1115 try:
7fad1c63
JMF
1116 sub_list = self._download_webpage(
1117 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1118 video_id, note=False)
1119 except ExtractorError as err:
de7f3446
JMF
1120 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
1121 return {}
1122 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
1123
1124 sub_lang_list = {}
1125 for l in lang_list:
1126 lang = l[1]
1127 params = compat_urllib_parse.urlencode({
1128 'lang': lang,
1129 'v': video_id,
1130 'fmt': self._downloader.params.get('subtitlesformat'),
1131 })
1132 url = u'http://www.youtube.com/api/timedtext?' + params
1133 sub_lang_list[lang] = url
1134 if not sub_lang_list:
1135 self._downloader.report_warning(u'video doesn\'t have subtitles')
1136 return {}
1137 return sub_lang_list
1138
055e6f36 1139 def _get_available_automatic_caption(self, video_id, webpage):
de7f3446
JMF
1140 """We need the webpage for getting the captions url, pass it as an
1141 argument to speed up the process."""
de7f3446
JMF
1142 sub_format = self._downloader.params.get('subtitlesformat')
1143 self.to_screen(u'%s: Looking for automatic captions' % video_id)
1144 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
055e6f36 1145 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
de7f3446
JMF
1146 if mobj is None:
1147 self._downloader.report_warning(err_msg)
1148 return {}
1149 player_config = json.loads(mobj.group(1))
1150 try:
1151 args = player_config[u'args']
1152 caption_url = args[u'ttsurl']
1153 timestamp = args[u'timestamp']
055e6f36
JMF
1154 # We get the available subtitles
1155 list_params = compat_urllib_parse.urlencode({
1156 'type': 'list',
1157 'tlangs': 1,
1158 'asrs': 1,
de7f3446 1159 })
055e6f36
JMF
1160 list_url = caption_url + '&' + list_params
1161 list_page = self._download_webpage(list_url, video_id)
1162 caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
e3dc22ca
JMF
            original_lang_node = caption_list.find('track')
            if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr':
                self._downloader.report_warning(u'Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']
055e6f36
JMF
1168
1169 sub_lang_list = {}
1170 for lang_node in caption_list.findall('target'):
1171 sub_lang = lang_node.attrib['lang_code']
1172 params = compat_urllib_parse.urlencode({
1173 'lang': original_lang,
1174 'tlang': sub_lang,
1175 'fmt': sub_format,
1176 'ts': timestamp,
1177 'kind': 'asr',
1178 })
1179 sub_lang_list[sub_lang] = caption_url + '&' + params
1180 return sub_lang_list
        # An ExtractorError can be raised by the download process if there are
        # no automatic captions but there are subtitles
1183 except (KeyError, ExtractorError):
1184 self._downloader.report_warning(err_msg)
1185 return {}
1186
c5e8d7af
PH
1187 def _print_formats(self, formats):
1188 print('Available formats:')
1189 for x in formats:
03cc7c20
JMF
1190 print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
1191 self._video_dimensions.get(x, '???'),
836a086c 1192 ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
c5e8d7af
PH
1193
1194 def _extract_id(self, url):
1195 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1196 if mobj is None:
1197 raise ExtractorError(u'Invalid URL: %s' % url)
1198 video_id = mobj.group(2)
1199 return video_id
1200
1d043b93
JMF
1201 def _get_video_url_list(self, url_map):
1202 """
1203 Transform a dictionary in the format {itag:url} to a list of (itag, url)
1204 with the requested formats.
1205 """
1206 req_format = self._downloader.params.get('format', None)
1207 format_limit = self._downloader.params.get('format_limit', None)
1208 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1209 if format_limit is not None and format_limit in available_formats:
1210 format_list = available_formats[available_formats.index(format_limit):]
1211 else:
1212 format_list = available_formats
1213 existing_formats = [x for x in format_list if x in url_map]
1214 if len(existing_formats) == 0:
1215 raise ExtractorError(u'no known formats available for video')
1216 if self._downloader.params.get('listformats', None):
1217 self._print_formats(existing_formats)
1218 return
1219 if req_format is None or req_format == 'best':
1220 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1221 elif req_format == 'worst':
1222 video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
1223 elif req_format in ('-1', 'all'):
1224 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1225 else:
            # Specific formats. We pick the first in a slash-delimited sequence.
            # A format can be specified as an itag or as 'mp4', 'flv' etc.; we pick
            # the highest quality available in the specified container. For example:
            # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2';
            # if '1/mp4/3/4' is requested and '1' and '5' (an mp4) are available, we pick '1';
            # if '1/mp4/3/4' is requested and '4' and '5' (an mp4) are available, we pick '5'.
1d043b93
JMF
1232 req_formats = req_format.split('/')
1233 video_url_list = None
1234 for rf in req_formats:
1235 if rf in url_map:
1236 video_url_list = [(rf, url_map[rf])]
1237 break
bdc6b3fc
AZ
1238 if rf in self._video_formats_map:
1239 for srf in self._video_formats_map[rf]:
1240 if srf in url_map:
1241 video_url_list = [(srf, url_map[srf])]
1242 break
1243 else:
1244 continue
1245 break
1d043b93
JMF
1246 if video_url_list is None:
1247 raise ExtractorError(u'requested format not available')
1248 return video_url_list
1249
1250 def _extract_from_m3u8(self, manifest_url, video_id):
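        # Download the HLS master playlist and map each variant stream URL back
        # to an itag, which is parsed out of the URL path itself.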
1251 url_map = {}
1252 def _get_urls(_manifest):
1253 lines = _manifest.split('\n')
1254 urls = filter(lambda l: l and not l.startswith('#'),
1255 lines)
1256 return urls
1257 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1258 formats_urls = _get_urls(manifest)
1259 for format_url in formats_urls:
890f62e8 1260 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1d043b93
JMF
1261 url_map[itag] = format_url
1262 return url_map
1263
c5e8d7af 1264 def _real_extract(self, url):
d7f44b5b
PH
1265 if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
1266 self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')
1267
c5e8d7af
PH
1268 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1269 mobj = re.search(self._NEXT_URL_RE, url)
1270 if mobj:
1271 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
1272 video_id = self._extract_id(url)
1273
1274 # Get video webpage
1275 self.report_video_webpage_download(video_id)
1276 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
1277 request = compat_urllib_request.Request(url)
1278 try:
1279 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
1280 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1281 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
1282
1283 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
1284
1285 # Attempt to extract SWF player URL
e0df6211 1286 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
c5e8d7af
PH
1287 if mobj is not None:
1288 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1289 else:
1290 player_url = None
1291
1292 # Get video info
1293 self.report_video_info_webpage_download(video_id)
c108eb73
JMF
1294 if re.search(r'player-age-gate-content">', video_webpage) is not None:
1295 self.report_age_confirmation()
1296 age_gate = True
            # We simulate the access to the video from www.youtube.com/v/{video_id};
            # this page can be viewed without logging in to YouTube
1299 data = compat_urllib_parse.urlencode({'video_id': video_id,
1300 'el': 'embedded',
1301 'gl': 'US',
1302 'hl': 'en',
1303 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1304 'asv': 3,
1305 'sts':'1588',
1306 })
1307 video_info_url = 'https://www.youtube.com/get_video_info?' + data
c5e8d7af
PH
1308 video_info_webpage = self._download_webpage(video_info_url, video_id,
1309 note=False,
1310 errnote='unable to download video info webpage')
1311 video_info = compat_parse_qs(video_info_webpage)
c108eb73
JMF
1312 else:
1313 age_gate = False
1314 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1315 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1316 % (video_id, el_type))
1317 video_info_webpage = self._download_webpage(video_info_url, video_id,
1318 note=False,
1319 errnote='unable to download video info webpage')
1320 video_info = compat_parse_qs(video_info_webpage)
1321 if 'token' in video_info:
1322 break
c5e8d7af
PH
1323 if 'token' not in video_info:
1324 if 'reason' in video_info:
9a82b238 1325 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
c5e8d7af
PH
1326 else:
1327 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
1328
1329 # Check for "rental" videos
1330 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1331 raise ExtractorError(u'"rental" videos not supported')
1332
1333 # Start extracting information
1334 self.report_information_extraction(video_id)
1335
1336 # uploader
1337 if 'author' not in video_info:
1338 raise ExtractorError(u'Unable to extract uploader name')
1339 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
1340
1341 # uploader_id
1342 video_uploader_id = None
1343 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1344 if mobj is not None:
1345 video_uploader_id = mobj.group(1)
1346 else:
1347 self._downloader.report_warning(u'unable to extract uploader nickname')
1348
1349 # title
1350 if 'title' not in video_info:
1351 raise ExtractorError(u'Unable to extract video title')
1352 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
1353
1354 # thumbnail image
7763b04e
JMF
1355 # We try first to get a high quality image:
1356 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1357 video_webpage, re.DOTALL)
1358 if m_thumb is not None:
1359 video_thumbnail = m_thumb.group(1)
1360 elif 'thumbnail_url' not in video_info:
c5e8d7af
PH
1361 self._downloader.report_warning(u'unable to extract video thumbnail')
1362 video_thumbnail = ''
1363 else: # don't panic if we can't find it
1364 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
1365
1366 # upload date
1367 upload_date = None
1368 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1369 if mobj is not None:
1370 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1371 upload_date = unified_strdate(upload_date)
1372
1373 # description
1374 video_description = get_element_by_id("eow-description", video_webpage)
1375 if video_description:
1376 video_description = clean_html(video_description)
1377 else:
1378 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1379 if fd_mobj:
1380 video_description = unescapeHTML(fd_mobj.group(1))
1381 else:
1382 video_description = u''
1383
1384 # subtitles
d82134c3 1385 video_subtitles = self.extract_subtitles(video_id, video_webpage)
c5e8d7af 1386
c5e8d7af 1387 if self._downloader.params.get('listsubtitles', False):
d665f8d3 1388 self._list_available_subtitles(video_id, video_webpage)
c5e8d7af
PH
1389 return
1390
1391 if 'length_seconds' not in video_info:
1392 self._downloader.report_warning(u'unable to extract video duration')
1393 video_duration = ''
1394 else:
1395 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
1396
c5e8d7af 1397 # Decide which formats to download
c5e8d7af
PH
1398
1399 try:
1400 mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
50be92c1
PH
1401 if not mobj:
1402 raise ValueError('Could not find vevo ID')
c5e8d7af
PH
1403 info = json.loads(mobj.group(1))
1404 args = info['args']
            # Easy way to know if the 's' value is in url_encoded_fmt_stream_map;
            # these signatures are encrypted
            m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
            if m_s is not None:
                self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
c5e8d7af 1410 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
cde846b3 1411 m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
b7a68384 1412 if m_s is not None:
37b6d5f6
AZ
1413 if 'url_encoded_fmt_stream_map' in video_info:
1414 video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
1415 else:
1416 video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
211fbc13 1417 elif 'adaptive_fmts' in video_info:
37b6d5f6
AZ
1418 if 'url_encoded_fmt_stream_map' in video_info:
1419 video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
1420 else:
1421 video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']
c5e8d7af
PH
1422 except ValueError:
1423 pass
1424
1425 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1426 self.report_rtmp_download()
1427 video_url_list = [(None, video_info['conn'][0])]
1428 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
a7055eb9
JMF
1429 if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
1430 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
c5e8d7af
PH
1431 url_map = {}
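            # Each entry of url_encoded_fmt_stream_map is itself a URL-encoded
            # query string; collect itag -> final URL, appending the (possibly
            # deciphered) signature and ratebypass where needed.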
1432 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
1433 url_data = compat_parse_qs(url_data_str)
1434 if 'itag' in url_data and 'url' in url_data:
1435 url = url_data['url'][0]
1436 if 'sig' in url_data:
1437 url += '&signature=' + url_data['sig'][0]
1438 elif 's' in url_data:
e0df6211 1439 encrypted_sig = url_data['s'][0]
769fda3c 1440 if self._downloader.params.get('verbose'):
c108eb73 1441 if age_gate:
83799698
PH
1442 player_version = self._search_regex(
1443 r'-(.+)\.swf$',
1444 player_url if player_url else None,
e0df6211
PH
1445 'flash player', fatal=False)
1446 player_desc = 'flash player %s' % player_version
c108eb73 1447 else:
83799698
PH
1448 player_version = self._search_regex(
1449 r'html5player-(.+?)\.js', video_webpage,
c108eb73 1450 'html5 player', fatal=False)
e0df6211
PH
1451 player_desc = u'html5 player %s' % player_version
1452
1453 parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
5a76c651 1454 self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
e0df6211
PH
1455 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
1456
83799698 1457 if not age_gate:
e0df6211
PH
1458 jsplayer_url_json = self._search_regex(
1459 r'"assets":.+?"js":\s*("[^"]+")',
1460 video_webpage, u'JS player URL')
83799698 1461 player_url = json.loads(jsplayer_url_json)
e0df6211 1462
83799698
PH
1463 signature = self._decrypt_signature(
1464 encrypted_sig, video_id, player_url, age_gate)
c5e8d7af
PH
1465 url += '&signature=' + signature
1466 if 'ratebypass' not in url:
1467 url += '&ratebypass=yes'
1468 url_map[url_data['itag'][0]] = url
1d043b93
JMF
1469 video_url_list = self._get_video_url_list(url_map)
1470 if not video_url_list:
c5e8d7af 1471 return
1d043b93
JMF
1472 elif video_info.get('hlsvp'):
1473 manifest_url = video_info['hlsvp'][0]
1474 url_map = self._extract_from_m3u8(manifest_url, video_id)
1475 video_url_list = self._get_video_url_list(url_map)
1476 if not video_url_list:
1477 return
1478
c5e8d7af
PH
1479 else:
1480 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
1481
1482 results = []
1483 for format_param, video_real_url in video_url_list:
1484 # Extension
1485 video_extension = self._video_extensions.get(format_param, 'flv')
1486
03cc7c20
JMF
1487 video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
1488 self._video_dimensions.get(format_param, '???'),
836a086c 1489 ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')
c5e8d7af
PH
1490
1491 results.append({
1492 'id': video_id,
1493 'url': video_real_url,
1494 'uploader': video_uploader,
1495 'uploader_id': video_uploader_id,
1496 'upload_date': upload_date,
1497 'title': video_title,
1498 'ext': video_extension,
1499 'format': video_format,
1500 'thumbnail': video_thumbnail,
1501 'description': video_description,
1502 'player_url': player_url,
1503 'subtitles': video_subtitles,
1504 'duration': video_duration
1505 })
1506 return results
1507
class YoutubePlaylistIE(InfoExtractor):
    IE_DESC = u'YouTube.com playlists'
c5e8d7af
PH
1510 _VALID_URL = r"""(?:
1511 (?:https?://)?
1512 (?:\w+\.)?
1513 youtube\.com/
1514 (?:
1515 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1516 \? (?:.*?&)*? (?:p|a|list)=
1517 | p/
1518 )
c626a3d9 1519 ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
c5e8d7af
PH
1520 .*
1521 |
c626a3d9 1522 ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
c5e8d7af
PH
1523 )"""
1524 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
1525 _MAX_RESULTS = 50
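    # Pages are fetched _MAX_RESULTS entries at a time via start-index; the loop
    # stops when the feed has no more entries or start-index reaches the API's
    # 1000-result limit (see the warning in _real_extract).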
1526 IE_NAME = u'youtube:playlist'
1527
1528 @classmethod
1529 def suitable(cls, url):
1530 """Receives a URL and returns True if suitable for this IE."""
1531 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1532
1533 def _real_extract(self, url):
1534 # Extract playlist id
1535 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1536 if mobj is None:
1537 raise ExtractorError(u'Invalid URL: %s' % url)
1538
1539 # Download playlist videos from API
1540 playlist_id = mobj.group(1) or mobj.group(2)
c5e8d7af
PH
1541 videos = []
1542
755eb032 1543 for page_num in itertools.count(1):
771822eb
JMF
1544 start_index = self._MAX_RESULTS * (page_num - 1) + 1
1545 if start_index >= 1000:
1546 self._downloader.report_warning(u'Max number of results reached')
1547 break
1548 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
c5e8d7af
PH
1549 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
1550
1551 try:
1552 response = json.loads(page)
1553 except ValueError as err:
1554 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
1555
1556 if 'feed' not in response:
1557 raise ExtractorError(u'Got a malformed response from YouTube API')
1558 playlist_title = response['feed']['title']['$t']
1559 if 'entry' not in response['feed']:
1560 # Number of videos is a multiple of self._MAX_RESULTS
1561 break
1562
1563 for entry in response['feed']['entry']:
1564 index = entry['yt$position']['$t']
c215217e
JMF
1565 if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
1566 videos.append((
1567 index,
1568 'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']
1569 ))
c5e8d7af 1570
c5e8d7af
PH
1571 videos = [v[1] for v in sorted(videos)]
1572
20c3893f 1573 url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
c5e8d7af
PH
1574 return [self.playlist_result(url_results, playlist_id, playlist_title)]
1575
1576
1577class YoutubeChannelIE(InfoExtractor):
0f818663 1578 IE_DESC = u'YouTube.com channels'
c5e8d7af
PH
1579 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1580 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1581 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
252580c5 1582 _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
c5e8d7af
PH
1583 IE_NAME = u'youtube:channel'
1584
1585 def extract_videos_from_page(self, page):
1586 ids_in_page = []
1587 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1588 if mobj.group(1) not in ids_in_page:
1589 ids_in_page.append(mobj.group(1))
1590 return ids_in_page
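# A minimal sketch of the intended behaviour on a hypothetical page snippet:
#   extract_videos_from_page('<a href="/watch?v=abc123DEF45&feature=foo">')
#   would return ['abc123DEF45']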
1591
1592 def _real_extract(self, url):
1593 # Extract channel id
1594 mobj = re.match(self._VALID_URL, url)
1595 if mobj is None:
1596 raise ExtractorError(u'Invalid URL: %s' % url)
1597
1598 # Download channel page
1599 channel_id = mobj.group(1)
1600 video_ids = []
1601 pagenum = 1
1602
1603 url = self._TEMPLATE_URL % (channel_id, pagenum)
1604 page = self._download_webpage(url, channel_id,
1605 u'Downloading page #%s' % pagenum)
1606
1607 # Extract video identifiers
1608 ids_in_page = self.extract_videos_from_page(page)
1609 video_ids.extend(ids_in_page)
1610
1611 # Download any subsequent channel pages using the json-based channel_ajax query
1612 if self._MORE_PAGES_INDICATOR in page:
755eb032 1613 for pagenum in itertools.count(1):
c5e8d7af
PH
1614 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1615 page = self._download_webpage(url, channel_id,
1616 u'Downloading page #%s' % pagenum)
1617
1618 page = json.loads(page)
1619
1620 ids_in_page = self.extract_videos_from_page(page['content_html'])
1621 video_ids.extend(ids_in_page)
1622
1623 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1624 break
1625
1626 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1627
1628 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
20c3893f 1629 url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
c5e8d7af
PH
1630 return [self.playlist_result(url_entries, channel_id)]
1631
1632
1633class YoutubeUserIE(InfoExtractor):
0f818663 1634 IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
faab1d38 1635 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
c5e8d7af
PH
1636 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1637 _GDATA_PAGE_SIZE = 50
fd9cf738 1638 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
c5e8d7af
PH
1639 IE_NAME = u'youtube:user'
1640
e3ea4790 1641 @classmethod
f4b05232 1642 def suitable(cls, url):
e3ea4790
JMF
1643 # Don't return True if the url can be extracted with another youtube
1644 # extractor; the regex is too permissive and it would match.
1645 other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
1646 if any(ie.suitable(url) for ie in other_ies): return False
f4b05232
JMF
1647 else: return super(YoutubeUserIE, cls).suitable(url)
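# e.g. a plain watch URL such as https://www.youtube.com/watch?v=xxxxxxxxxxx (hypothetical id)
# would match the permissive regex above, but is deferred to the dedicated extractor by this check.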
1648
c5e8d7af
PH
1649 def _real_extract(self, url):
1650 # Extract username
1651 mobj = re.match(self._VALID_URL, url)
1652 if mobj is None:
1653 raise ExtractorError(u'Invalid URL: %s' % url)
1654
1655 username = mobj.group(1)
1656
1657 # Download video ids using the YouTube Data API. The result size per
1658 # query is limited (currently to 50 videos), so we need to query
1659 # page by page until no more video ids are returned - at that point
1660 # we have got all of them.
1661
1662 video_ids = []
c5e8d7af 1663
755eb032 1664 for pagenum in itertools.count(0):
c5e8d7af
PH
1665 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
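# e.g. with _GDATA_PAGE_SIZE == 50: pagenum 0 -> start_index 1, pagenum 1 -> 51, pagenum 2 -> 101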
1666
1667 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1668 page = self._download_webpage(gdata_url, username,
1669 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1670
fd9cf738
JMF
1671 try:
1672 response = json.loads(page)
1673 except ValueError as err:
1674 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
71c82637
JMF
1675 if 'entry' not in response['feed']:
1676 # Number of videos is a multiple of self._GDATA_PAGE_SIZE
1677 break
fd9cf738 1678
c5e8d7af
PH
1679 # Extract video identifiers
1680 ids_in_page = []
fd9cf738
JMF
1681 for entry in response['feed']['entry']:
1682 ids_in_page.append(entry['id']['$t'].split('/')[-1])
c5e8d7af
PH
1683 video_ids.extend(ids_in_page)
1684
1685 # A little optimization - if the current page is not
1686 # "full", i.e. it contains fewer than _GDATA_PAGE_SIZE video ids,
1687 # then we can assume that this page is the last one - there
1688 # are no more ids on further pages - no need to query
1689 # again.
1690
1691 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1692 break
1693
c5e8d7af 1694 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
20c3893f 1695 url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
c5e8d7af 1696 return [self.playlist_result(url_results, playlist_title = username)]
b05654f0
PH
1697
1698class YoutubeSearchIE(SearchInfoExtractor):
0f818663 1699 IE_DESC = u'YouTube.com searches'
b05654f0
PH
1700 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1701 _MAX_RESULTS = 1000
1702 IE_NAME = u'youtube:search'
1703 _SEARCH_KEY = 'ytsearch'
1704
1705 def report_download_page(self, query, pagenum):
1706 """Report attempt to download search page with given number."""
1707 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1708
1709 def _get_n_results(self, query, n):
1710 """Get a specified number of results for a query"""
1711
1712 video_ids = []
1713 pagenum = 0
1714 limit = n
1715
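# Each API request below returns at most 50 results; e.g. for n = 120 (assuming the query
# has at least that many matches) pages 0, 1 and 2 would be fetched.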
1716 while (50 * pagenum) < limit:
1717 self.report_download_page(query, pagenum+1)
1718 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1719 request = compat_urllib_request.Request(result_url)
1720 try:
1721 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1722 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1723 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1724 api_response = json.loads(data)['data']
1725
1726 if 'items' not in api_response:
1727 raise ExtractorError(u'[youtube] No video results')
1728
1729 new_ids = list(video['id'] for video in api_response['items'])
1730 video_ids += new_ids
1731
1732 limit = min(n, api_response['totalItems'])
1733 pagenum += 1
1734
1735 if len(video_ids) > n:
1736 video_ids = video_ids[:n]
1737 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1738 return self.playlist_result(videos, query)
75dff0ee
JMF
1739
1740
1741class YoutubeShowIE(InfoExtractor):
0f818663 1742 IE_DESC = u'YouTube.com (multi-season) shows'
75dff0ee
JMF
1743 _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
1744 IE_NAME = u'youtube:show'
1745
1746 def _real_extract(self, url):
1747 mobj = re.match(self._VALID_URL, url)
1748 show_name = mobj.group(1)
1749 webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
1750 # There's one playlist for each season of the show
1751 m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
1752 self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
1753 return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist') for season in m_seasons]
04cc9617
JMF
1754
1755
b2e8bc1b 1756class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
d7ae0639
JMF
1757 """
1758 Base class for extractors that fetch info from
1759 http://www.youtube.com/feed_ajax
1760 Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
1761 """
b2e8bc1b 1762 _LOGIN_REQUIRED = True
04cc9617 1763 _PAGING_STEP = 30
43ba5456
JMF
1764 # use action_load_personal_feed instead of action_load_system_feed
1765 _PERSONAL_FEED = False
04cc9617 1766
d7ae0639
JMF
1767 @property
1768 def _FEED_TEMPLATE(self):
43ba5456
JMF
1769 action = 'action_load_system_feed'
1770 if self._PERSONAL_FEED:
1771 action = 'action_load_personal_feed'
1772 return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)
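# For example, a subclass with _FEED_NAME = 'subscriptions' and the default
# _PERSONAL_FEED = False would build (illustrative):
#   http://www.youtube.com/feed_ajax?action_load_system_feed=1&feed_name=subscriptions&paging=%s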
d7ae0639
JMF
1773
1774 @property
1775 def IE_NAME(self):
1776 return u'youtube:%s' % self._FEED_NAME
04cc9617 1777
81f0259b 1778 def _real_initialize(self):
b2e8bc1b 1779 self._login()
81f0259b 1780
04cc9617
JMF
1781 def _real_extract(self, url):
1782 feed_entries = []
1783 # itertools.count()'s step argument is only available in Python 2.7+, so compute the offset manually
1784 for i in itertools.count(0):
1785 paging = i*self._PAGING_STEP
d7ae0639
JMF
1786 info = self._download_webpage(self._FEED_TEMPLATE % paging,
1787 u'%s feed' % self._FEED_NAME,
04cc9617
JMF
1788 u'Downloading page %s' % i)
1789 info = json.loads(info)
1790 feed_html = info['feed_html']
43ba5456 1791 m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
04cc9617
JMF
1792 ids = orderedSet(m.group(1) for m in m_ids)
1793 feed_entries.extend(self.url_result(id, 'Youtube') for id in ids)
1794 if info['paging'] is None:
1795 break
d7ae0639
JMF
1796 return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1797
1798class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
1799 IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
1800 _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
1801 _FEED_NAME = 'subscriptions'
1802 _PLAYLIST_TITLE = u'Youtube Subscriptions'
1803
1804class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
1805 IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
1806 _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
1807 _FEED_NAME = 'recommended'
1808 _PLAYLIST_TITLE = u'Youtube Recommended videos'
c626a3d9 1809
43ba5456
JMF
1810class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
1811 IE_DESC = u'YouTube.com watch later list, "ytwatchlater" keyword (requires authentication)'
1812 _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
1813 _FEED_NAME = 'watch_later'
1814 _PLAYLIST_TITLE = u'Youtube Watch Later'
1815 _PAGING_STEP = 100
1816 _PERSONAL_FEED = True
c626a3d9
JMF
1817
1818class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
1819 IE_NAME = u'youtube:favorites'
1820 IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
c7a7750d 1821 _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
c626a3d9
JMF
1822 _LOGIN_REQUIRED = True
1823
1824 def _real_extract(self, url):
1825 webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
1826 playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, u'favourites playlist id')
1827 return self.url_result(playlist_id, 'YoutubePlaylist')
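# Note: url_result is given the bare playlist id; this works because YoutubePlaylistIE's
# _VALID_URL also accepts a raw (PL|EC|UU|FL)-prefixed identifier, which the favourites
# list id is assumed to be here.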