# coding: utf-8

import collections
import errno
import itertools
import io
import json
import operator
import os.path
import re
import socket
import string
import struct
import traceback
import xml.etree.ElementTree
import zlib

from .common import InfoExtractor, SearchInfoExtractor
from .subtitles import SubtitlesInfoExtractor
from ..utils import (
    compat_chr,
    compat_http_client,
    compat_parse_qs,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_request,
    compat_str,

    clean_html,
    get_element_by_id,
    ExtractorError,
    unescapeHTML,
    unified_strdate,
    orderedSet,
    write_json_file,
)

class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def _set_language(self):
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return False
        return True

    def _login(self):
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return False

        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'dsh': dsh,
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return False
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return False
        return True

    def _confirm_age(self):
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
        return True

    def _real_initialize(self):
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()


de7f3446 145class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
0f818663 146 IE_DESC = u'YouTube.com'
c5e8d7af
PH
147 _VALID_URL = r"""^
148 (
149 (?:https?://)? # http(s):// (optional)
f4b05232 150 (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
e69ae5b9
JMF
151 tube\.majestyc\.net/|
152 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
153 (?:.*?\#/)? # handle anchor (#/) redirect urls
154 (?: # the various things that can precede the ID:
155 (?:(?:v|embed|e)/) # v/ or embed/ or e/
156 |(?: # or the v= param in all its forms
d741e55a 157 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
158 (?:\?|\#!?) # the params delimiter ? or # or #!
159 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
160 v=
161 )
f4b05232
JMF
162 ))
163 |youtu\.be/ # just youtu.be/xxxx
164 )
c5e8d7af 165 )? # all until now is optional -> you can pass the naked ID
8963d9c2 166 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
c5e8d7af
PH
167 (?(1).+)? # if we found the ID, everything can follow
168 $"""
c5e8d7af 169 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
c5e8d7af 170 # Listed in order of quality
bdc6b3fc 171 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
96fb5605 172 # Apple HTTP Live Streaming
bdc6b3fc 173 '96', '95', '94', '93', '92', '132', '151',
939fbd26
JMF
174 # 3D
175 '85', '84', '102', '83', '101', '82', '100',
176 # Dash video
177 '138', '137', '248', '136', '247', '135', '246',
178 '245', '244', '134', '243', '133', '242', '160',
179 # Dash audio
180 '141', '172', '140', '171', '139',
1d043b93 181 ]
bdc6b3fc 182 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
96fb5605 183 # Apple HTTP Live Streaming
bdc6b3fc
AZ
184 '96', '95', '94', '93', '92', '132', '151',
185 # 3D
86fe61c8 186 '85', '102', '84', '101', '83', '100', '82',
939fbd26
JMF
187 # Dash video
188 '138', '248', '137', '247', '136', '246', '245',
189 '244', '135', '243', '134', '242', '133', '160',
190 # Dash audio
191 '172', '141', '171', '140', '139',
1d043b93 192 ]
bdc6b3fc
AZ
193 _video_formats_map = {
194 'flv': ['35', '34', '6', '5'],
195 '3gp': ['36', '17', '13'],
196 'mp4': ['38', '37', '22', '18'],
197 'webm': ['46', '45', '44', '43'],
198 }
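    # For instance, when the user requests the 'mp4' container,
    # _get_video_url_list() below expands it via this map to the itags
    # '38', '37', '22', '18' and picks the first one that is actually
    # available for the video.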
199 _video_extensions = {
200 '13': '3gp',
bdc6b3fc 201 '17': '3gp',
c5e8d7af
PH
202 '18': 'mp4',
203 '22': 'mp4',
bdc6b3fc 204 '36': '3gp',
c5e8d7af 205 '37': 'mp4',
d69cf69a 206 '38': 'mp4',
c5e8d7af
PH
207 '43': 'webm',
208 '44': 'webm',
209 '45': 'webm',
210 '46': 'webm',
1d043b93 211
86fe61c8
AZ
212 # 3d videos
213 '82': 'mp4',
214 '83': 'mp4',
215 '84': 'mp4',
216 '85': 'mp4',
217 '100': 'webm',
218 '101': 'webm',
219 '102': 'webm',
836a086c 220
96fb5605 221 # Apple HTTP Live Streaming
1d043b93
JMF
222 '92': 'mp4',
223 '93': 'mp4',
224 '94': 'mp4',
225 '95': 'mp4',
226 '96': 'mp4',
227 '132': 'mp4',
228 '151': 'mp4',
836a086c
AZ
229
230 # Dash mp4
231 '133': 'mp4',
232 '134': 'mp4',
233 '135': 'mp4',
234 '136': 'mp4',
235 '137': 'mp4',
236 '138': 'mp4',
237 '139': 'mp4',
238 '140': 'mp4',
239 '141': 'mp4',
240 '160': 'mp4',
241
242 # Dash webm
243 '171': 'webm',
244 '172': 'webm',
245 '242': 'webm',
246 '243': 'webm',
247 '244': 'webm',
248 '245': 'webm',
249 '246': 'webm',
250 '247': 'webm',
251 '248': 'webm',
c5e8d7af
PH
252 }
253 _video_dimensions = {
254 '5': '240x400',
255 '6': '???',
256 '13': '???',
257 '17': '144x176',
258 '18': '360x640',
259 '22': '720x1280',
260 '34': '360x640',
261 '35': '480x854',
bdc6b3fc 262 '36': '240x320',
c5e8d7af
PH
263 '37': '1080x1920',
264 '38': '3072x4096',
265 '43': '360x640',
266 '44': '480x854',
267 '45': '720x1280',
268 '46': '1080x1920',
86fe61c8
AZ
269 '82': '360p',
270 '83': '480p',
271 '84': '720p',
272 '85': '1080p',
1d043b93
JMF
273 '92': '240p',
274 '93': '360p',
275 '94': '480p',
276 '95': '720p',
277 '96': '1080p',
86fe61c8
AZ
278 '100': '360p',
279 '101': '480p',
836a086c 280 '102': '720p',
1d043b93
JMF
281 '132': '240p',
282 '151': '72p',
836a086c
AZ
283 '133': '240p',
284 '134': '360p',
285 '135': '480p',
286 '136': '720p',
287 '137': '1080p',
288 '138': '>1080p',
289 '139': '48k',
290 '140': '128k',
291 '141': '256k',
292 '160': '192p',
293 '171': '128k',
294 '172': '256k',
295 '242': '240p',
296 '243': '360p',
297 '244': '480p',
298 '245': '480p',
299 '246': '480p',
300 '247': '720p',
301 '248': '1080p',
c5e8d7af 302 }
836a086c
AZ
303 _special_itags = {
304 '82': '3D',
305 '83': '3D',
306 '84': '3D',
307 '85': '3D',
308 '100': '3D',
309 '101': '3D',
310 '102': '3D',
311 '133': 'DASH Video',
312 '134': 'DASH Video',
313 '135': 'DASH Video',
314 '136': 'DASH Video',
315 '137': 'DASH Video',
316 '138': 'DASH Video',
317 '139': 'DASH Audio',
318 '140': 'DASH Audio',
319 '141': 'DASH Audio',
320 '160': 'DASH Video',
321 '171': 'DASH Audio',
322 '172': 'DASH Audio',
323 '242': 'DASH Video',
324 '243': 'DASH Video',
325 '244': 'DASH Video',
326 '245': 'DASH Video',
327 '246': 'DASH Video',
328 '247': 'DASH Video',
329 '248': 'DASH Video',
c5e8d7af 330 }
836a086c 331
c5e8d7af 332 IE_NAME = u'youtube'
2eb88d95
PH
333 _TESTS = [
334 {
0e853ca4
PH
335 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
336 u"file": u"BaW_jenozKc.mp4",
337 u"info_dict": {
338 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
339 u"uploader": u"Philipp Hagemeister",
340 u"uploader_id": u"phihag",
341 u"upload_date": u"20121002",
342 u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
2eb88d95 343 }
0e853ca4
PH
344 },
345 {
346 u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
347 u"file": u"1ltcDfZMA3U.flv",
348 u"note": u"Test VEVO video (#897)",
349 u"info_dict": {
350 u"upload_date": u"20070518",
351 u"title": u"Maps - It Will Find You",
352 u"description": u"Music video by Maps performing It Will Find You.",
353 u"uploader": u"MuteUSA",
354 u"uploader_id": u"MuteUSA"
2eb88d95 355 }
0e853ca4
PH
356 },
357 {
358 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
359 u"file": u"UxxajLWwzqY.mp4",
360 u"note": u"Test generic use_cipher_signature video (#897)",
361 u"info_dict": {
362 u"upload_date": u"20120506",
363 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
c7bf7366 364 u"description": u"md5:3e2666e0a55044490499ea45fe9037b7",
45ed795c 365 u"uploader": u"Icona Pop",
0e853ca4 366 u"uploader_id": u"IconaPop"
2eb88d95 367 }
c108eb73
JMF
368 },
369 {
370 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
371 u"file": u"07FYdnEawAQ.mp4",
372 u"note": u"Test VEVO video with age protection (#956)",
373 u"info_dict": {
374 u"upload_date": u"20130703",
375 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
376 u"description": u"md5:64249768eec3bc4276236606ea996373",
377 u"uploader": u"justintimberlakeVEVO",
378 u"uploader_id": u"justintimberlakeVEVO"
379 }
380 },
1d043b93
JMF
381 {
382 u'url': u'https://www.youtube.com/watch?v=TGi3HqYrWHE',
383 u'file': u'TGi3HqYrWHE.mp4',
384 u'note': u'm3u8 video',
385 u'info_dict': {
386 u'title': u'Triathlon - Men - London 2012 Olympic Games',
387 u'description': u'- Men - TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games',
388 u'uploader': u'olympic',
389 u'upload_date': u'20120807',
390 u'uploader_id': u'olympic',
391 },
392 u'params': {
393 u'skip_download': True,
394 },
395 },
2eb88d95
PH
396 ]
397
c5e8d7af
PH
398
399 @classmethod
400 def suitable(cls, url):
401 """Receives a URL and returns True if suitable for this IE."""
e3ea4790 402 if YoutubePlaylistIE.suitable(url): return False
c5e8d7af
PH
403 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
404
e0df6211
PH
405 def __init__(self, *args, **kwargs):
406 super(YoutubeIE, self).__init__(*args, **kwargs)
83799698 407 self._player_cache = {}
e0df6211 408
c5e8d7af
PH
409 def report_video_webpage_download(self, video_id):
410 """Report attempt to download video webpage."""
411 self.to_screen(u'%s: Downloading video webpage' % video_id)
412
413 def report_video_info_webpage_download(self, video_id):
414 """Report attempt to download video info webpage."""
415 self.to_screen(u'%s: Downloading video info webpage' % video_id)
416
c5e8d7af
PH
417 def report_information_extraction(self, video_id):
418 """Report attempt to extract video information."""
419 self.to_screen(u'%s: Extracting video information' % video_id)
420
421 def report_unavailable_format(self, video_id, format):
422 """Report extracted video URL."""
423 self.to_screen(u'%s: Format %s not available' % (video_id, format))
424
425 def report_rtmp_download(self):
426 """Indicate the download will use the RTMP protocol."""
427 self.to_screen(u'RTMP download detected')
428
c4417ddb
PH
429 def _extract_signature_function(self, video_id, player_url, slen):
430 id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
83799698 431 player_url)
e0df6211
PH
432 player_type = id_m.group('ext')
433 player_id = id_m.group('id')
434
c4417ddb
PH
435 # Read from filesystem cache
436 func_id = '%s_%s_%d' % (player_type, player_id, slen)
437 assert os.path.basename(func_id) == func_id
edf3e38e
PH
438 cache_dir = self._downloader.params.get('cachedir',
439 u'~/.youtube-dl/cache')
c4417ddb 440
edf3e38e 441 if cache_dir != u'NONE':
c4417ddb
PH
442 cache_fn = os.path.join(os.path.expanduser(cache_dir),
443 u'youtube-sigfuncs',
444 func_id + '.json')
445 try:
edf3e38e 446 with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
c4417ddb
PH
447 cache_spec = json.load(cachef)
448 return lambda s: u''.join(s[i] for i in cache_spec)
edf3e38e 449 except IOError:
c4417ddb 450 pass # No cache available
83799698 451
e0df6211
PH
452 if player_type == 'js':
453 code = self._download_webpage(
454 player_url, video_id,
83799698 455 note=u'Downloading %s player %s' % (player_type, player_id),
e0df6211 456 errnote=u'Download of %s failed' % player_url)
83799698 457 res = self._parse_sig_js(code)
c4417ddb 458 elif player_type == 'swf':
e0df6211
PH
459 urlh = self._request_webpage(
460 player_url, video_id,
83799698 461 note=u'Downloading %s player %s' % (player_type, player_id),
e0df6211
PH
462 errnote=u'Download of %s failed' % player_url)
463 code = urlh.read()
83799698 464 res = self._parse_sig_swf(code)
e0df6211
PH
465 else:
466 assert False, 'Invalid player type %r' % player_type
467
c4417ddb 468 if cache_dir is not False:
edf3e38e
PH
469 try:
470 cache_res = res(map(compat_chr, range(slen)))
471 cache_spec = [ord(c) for c in cache_res]
472 try:
473 os.makedirs(os.path.dirname(cache_fn))
474 except OSError as ose:
475 if ose.errno != errno.EEXIST:
476 raise
477 write_json_file(cache_spec, cache_fn)
478 except Exception as e:
479 tb = traceback.format_exc()
480 self._downloader.report_warning(
481 u'Writing cache to %r failed: %s' % (cache_fn, tb))
83799698
PH
482
483 return res
484
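    # Note on the cache format used above: for a given player and signature
    # length only the resulting permutation is stored, as a list of source
    # indices.  A (hypothetical) cached spec such as [2, 3, 4, 5, 0, 1] is
    # enough to rebuild the function as
    #
    #   lambda s: u''.join(s[i] for i in cache_spec)
    #
    # which is exactly what the cache hit above returns.
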
485 def _print_sig_code(self, func, slen):
486 def gen_sig_code(idxs):
487 def _genslice(start, end, step):
488 starts = u'' if start == 0 else str(start)
489 ends = u':%d' % (end+step)
490 steps = u'' if step == 1 else (':%d' % step)
491 return u's[%s%s%s]' % (starts, ends, steps)
492
493 step = None
494 for i, prev in zip(idxs[1:], idxs[:-1]):
495 if step is not None:
496 if i - prev == step:
497 continue
498 yield _genslice(start, prev, step)
499 step = None
500 continue
501 if i - prev in [-1, 1]:
502 step = i - prev
503 start = prev
504 continue
505 else:
506 yield u's[%d]' % prev
507 if step is None:
508 yield u's[%d]' % i
509 else:
510 yield _genslice(start, i, step)
511
512 cache_res = func(map(compat_chr, range(slen)))
513 cache_spec = [ord(c) for c in cache_res]
514 expr_code = u' + '.join(gen_sig_code(cache_spec))
515 code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
516 self.to_screen(u'Extracted signature:\n' + code)
517
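    # A (hypothetical) example of the output: if the deciphering function
    # reordered a 6-character signature to the source indices [2, 3, 4, 5, 0, 1],
    # gen_sig_code() collapses the two ascending runs into slices and the
    # printed, length-guarded expression is
    #
    #   s[2:6] + s[:2]
    #
    # i.e. the same form used by _static_decrypt_signature() below.
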
518 def _parse_sig_js(self, jscode):
519 funcname = self._search_regex(
520 r'signature=([a-zA-Z]+)', jscode,
521 u'Initial JS player signature function name')
522
523 functions = {}
524
525 def argidx(varname):
526 return string.lowercase.index(varname)
527
        def interpret_statement(stmt, local_vars, allow_recursion=20):
            if allow_recursion < 0:
                raise ExtractorError(u'Recursion limit reached')

532 if stmt.startswith(u'var '):
533 stmt = stmt[len(u'var '):]
534 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
535 r'=(?P<expr>.*)$', stmt)
536 if ass_m:
537 if ass_m.groupdict().get('index'):
538 def assign(val):
539 lvar = local_vars[ass_m.group('out')]
540 idx = interpret_expression(ass_m.group('index'),
541 local_vars, allow_recursion)
542 assert isinstance(idx, int)
543 lvar[idx] = val
544 return val
545 expr = ass_m.group('expr')
546 else:
547 def assign(val):
548 local_vars[ass_m.group('out')] = val
549 return val
550 expr = ass_m.group('expr')
551 elif stmt.startswith(u'return '):
552 assign = lambda v: v
553 expr = stmt[len(u'return '):]
554 else:
555 raise ExtractorError(
556 u'Cannot determine left side of statement in %r' % stmt)
557
558 v = interpret_expression(expr, local_vars, allow_recursion)
559 return assign(v)
560
561 def interpret_expression(expr, local_vars, allow_recursion):
562 if expr.isdigit():
563 return int(expr)
564
565 if expr.isalpha():
566 return local_vars[expr]
567
568 m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
569 if m:
570 member = m.group('member')
571 val = local_vars[m.group('in')]
572 if member == 'split("")':
573 return list(val)
574 if member == 'join("")':
575 return u''.join(val)
576 if member == 'length':
577 return len(val)
578 if member == 'reverse()':
579 return val[::-1]
580 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
581 if slice_m:
582 idx = interpret_expression(
583 slice_m.group('idx'), local_vars, allow_recursion-1)
584 return val[idx:]
585
586 m = re.match(
587 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
588 if m:
589 val = local_vars[m.group('in')]
590 idx = interpret_expression(m.group('idx'), local_vars,
591 allow_recursion-1)
592 return val[idx]
593
594 m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
595 if m:
596 a = interpret_expression(m.group('a'),
597 local_vars, allow_recursion)
598 b = interpret_expression(m.group('b'),
599 local_vars, allow_recursion)
600 return a % b
601
602 m = re.match(
603 r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
604 if m:
605 fname = m.group('func')
606 if fname not in functions:
607 functions[fname] = extract_function(fname)
608 argvals = [int(v) if v.isdigit() else local_vars[v]
609 for v in m.group('args').split(',')]
610 return functions[fname](argvals)
611 raise ExtractorError(u'Unsupported JS expression %r' % expr)
612
613 def extract_function(funcname):
614 func_m = re.search(
615 r'function ' + re.escape(funcname) +
616 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
617 jscode)
618 argnames = func_m.group('args').split(',')
619
620 def resf(args):
621 local_vars = dict(zip(argnames, args))
622 for stmt in func_m.group('code').split(';'):
623 res = interpret_statement(stmt, local_vars)
624 return res
625 return resf
626
627 initial_function = extract_function(funcname)
628 return lambda s: initial_function([s])
629
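    # A minimal sketch of the kind of player code _parse_sig_js is written for
    # (hypothetical, not an excerpt from a real player):
    #
    #   function mx(a){a=a.split("");a=a.reverse();a=a.slice(2);return a.join("")}
    #   ...signature=mx(s)...
    #
    # extract_function(u'mx') interprets the statements one by one, so the
    # callable returned above would map u'abcdef' to u'dcba'.
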
630 def _parse_sig_swf(self, file_contents):
631 if file_contents[1:3] != b'WS':
632 raise ExtractorError(
633 u'Not an SWF file; header is %r' % file_contents[:3])
634 if file_contents[:1] == b'C':
635 content = zlib.decompress(file_contents[8:])
636 else:
637 raise NotImplementedError(u'Unsupported compression format %r' %
638 file_contents[:1])
639
640 def extract_tags(content):
641 pos = 0
642 while pos < len(content):
643 header16 = struct.unpack('<H', content[pos:pos+2])[0]
644 pos += 2
645 tag_code = header16 >> 6
646 tag_len = header16 & 0x3f
647 if tag_len == 0x3f:
648 tag_len = struct.unpack('<I', content[pos:pos+4])[0]
649 pos += 4
650 assert pos+tag_len <= len(content)
651 yield (tag_code, content[pos:pos+tag_len])
652 pos += tag_len
653
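        # Each SWF tag starts with a 16-bit header: the upper 10 bits are the
        # tag code and the lower 6 bits the length (0x3f meaning "long tag,
        # 32-bit length follows").  For example, a DoABC tag (code 82) with a
        # 10-byte body would be announced by header16 = 82 * 64 + 10 = 0x148a,
        # stored little-endian as the bytes 8a 14.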
654 code_tag = next(tag
655 for tag_code, tag in extract_tags(content)
656 if tag_code == 82)
657 p = code_tag.index(b'\0', 4) + 1
ba552f54 658 code_reader = io.BytesIO(code_tag[p:])
e0df6211
PH
659
660 # Parse ABC (AVM2 ByteCode)
ba552f54
PH
661 def read_int(reader=None):
662 if reader is None:
663 reader = code_reader
e0df6211
PH
664 res = 0
665 shift = 0
666 for _ in range(5):
ba552f54
PH
667 buf = reader.read(1)
668 assert len(buf) == 1
669 b = struct.unpack('<B', buf)[0]
e0df6211
PH
670 res = res | ((b & 0x7f) << shift)
671 if b & 0x80 == 0:
672 break
673 shift += 7
ba552f54
PH
674 return res
675
676 def u30(reader=None):
677 res = read_int(reader)
678 assert res & 0xf0000000 == 0
e0df6211
PH
679 return res
680 u32 = read_int
681
ba552f54
PH
682 def s32(reader=None):
683 v = read_int(reader)
e0df6211
PH
684 if v & 0x80000000 != 0:
685 v = - ((v ^ 0xffffffff) + 1)
ba552f54
PH
686 return v
687
688 def string(reader=None):
689 if reader is None:
690 reader = code_reader
691 slen = u30(reader)
692 resb = reader.read(slen)
693 assert len(resb) == slen
694 return resb.decode('utf-8')
695
696 def read_bytes(count, reader=None):
697 if reader is None:
698 reader = code_reader
699 resb = reader.read(count)
700 assert len(resb) == count
701 return resb
702
703 def read_byte(reader=None):
704 resb = read_bytes(1, reader=reader)
705 res = struct.unpack('<B', resb)[0]
706 return res
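
        # The u30/u32/s32 readers above decode the ABC variable-length integer
        # format: 7 data bits per byte, with the high bit flagging a
        # continuation byte.  For example the byte sequence 96 01 decodes to
        # (0x96 & 0x7f) | (0x01 << 7) = 22 + 128 = 150.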
707
708 # minor_version + major_version
2f2ffea9 709 _ = read_bytes(2 + 2)
e0df6211
PH
710
711 # Constant pool
ba552f54 712 int_count = u30()
e0df6211 713 for _c in range(1, int_count):
ba552f54
PH
714 _ = s32()
715 uint_count = u30()
e0df6211 716 for _c in range(1, uint_count):
ba552f54
PH
717 _ = u32()
718 double_count = u30()
719 _ = read_bytes((double_count-1) * 8)
720 string_count = u30()
e0df6211
PH
721 constant_strings = [u'']
722 for _c in range(1, string_count):
ba552f54 723 s = string()
e0df6211 724 constant_strings.append(s)
ba552f54 725 namespace_count = u30()
e0df6211 726 for _c in range(1, namespace_count):
ba552f54
PH
727 _ = read_bytes(1) # kind
728 _ = u30() # name
729 ns_set_count = u30()
e0df6211 730 for _c in range(1, ns_set_count):
ba552f54 731 count = u30()
e0df6211 732 for _c2 in range(count):
ba552f54
PH
733 _ = u30()
734 multiname_count = u30()
e0df6211
PH
735 MULTINAME_SIZES = {
736 0x07: 2, # QName
737 0x0d: 2, # QNameA
738 0x0f: 1, # RTQName
739 0x10: 1, # RTQNameA
740 0x11: 0, # RTQNameL
741 0x12: 0, # RTQNameLA
742 0x09: 2, # Multiname
743 0x0e: 2, # MultinameA
744 0x1b: 1, # MultinameL
745 0x1c: 1, # MultinameLA
746 }
747 multinames = [u'']
748 for _c in range(1, multiname_count):
ba552f54 749 kind = u30()
e0df6211
PH
750 assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
751 if kind == 0x07:
ba552f54
PH
752 namespace_idx = u30()
753 name_idx = u30()
e0df6211
PH
754 multinames.append(constant_strings[name_idx])
755 else:
756 multinames.append('[MULTINAME kind: %d]' % kind)
757 for _c2 in range(MULTINAME_SIZES[kind]):
ba552f54 758 _ = u30()
e0df6211
PH
759
760 # Methods
ba552f54 761 method_count = u30()
e0df6211
PH
762 MethodInfo = collections.namedtuple(
763 'MethodInfo',
764 ['NEED_ARGUMENTS', 'NEED_REST'])
765 method_infos = []
766 for method_id in range(method_count):
ba552f54
PH
767 param_count = u30()
768 _ = u30() # return type
e0df6211 769 for _ in range(param_count):
ba552f54
PH
770 _ = u30() # param type
771 _ = u30() # name index (always 0 for youtube)
772 flags = read_byte()
e0df6211
PH
773 if flags & 0x08 != 0:
774 # Options present
ba552f54 775 option_count = u30()
e0df6211 776 for c in range(option_count):
ba552f54
PH
777 _ = u30() # val
778 _ = read_bytes(1) # kind
e0df6211
PH
779 if flags & 0x80 != 0:
780 # Param names present
781 for _ in range(param_count):
ba552f54 782 _ = u30() # param name
e0df6211
PH
783 mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
784 method_infos.append(mi)
785
786 # Metadata
ba552f54 787 metadata_count = u30()
e0df6211 788 for _c in range(metadata_count):
ba552f54
PH
789 _ = u30() # name
790 item_count = u30()
e0df6211 791 for _c2 in range(item_count):
ba552f54
PH
792 _ = u30() # key
793 _ = u30() # value
794
795 def parse_traits_info():
796 trait_name_idx = u30()
797 kind_full = read_byte()
e0df6211
PH
798 kind = kind_full & 0x0f
799 attrs = kind_full >> 4
800 methods = {}
801 if kind in [0x00, 0x06]: # Slot or Const
ba552f54
PH
802 _ = u30() # Slot id
803 type_name_idx = u30()
804 vindex = u30()
e0df6211 805 if vindex != 0:
ba552f54 806 _ = read_byte() # vkind
e0df6211 807 elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
ba552f54
PH
808 _ = u30() # disp_id
809 method_idx = u30()
e0df6211
PH
810 methods[multinames[trait_name_idx]] = method_idx
811 elif kind == 0x04: # Class
ba552f54
PH
812 _ = u30() # slot_id
813 _ = u30() # classi
e0df6211 814 elif kind == 0x05: # Function
ba552f54
PH
815 _ = u30() # slot_id
816 function_idx = u30()
e0df6211
PH
817 methods[function_idx] = multinames[trait_name_idx]
818 else:
819 raise ExtractorError(u'Unsupported trait kind %d' % kind)
820
821 if attrs & 0x4 != 0: # Metadata present
ba552f54 822 metadata_count = u30()
e0df6211 823 for _c3 in range(metadata_count):
ba552f54 824 _ = u30()
e0df6211 825
ba552f54 826 return methods
e0df6211
PH
827
828 # Classes
829 TARGET_CLASSNAME = u'SignatureDecipher'
830 searched_idx = multinames.index(TARGET_CLASSNAME)
831 searched_class_id = None
ba552f54 832 class_count = u30()
e0df6211 833 for class_id in range(class_count):
ba552f54 834 name_idx = u30()
e0df6211
PH
835 if name_idx == searched_idx:
836 # We found the class we're looking for!
837 searched_class_id = class_id
ba552f54
PH
838 _ = u30() # super_name idx
839 flags = read_byte()
e0df6211 840 if flags & 0x08 != 0: # Protected namespace is present
ba552f54
PH
841 protected_ns_idx = u30()
842 intrf_count = u30()
e0df6211 843 for _c2 in range(intrf_count):
ba552f54
PH
844 _ = u30()
845 _ = u30() # iinit
846 trait_count = u30()
e0df6211 847 for _c2 in range(trait_count):
ba552f54 848 _ = parse_traits_info()
e0df6211
PH
849
850 if searched_class_id is None:
851 raise ExtractorError(u'Target class %r not found' %
852 TARGET_CLASSNAME)
853
854 method_names = {}
855 method_idxs = {}
856 for class_id in range(class_count):
ba552f54
PH
857 _ = u30() # cinit
858 trait_count = u30()
e0df6211 859 for _c2 in range(trait_count):
ba552f54 860 trait_methods = parse_traits_info()
e0df6211
PH
861 if class_id == searched_class_id:
862 method_names.update(trait_methods.items())
863 method_idxs.update(dict(
864 (idx, name)
865 for name, idx in trait_methods.items()))
866
867 # Scripts
ba552f54 868 script_count = u30()
e0df6211 869 for _c in range(script_count):
ba552f54
PH
870 _ = u30() # init
871 trait_count = u30()
e0df6211 872 for _c2 in range(trait_count):
ba552f54 873 _ = parse_traits_info()
e0df6211
PH
874
875 # Method bodies
ba552f54 876 method_body_count = u30()
e0df6211
PH
877 Method = collections.namedtuple('Method', ['code', 'local_count'])
878 methods = {}
879 for _c in range(method_body_count):
ba552f54
PH
880 method_idx = u30()
881 max_stack = u30()
882 local_count = u30()
883 init_scope_depth = u30()
884 max_scope_depth = u30()
885 code_length = u30()
886 code = read_bytes(code_length)
e0df6211 887 if method_idx in method_idxs:
ba552f54 888 m = Method(code, local_count)
e0df6211 889 methods[method_idxs[method_idx]] = m
ba552f54 890 exception_count = u30()
e0df6211 891 for _c2 in range(exception_count):
ba552f54
PH
892 _ = u30() # from
893 _ = u30() # to
894 _ = u30() # target
895 _ = u30() # exc_type
896 _ = u30() # var_name
897 trait_count = u30()
e0df6211 898 for _c2 in range(trait_count):
ba552f54 899 _ = parse_traits_info()
e0df6211 900
ba552f54 901 assert p + code_reader.tell() == len(code_tag)
e0df6211
PH
902 assert len(methods) == len(method_idxs)
903
904 method_pyfunctions = {}
905
906 def extract_function(func_name):
907 if func_name in method_pyfunctions:
908 return method_pyfunctions[func_name]
909 if func_name not in methods:
910 raise ExtractorError(u'Cannot find function %r' % func_name)
911 m = methods[func_name]
912
913 def resfunc(args):
e0df6211
PH
914 registers = ['(this)'] + list(args) + [None] * m.local_count
915 stack = []
916 coder = io.BytesIO(m.code)
917 while True:
918 opcode = struct.unpack('!B', coder.read(1))[0]
a7177865 919 if opcode == 36: # pushbyte
e0df6211
PH
920 v = struct.unpack('!B', coder.read(1))[0]
921 stack.append(v)
922 elif opcode == 44: # pushstring
923 idx = u30(coder)
924 stack.append(constant_strings[idx])
925 elif opcode == 48: # pushscope
926 # We don't implement the scope register, so we'll just
927 # ignore the popped value
928 stack.pop()
929 elif opcode == 70: # callproperty
930 index = u30(coder)
931 mname = multinames[index]
932 arg_count = u30(coder)
933 args = list(reversed(
934 [stack.pop() for _ in range(arg_count)]))
935 obj = stack.pop()
936 if mname == u'split':
937 assert len(args) == 1
938 assert isinstance(args[0], compat_str)
939 assert isinstance(obj, compat_str)
940 if args[0] == u'':
941 res = list(obj)
942 else:
943 res = obj.split(args[0])
944 stack.append(res)
a7177865
PH
945 elif mname == u'slice':
946 assert len(args) == 1
947 assert isinstance(args[0], int)
948 assert isinstance(obj, list)
949 res = obj[args[0]:]
950 stack.append(res)
951 elif mname == u'join':
952 assert len(args) == 1
953 assert isinstance(args[0], compat_str)
954 assert isinstance(obj, list)
955 res = args[0].join(obj)
956 stack.append(res)
e0df6211
PH
957 elif mname in method_pyfunctions:
958 stack.append(method_pyfunctions[mname](args))
959 else:
960 raise NotImplementedError(
961 u'Unsupported property %r on %r'
962 % (mname, obj))
a7177865
PH
963 elif opcode == 72: # returnvalue
964 res = stack.pop()
965 return res
966 elif opcode == 79: # callpropvoid
967 index = u30(coder)
968 mname = multinames[index]
969 arg_count = u30(coder)
970 args = list(reversed(
971 [stack.pop() for _ in range(arg_count)]))
972 obj = stack.pop()
973 if mname == u'reverse':
974 assert isinstance(obj, list)
975 obj.reverse()
976 else:
977 raise NotImplementedError(
978 u'Unsupported (void) property %r on %r'
979 % (mname, obj))
e0df6211
PH
980 elif opcode == 93: # findpropstrict
981 index = u30(coder)
982 mname = multinames[index]
983 res = extract_function(mname)
984 stack.append(res)
985 elif opcode == 97: # setproperty
986 index = u30(coder)
987 value = stack.pop()
988 idx = stack.pop()
989 obj = stack.pop()
990 assert isinstance(obj, list)
991 assert isinstance(idx, int)
992 obj[idx] = value
993 elif opcode == 98: # getlocal
994 index = u30(coder)
995 stack.append(registers[index])
996 elif opcode == 99: # setlocal
997 index = u30(coder)
998 value = stack.pop()
999 registers[index] = value
1000 elif opcode == 102: # getproperty
1001 index = u30(coder)
1002 pname = multinames[index]
1003 if pname == u'length':
1004 obj = stack.pop()
1005 assert isinstance(obj, list)
1006 stack.append(len(obj))
1007 else: # Assume attribute access
1008 idx = stack.pop()
1009 assert isinstance(idx, int)
1010 obj = stack.pop()
1011 assert isinstance(obj, list)
1012 stack.append(obj[idx])
1013 elif opcode == 128: # coerce
1014 _ = u30(coder)
1015 elif opcode == 133: # coerce_s
1016 assert isinstance(stack[-1], (type(None), compat_str))
1017 elif opcode == 164: # modulo
1018 value2 = stack.pop()
1019 value1 = stack.pop()
1020 res = value1 % value2
1021 stack.append(res)
a7177865
PH
1022 elif opcode == 208: # getlocal_0
1023 stack.append(registers[0])
1024 elif opcode == 209: # getlocal_1
1025 stack.append(registers[1])
1026 elif opcode == 210: # getlocal_2
1027 stack.append(registers[2])
1028 elif opcode == 211: # getlocal_3
1029 stack.append(registers[3])
e0df6211
PH
1030 elif opcode == 214: # setlocal_2
1031 registers[2] = stack.pop()
1032 elif opcode == 215: # setlocal_3
1033 registers[3] = stack.pop()
1034 else:
1035 raise NotImplementedError(
1036 u'Unsupported opcode %d' % opcode)
1037
1038 method_pyfunctions[func_name] = resfunc
1039 return resfunc
1040
1041 initial_function = extract_function(u'decipher')
1042 return lambda s: initial_function([s])
1043
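    # The interpreter above deliberately models only the handful of AVM2
    # opcodes the player's 'decipher' routine has needed so far: string
    # split/join, array slice/reverse, modulo arithmetic and element swaps
    # via getproperty/setproperty, plus calls to sibling helper methods.
    # Anything else raises NotImplementedError, so a new player version fails
    # loudly instead of producing a silently wrong signature.
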
83799698 1044 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 1045 """Turn the encrypted s field into a working signature"""
6b37f0be 1046
83799698 1047 if player_url is not None:
e0df6211 1048 try:
83799698
PH
1049 if player_url not in self._player_cache:
1050 func = self._extract_signature_function(
c4417ddb 1051 video_id, player_url, len(s)
e0df6211 1052 )
83799698 1053 self._player_cache[player_url] = func
edf3e38e
PH
1054 func = self._player_cache[player_url]
1055 if self._downloader.params.get('youtube_print_sig_code'):
1056 self._print_sig_code(func, len(s))
1057 return func(s)
e0df6211
PH
1058 except Exception as e:
1059 tb = traceback.format_exc()
83799698
PH
1060 self._downloader.report_warning(
1061 u'Automatic signature extraction failed: ' + tb)
e0df6211 1062
83799698
PH
1063 self._downloader.report_warning(
1064 u'Warning: Falling back to static signature algorithm')
2f2ffea9
PH
1065 return self._static_decrypt_signature(
1066 s, video_id, player_url, age_gate)
e0df6211 1067
2f2ffea9 1068 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
e0df6211
PH
1069 if age_gate:
1070 # The videos with age protection use another player, so the
1071 # algorithms can be different.
1072 if len(s) == 86:
1073 return s[2:63] + s[82] + s[64:82] + s[63]
1074
4ba146f3
PH
1075 if len(s) == 93:
1076 return s[86:29:-1] + s[88] + s[28:5:-1]
1077 elif len(s) == 92:
444b1165 1078 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
4ba146f3
PH
1079 elif len(s) == 91:
1080 return s[84:27:-1] + s[86] + s[26:5:-1]
444b1165
JMF
1081 elif len(s) == 90:
1082 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
8a9d86a2 1083 elif len(s) == 89:
1084 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
444b1165 1085 elif len(s) == 88:
3e223834 1086 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
be547e1d 1087 elif len(s) == 87:
3a725669 1088 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
be547e1d 1089 elif len(s) == 86:
1cf911bc 1090 return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53]
be547e1d 1091 elif len(s) == 85:
6ae8ee3f 1092 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
be547e1d 1093 elif len(s) == 84:
23b00bc0 1094 return s[81:36:-1] + s[0] + s[35:2:-1]
be547e1d 1095 elif len(s) == 83:
e1842025 1096 return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]
be547e1d 1097 elif len(s) == 82:
ce85f022 1098 return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54]
be547e1d 1099 elif len(s) == 81:
aedd6bb9 1100 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
066090dd
JMF
1101 elif len(s) == 80:
1102 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
5c468ca8
JMF
1103 elif len(s) == 79:
1104 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
be547e1d
PH
1105
1106 else:
1107 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
c5e8d7af 1108
    def _decrypt_signature_age_gate(self, s):
        # The videos with age protection use another player, so the algorithms
        # can be different.
        if len(s) == 86:
            return s[2:63] + s[82] + s[64:82] + s[63]
        else:
            # Fall back to the other algorithms
            return self._decrypt_signature(s)

de7f3446 1118 def _get_available_subtitles(self, video_id):
de7f3446 1119 try:
7fad1c63
JMF
1120 sub_list = self._download_webpage(
1121 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1122 video_id, note=False)
1123 except ExtractorError as err:
de7f3446
JMF
1124 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
1125 return {}
1126 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
1127
1128 sub_lang_list = {}
1129 for l in lang_list:
1130 lang = l[1]
1131 params = compat_urllib_parse.urlencode({
1132 'lang': lang,
1133 'v': video_id,
1134 'fmt': self._downloader.params.get('subtitlesformat'),
1135 })
1136 url = u'http://www.youtube.com/api/timedtext?' + params
1137 sub_lang_list[lang] = url
1138 if not sub_lang_list:
1139 self._downloader.report_warning(u'video doesn\'t have subtitles')
1140 return {}
1141 return sub_lang_list
1142
055e6f36 1143 def _get_available_automatic_caption(self, video_id, webpage):
de7f3446
JMF
1144 """We need the webpage for getting the captions url, pass it as an
1145 argument to speed up the process."""
de7f3446
JMF
1146 sub_format = self._downloader.params.get('subtitlesformat')
1147 self.to_screen(u'%s: Looking for automatic captions' % video_id)
1148 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
055e6f36 1149 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
de7f3446
JMF
1150 if mobj is None:
1151 self._downloader.report_warning(err_msg)
1152 return {}
1153 player_config = json.loads(mobj.group(1))
1154 try:
1155 args = player_config[u'args']
1156 caption_url = args[u'ttsurl']
1157 timestamp = args[u'timestamp']
055e6f36
JMF
1158 # We get the available subtitles
1159 list_params = compat_urllib_parse.urlencode({
1160 'type': 'list',
1161 'tlangs': 1,
1162 'asrs': 1,
de7f3446 1163 })
055e6f36
JMF
1164 list_url = caption_url + '&' + list_params
1165 list_page = self._download_webpage(list_url, video_id)
1166 caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
e3dc22ca
JMF
1167 original_lang_node = caption_list.find('track')
1168 if original_lang_node.attrib.get('kind') != 'asr' :
1169 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
1170 return {}
1171 original_lang = original_lang_node.attrib['lang_code']
055e6f36
JMF
1172
1173 sub_lang_list = {}
1174 for lang_node in caption_list.findall('target'):
1175 sub_lang = lang_node.attrib['lang_code']
1176 params = compat_urllib_parse.urlencode({
1177 'lang': original_lang,
1178 'tlang': sub_lang,
1179 'fmt': sub_format,
1180 'ts': timestamp,
1181 'kind': 'asr',
1182 })
1183 sub_lang_list[sub_lang] = caption_url + '&' + params
1184 return sub_lang_list
        # An ExtractorError can be raised by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
1190
c5e8d7af
PH
1191 def _print_formats(self, formats):
1192 print('Available formats:')
1193 for x in formats:
03cc7c20
JMF
1194 print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
1195 self._video_dimensions.get(x, '???'),
836a086c 1196 ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
c5e8d7af
PH
1197
1198 def _extract_id(self, url):
1199 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1200 if mobj is None:
1201 raise ExtractorError(u'Invalid URL: %s' % url)
1202 video_id = mobj.group(2)
1203 return video_id
1204
1d043b93
JMF
1205 def _get_video_url_list(self, url_map):
1206 """
1207 Transform a dictionary in the format {itag:url} to a list of (itag, url)
1208 with the requested formats.
1209 """
1210 req_format = self._downloader.params.get('format', None)
1211 format_limit = self._downloader.params.get('format_limit', None)
1212 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1213 if format_limit is not None and format_limit in available_formats:
1214 format_list = available_formats[available_formats.index(format_limit):]
1215 else:
1216 format_list = available_formats
1217 existing_formats = [x for x in format_list if x in url_map]
1218 if len(existing_formats) == 0:
1219 raise ExtractorError(u'no known formats available for video')
1220 if self._downloader.params.get('listformats', None):
1221 self._print_formats(existing_formats)
1222 return
1223 if req_format is None or req_format == 'best':
1224 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1225 elif req_format == 'worst':
1226 video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
1227 elif req_format in ('-1', 'all'):
1228 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1229 else:
            # Specific formats. We pick the first in a slash-delimited sequence.
            # A format can be specified as an itag or as 'mp4', 'flv' etc.; for a
            # container name we pick the highest quality itag available in that
            # container. For example:
            #   if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
            #   if '1/mp4/3/4' is requested and '1' and '5' (an mp4) are available, we pick '1'.
            #   if '1/mp4/3/4' is requested and '4' and '5' (an mp4) are available, we pick '5'.
1d043b93
JMF
1236 req_formats = req_format.split('/')
1237 video_url_list = None
1238 for rf in req_formats:
1239 if rf in url_map:
1240 video_url_list = [(rf, url_map[rf])]
1241 break
bdc6b3fc
AZ
1242 if rf in self._video_formats_map:
1243 for srf in self._video_formats_map[rf]:
1244 if srf in url_map:
1245 video_url_list = [(srf, url_map[srf])]
1246 break
1247 else:
1248 continue
1249 break
1d043b93
JMF
1250 if video_url_list is None:
1251 raise ExtractorError(u'requested format not available')
1252 return video_url_list
1253
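    # A small (hypothetical) example of the selection logic above: with
    #
    #   url_map = {'22': 'http://e.invalid/22', '18': 'http://e.invalid/18'}
    #
    # a request for '37/22/18' yields [('22', ...)] (the first itag that
    # exists), while a request for 'mp4' walks _video_formats_map['mp4'] and
    # likewise ends up with itag '22'.
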
1254 def _extract_from_m3u8(self, manifest_url, video_id):
1255 url_map = {}
1256 def _get_urls(_manifest):
1257 lines = _manifest.split('\n')
1258 urls = filter(lambda l: l and not l.startswith('#'),
1259 lines)
1260 return urls
1261 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1262 formats_urls = _get_urls(manifest)
1263 for format_url in formats_urls:
890f62e8 1264 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1d043b93
JMF
1265 url_map[itag] = format_url
1266 return url_map
1267
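    # Roughly, a (hypothetical) HLS variant playlist such as
    #
    #   #EXTM3U
    #   #EXT-X-STREAM-INF:BANDWIDTH=1280000
    #   http://manifest.invalid/path/itag/22/index.m3u8
    #
    # is reduced by _extract_from_m3u8 to a single-entry url_map keyed by '22',
    # the itag being read from the 'itag/<n>/' component of each variant URL.
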
c5e8d7af 1268 def _real_extract(self, url):
d7f44b5b
PH
1269 if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
1270 self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')
1271
c5e8d7af
PH
1272 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1273 mobj = re.search(self._NEXT_URL_RE, url)
1274 if mobj:
1275 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
1276 video_id = self._extract_id(url)
1277
1278 # Get video webpage
1279 self.report_video_webpage_download(video_id)
1280 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
1281 request = compat_urllib_request.Request(url)
1282 try:
1283 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
1284 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1285 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
1286
1287 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
1288
1289 # Attempt to extract SWF player URL
e0df6211 1290 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
c5e8d7af
PH
1291 if mobj is not None:
1292 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1293 else:
1294 player_url = None
1295
1296 # Get video info
1297 self.report_video_info_webpage_download(video_id)
c108eb73
JMF
1298 if re.search(r'player-age-gate-content">', video_webpage) is not None:
1299 self.report_age_confirmation()
1300 age_gate = True
            # We simulate access to the video from www.youtube.com/v/{video_id}
            # this page can be viewed without logging into Youtube
1303 data = compat_urllib_parse.urlencode({'video_id': video_id,
1304 'el': 'embedded',
1305 'gl': 'US',
1306 'hl': 'en',
1307 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1308 'asv': 3,
1309 'sts':'1588',
1310 })
1311 video_info_url = 'https://www.youtube.com/get_video_info?' + data
c5e8d7af
PH
1312 video_info_webpage = self._download_webpage(video_info_url, video_id,
1313 note=False,
1314 errnote='unable to download video info webpage')
1315 video_info = compat_parse_qs(video_info_webpage)
c108eb73
JMF
1316 else:
1317 age_gate = False
1318 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1319 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1320 % (video_id, el_type))
1321 video_info_webpage = self._download_webpage(video_info_url, video_id,
1322 note=False,
1323 errnote='unable to download video info webpage')
1324 video_info = compat_parse_qs(video_info_webpage)
1325 if 'token' in video_info:
1326 break
c5e8d7af
PH
1327 if 'token' not in video_info:
1328 if 'reason' in video_info:
9a82b238 1329 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
c5e8d7af
PH
1330 else:
1331 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
1332
1333 # Check for "rental" videos
1334 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1335 raise ExtractorError(u'"rental" videos not supported')
1336
1337 # Start extracting information
1338 self.report_information_extraction(video_id)
1339
1340 # uploader
1341 if 'author' not in video_info:
1342 raise ExtractorError(u'Unable to extract uploader name')
1343 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
1344
1345 # uploader_id
1346 video_uploader_id = None
1347 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1348 if mobj is not None:
1349 video_uploader_id = mobj.group(1)
1350 else:
1351 self._downloader.report_warning(u'unable to extract uploader nickname')
1352
1353 # title
1354 if 'title' not in video_info:
1355 raise ExtractorError(u'Unable to extract video title')
1356 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
1357
1358 # thumbnail image
7763b04e
JMF
1359 # We try first to get a high quality image:
1360 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1361 video_webpage, re.DOTALL)
1362 if m_thumb is not None:
1363 video_thumbnail = m_thumb.group(1)
1364 elif 'thumbnail_url' not in video_info:
c5e8d7af
PH
1365 self._downloader.report_warning(u'unable to extract video thumbnail')
1366 video_thumbnail = ''
1367 else: # don't panic if we can't find it
1368 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
1369
1370 # upload date
1371 upload_date = None
1372 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1373 if mobj is not None:
1374 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1375 upload_date = unified_strdate(upload_date)
1376
1377 # description
1378 video_description = get_element_by_id("eow-description", video_webpage)
1379 if video_description:
1380 video_description = clean_html(video_description)
1381 else:
1382 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1383 if fd_mobj:
1384 video_description = unescapeHTML(fd_mobj.group(1))
1385 else:
1386 video_description = u''
1387
1388 # subtitles
d82134c3 1389 video_subtitles = self.extract_subtitles(video_id, video_webpage)
c5e8d7af 1390
c5e8d7af 1391 if self._downloader.params.get('listsubtitles', False):
d665f8d3 1392 self._list_available_subtitles(video_id, video_webpage)
c5e8d7af
PH
1393 return
1394
1395 if 'length_seconds' not in video_info:
1396 self._downloader.report_warning(u'unable to extract video duration')
1397 video_duration = ''
1398 else:
1399 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
1400
c5e8d7af 1401 # Decide which formats to download
c5e8d7af
PH
1402
1403 try:
1404 mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
50be92c1
PH
1405 if not mobj:
1406 raise ValueError('Could not find vevo ID')
c5e8d7af
PH
1407 info = json.loads(mobj.group(1))
1408 args = info['args']
            # An easy way to know if the 's' value is in url_encoded_fmt_stream_map;
            # if it is, these signatures are encrypted
            m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
1412 if m_s is not None:
1413 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
c5e8d7af 1414 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
cde846b3 1415 m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
b7a68384 1416 if m_s is not None:
37b6d5f6
AZ
1417 if 'url_encoded_fmt_stream_map' in video_info:
1418 video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
1419 else:
1420 video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
211fbc13 1421 elif 'adaptive_fmts' in video_info:
37b6d5f6
AZ
1422 if 'url_encoded_fmt_stream_map' in video_info:
1423 video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
1424 else:
1425 video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']
c5e8d7af
PH
1426 except ValueError:
1427 pass
1428
1429 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1430 self.report_rtmp_download()
1431 video_url_list = [(None, video_info['conn'][0])]
1432 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
a7055eb9
JMF
1433 if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
1434 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
c5e8d7af
PH
1435 url_map = {}
1436 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
1437 url_data = compat_parse_qs(url_data_str)
1438 if 'itag' in url_data and 'url' in url_data:
1439 url = url_data['url'][0]
1440 if 'sig' in url_data:
1441 url += '&signature=' + url_data['sig'][0]
1442 elif 's' in url_data:
e0df6211 1443 encrypted_sig = url_data['s'][0]
769fda3c 1444 if self._downloader.params.get('verbose'):
c108eb73 1445 if age_gate:
83799698
PH
1446 player_version = self._search_regex(
1447 r'-(.+)\.swf$',
1448 player_url if player_url else None,
e0df6211
PH
1449 'flash player', fatal=False)
1450 player_desc = 'flash player %s' % player_version
c108eb73 1451 else:
83799698
PH
1452 player_version = self._search_regex(
1453 r'html5player-(.+?)\.js', video_webpage,
c108eb73 1454 'html5 player', fatal=False)
e0df6211
PH
1455 player_desc = u'html5 player %s' % player_version
1456
1457 parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
5a76c651 1458 self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
e0df6211
PH
1459 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
1460
83799698 1461 if not age_gate:
e0df6211
PH
1462 jsplayer_url_json = self._search_regex(
1463 r'"assets":.+?"js":\s*("[^"]+")',
1464 video_webpage, u'JS player URL')
83799698 1465 player_url = json.loads(jsplayer_url_json)
e0df6211 1466
83799698
PH
1467 signature = self._decrypt_signature(
1468 encrypted_sig, video_id, player_url, age_gate)
c5e8d7af
PH
1469 url += '&signature=' + signature
1470 if 'ratebypass' not in url:
1471 url += '&ratebypass=yes'
1472 url_map[url_data['itag'][0]] = url
1d043b93
JMF
1473 video_url_list = self._get_video_url_list(url_map)
1474 if not video_url_list:
c5e8d7af 1475 return
1d043b93
JMF
1476 elif video_info.get('hlsvp'):
1477 manifest_url = video_info['hlsvp'][0]
1478 url_map = self._extract_from_m3u8(manifest_url, video_id)
1479 video_url_list = self._get_video_url_list(url_map)
1480 if not video_url_list:
1481 return
1482
c5e8d7af
PH
1483 else:
1484 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
1485
1486 results = []
1487 for format_param, video_real_url in video_url_list:
1488 # Extension
1489 video_extension = self._video_extensions.get(format_param, 'flv')
1490
03cc7c20
JMF
1491 video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
1492 self._video_dimensions.get(format_param, '???'),
836a086c 1493 ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')
c5e8d7af
PH
1494
1495 results.append({
1496 'id': video_id,
1497 'url': video_real_url,
1498 'uploader': video_uploader,
1499 'uploader_id': video_uploader_id,
1500 'upload_date': upload_date,
1501 'title': video_title,
1502 'ext': video_extension,
1503 'format': video_format,
1504 'thumbnail': video_thumbnail,
1505 'description': video_description,
1506 'player_url': player_url,
1507 'subtitles': video_subtitles,
1508 'duration': video_duration
1509 })
1510 return results
1511
1512class YoutubePlaylistIE(InfoExtractor):
0f818663 1513 IE_DESC = u'YouTube.com playlists'
c5e8d7af
PH
1514 _VALID_URL = r"""(?:
1515 (?:https?://)?
1516 (?:\w+\.)?
1517 youtube\.com/
1518 (?:
1519 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1520 \? (?:.*?&)*? (?:p|a|list)=
1521 | p/
1522 )
c626a3d9 1523 ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
c5e8d7af
PH
1524 .*
1525 |
c626a3d9 1526 ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
c5e8d7af
PH
1527 )"""
1528 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
1529 _MAX_RESULTS = 50
1530 IE_NAME = u'youtube:playlist'
1531
1532 @classmethod
1533 def suitable(cls, url):
1534 """Receives a URL and returns True if suitable for this IE."""
1535 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1536
1537 def _real_extract(self, url):
1538 # Extract playlist id
1539 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1540 if mobj is None:
1541 raise ExtractorError(u'Invalid URL: %s' % url)
1542
1543 # Download playlist videos from API
1544 playlist_id = mobj.group(1) or mobj.group(2)
c5e8d7af
PH
1545 videos = []
1546
755eb032 1547 for page_num in itertools.count(1):
771822eb
JMF
1548 start_index = self._MAX_RESULTS * (page_num - 1) + 1
1549 if start_index >= 1000:
1550 self._downloader.report_warning(u'Max number of results reached')
1551 break
1552 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
c5e8d7af
PH
1553 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
1554
1555 try:
1556 response = json.loads(page)
1557 except ValueError as err:
1558 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
1559
1560 if 'feed' not in response:
1561 raise ExtractorError(u'Got a malformed response from YouTube API')
1562 playlist_title = response['feed']['title']['$t']
1563 if 'entry' not in response['feed']:
1564 # Number of videos is a multiple of self._MAX_RESULTS
1565 break
1566
1567 for entry in response['feed']['entry']:
1568 index = entry['yt$position']['$t']
c215217e
JMF
1569 if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
1570 videos.append((
1571 index,
1572 'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']
1573 ))
c5e8d7af 1574
c5e8d7af
PH
1575 videos = [v[1] for v in sorted(videos)]
1576
20c3893f 1577 url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
c5e8d7af
PH
1578 return [self.playlist_result(url_results, playlist_id, playlist_title)]
1579
1580
1581class YoutubeChannelIE(InfoExtractor):
0f818663 1582 IE_DESC = u'YouTube.com channels'
c5e8d7af
PH
1583 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1584 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1585 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
252580c5 1586 _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
c5e8d7af
PH
1587 IE_NAME = u'youtube:channel'
1588
1589 def extract_videos_from_page(self, page):
1590 ids_in_page = []
1591 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1592 if mobj.group(1) not in ids_in_page:
1593 ids_in_page.append(mobj.group(1))
1594 return ids_in_page
1595
1596 def _real_extract(self, url):
1597 # Extract channel id
1598 mobj = re.match(self._VALID_URL, url)
1599 if mobj is None:
1600 raise ExtractorError(u'Invalid URL: %s' % url)
1601
1602 # Download channel page
1603 channel_id = mobj.group(1)
1604 video_ids = []
1605 pagenum = 1
1606
1607 url = self._TEMPLATE_URL % (channel_id, pagenum)
1608 page = self._download_webpage(url, channel_id,
1609 u'Downloading page #%s' % pagenum)
1610
1611 # Extract video identifiers
1612 ids_in_page = self.extract_videos_from_page(page)
1613 video_ids.extend(ids_in_page)
1614
1615 # Download any subsequent channel pages using the json-based channel_ajax query
1616 if self._MORE_PAGES_INDICATOR in page:
755eb032 1617 for pagenum in itertools.count(1):
c5e8d7af
PH
1618 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1619 page = self._download_webpage(url, channel_id,
1620 u'Downloading page #%s' % pagenum)
1621
1622 page = json.loads(page)
1623
1624 ids_in_page = self.extract_videos_from_page(page['content_html'])
1625 video_ids.extend(ids_in_page)
1626
1627 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1628 break
1629
1630 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1631
1632 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
20c3893f 1633 url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
c5e8d7af
PH
1634 return [self.playlist_result(url_entries, channel_id)]
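# Editor's sketch (not part of the original file): the de-duplication done by
# extract_videos_from_page() above, run over a made-up HTML fragment.
import re
_sample_html = ('<a href="/watch?v=abc123DEF-_&feature=plcp">a</a>'
                '<a href="/watch?v=abc123DEF-_">dup</a>'
                '<a href="/watch?v=zyx987WVU42">b</a>')
_ids = []
for _m in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', _sample_html):
    if _m.group(1) not in _ids:   # same membership test as in the method
        _ids.append(_m.group(1))
# _ids == ['abc123DEF-_', 'zyx987WVU42']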
1635
1636
1637class YoutubeUserIE(InfoExtractor):
0f818663 1638 IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
faab1d38 1639 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
c5e8d7af
PH
1640 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1641 _GDATA_PAGE_SIZE = 50
fd9cf738 1642 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
c5e8d7af
PH
1643 IE_NAME = u'youtube:user'
1644
e3ea4790 1645 @classmethod
f4b05232 1646 def suitable(cls, url):
e3ea4790
JMF
1647 # Don't return True if the url can be extracted with another youtube
1648 # extractor; the regex is too permissive and it would match otherwise.
1649 other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
1650 if any(ie.suitable(url) for ie in other_ies): return False
f4b05232
JMF
1651 else: return super(YoutubeUserIE, cls).suitable(url)
1652
c5e8d7af
PH
1653 def _real_extract(self, url):
1654 # Extract username
1655 mobj = re.match(self._VALID_URL, url)
1656 if mobj is None:
1657 raise ExtractorError(u'Invalid URL: %s' % url)
1658
1659 username = mobj.group(1)
1660
1661 # Download video ids using YouTube Data API. Result size per
1662 # query is limited (currently to 50 videos) so we need to query
1663 # page by page until there are no video ids - it means we got
1664 # all of them.
1665
1666 video_ids = []
c5e8d7af 1667
755eb032 1668 for pagenum in itertools.count(0):
c5e8d7af
PH
1669 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1670
1671 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1672 page = self._download_webpage(gdata_url, username,
1673 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1674
fd9cf738
JMF
1675 try:
1676 response = json.loads(page)
1677 except ValueError as err:
1678 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
71c82637
JMF
1679 if 'entry' not in response['feed']:
1680 # Number of videos is a multiple of self._GDATA_PAGE_SIZE
1681 break
fd9cf738 1682
c5e8d7af
PH
1683 # Extract video identifiers
1684 ids_in_page = []
fd9cf738
JMF
1685 for entry in response['feed']['entry']:
1686 ids_in_page.append(entry['id']['$t'].split('/')[-1])
c5e8d7af
PH
1687 video_ids.extend(ids_in_page)
1688
1689 # A small optimization: if the current page is not
1690 # "full", i.e. it contains fewer than _GDATA_PAGE_SIZE video
1691 # ids, we can assume it is the last one - there are no
1692 # more ids on further pages, so there is no need to query
1693 # again.
1694
1695 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1696 break
1697
c5e8d7af 1698 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
20c3893f 1699 url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
c5e8d7af 1700 return [self.playlist_result(url_results, playlist_title = username)]
b05654f0
PH
1701
1702class YoutubeSearchIE(SearchInfoExtractor):
0f818663 1703 IE_DESC = u'YouTube.com searches'
b05654f0
PH
1704 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1705 _MAX_RESULTS = 1000
1706 IE_NAME = u'youtube:search'
1707 _SEARCH_KEY = 'ytsearch'
1708
1709 def report_download_page(self, query, pagenum):
1710 """Report attempt to download search page with given number."""
1711 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1712
1713 def _get_n_results(self, query, n):
1714 """Get a specified number of results for a query"""
1715
1716 video_ids = []
1717 pagenum = 0
1718 limit = n
1719
1720 while (50 * pagenum) < limit:
1721 self.report_download_page(query, pagenum+1)
1722 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1723 request = compat_urllib_request.Request(result_url)
1724 try:
1725 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1726 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1727 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1728 api_response = json.loads(data)['data']
1729
1730 if 'items' not in api_response:
1731 raise ExtractorError(u'[youtube] No video results')
1732
1733 new_ids = list(video['id'] for video in api_response['items'])
1734 video_ids += new_ids
1735
1736 limit = min(n, api_response['totalItems'])
1737 pagenum += 1
1738
1739 if len(video_ids) > n:
1740 video_ids = video_ids[:n]
1741 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1742 return self.playlist_result(videos, query)
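# Editor's sketch (not part of the original file): how _get_n_results() above
# caps the number of results. 'n' and 'totalItems' are made-up numbers.
_n = 120                   # requested results
_total_items = 73          # pretend the API reports 73 matches in total
_limit = _n
_pagenum = 0
_fetched = 0
while (50 * _pagenum) < _limit:
    _fetched += min(50, _total_items - _fetched)   # one API page of up to 50 ids
    _limit = min(_n, _total_items)                 # shrink once totalItems is known
    _pagenum += 1
# _fetched == 73: the loop stops after two pages because 50 * 2 >= 73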
75dff0ee
JMF
1743
1744
1745class YoutubeShowIE(InfoExtractor):
0f818663 1746 IE_DESC = u'YouTube.com (multi-season) shows'
75dff0ee
JMF
1747 _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
1748 IE_NAME = u'youtube:show'
1749
1750 def _real_extract(self, url):
1751 mobj = re.match(self._VALID_URL, url)
1752 show_name = mobj.group(1)
1753 webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
1754 # There's one playlist for each season of the show
1755 m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
1756 self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
1757 return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist') for season in m_seasons]
04cc9617
JMF
1758
1759
b2e8bc1b 1760class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
d7ae0639
JMF
1761 """
1762 Base class for extractors that fetch info from
1763 http://www.youtube.com/feed_ajax
1764 Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
1765 """
b2e8bc1b 1766 _LOGIN_REQUIRED = True
04cc9617 1767 _PAGING_STEP = 30
43ba5456
JMF
1768 # when True, use action_load_personal_feed instead of action_load_system_feed
1769 _PERSONAL_FEED = False
04cc9617 1770
d7ae0639
JMF
1771 @property
1772 def _FEED_TEMPLATE(self):
43ba5456
JMF
1773 action = 'action_load_system_feed'
1774 if self._PERSONAL_FEED:
1775 action = 'action_load_personal_feed'
1776 return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)
d7ae0639
JMF
1777
1778 @property
1779 def IE_NAME(self):
1780 return u'youtube:%s' % self._FEED_NAME
04cc9617 1781
81f0259b 1782 def _real_initialize(self):
b2e8bc1b 1783 self._login()
81f0259b 1784
04cc9617
JMF
1785 def _real_extract(self, url):
1786 feed_entries = []
1787 # The 'step' argument of itertools.count is only available in Python 2.7 or higher
1788 for i in itertools.count(0):
1789 paging = i*self._PAGING_STEP
d7ae0639
JMF
1790 info = self._download_webpage(self._FEED_TEMPLATE % paging,
1791 u'%s feed' % self._FEED_NAME,
04cc9617
JMF
1792 u'Downloading page %s' % i)
1793 info = json.loads(info)
1794 feed_html = info['feed_html']
43ba5456 1795 m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
04cc9617
JMF
1796 ids = orderedSet(m.group(1) for m in m_ids)
1797 feed_entries.extend(self.url_result(id, 'Youtube') for id in ids)
1798 if info['paging'] is None:
1799 break
d7ae0639
JMF
1800 return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
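# Editor's sketch (not part of the original file): the URL that _FEED_TEMPLATE
# above yields for a subclass. 'subscriptions' mirrors the subclass below; the
# paging offset 30 is just an example value (the default _PAGING_STEP).
_action = 'action_load_system_feed'          # _PERSONAL_FEED is False here
_template = ('http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s'
             % (_action, 'subscriptions'))
_url = _template % 30
# _url == 'http://www.youtube.com/feed_ajax?action_load_system_feed=1&feed_name=subscriptions&paging=30'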
1801
1802class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
1803 IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
1804 _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
1805 _FEED_NAME = 'subscriptions'
1806 _PLAYLIST_TITLE = u'Youtube Subscriptions'
1807
1808class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
1809 IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
1810 _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
1811 _FEED_NAME = 'recommended'
1812 _PLAYLIST_TITLE = u'Youtube Recommended videos'
c626a3d9 1813
43ba5456
JMF
1814class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
1815 IE_DESC = u'YouTube.com watch later list, "ytwatchlater" keyword (requires authentication)'
1816 _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
1817 _FEED_NAME = 'watch_later'
1818 _PLAYLIST_TITLE = u'Youtube Watch Later'
1819 _PAGING_STEP = 100
1820 _PERSONAL_FEED = True
c626a3d9
JMF
1821
1822class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
1823 IE_NAME = u'youtube:favorites'
1824 IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
c7a7750d 1825 _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
c626a3d9
JMF
1826 _LOGIN_REQUIRED = True
1827
1828 def _real_extract(self, url):
1829 webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
1830 playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, u'favourites playlist id')
1831 return self.url_result(playlist_id, 'YoutubePlaylist')
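# Editor's sketch (not part of the original file): the 'list=' extraction done
# in _real_extract() above, using plain re on a made-up page fragment.
import re
_fragment = '<a href="/playlist?list=FLabcdefghijk&feature=mh">My favourites</a>'
_favourites_id = re.search(r'list=(.+?)["&]', _fragment).group(1)
# _favourites_id == 'FLabcdefghijk'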