]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
Do not warn if fallback is without alternatives (because we did not get the flash...
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
e0df6211 3import collections
edf3e38e 4import errno
e0df6211 5import io
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af
PH
9import re
10import socket
e0df6211
PH
11import string
12import struct
13import traceback
0ca96d48 14import xml.etree.ElementTree
e0df6211 15import zlib
c5e8d7af 16
b05654f0 17from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 18from .subtitles import SubtitlesInfoExtractor
c5e8d7af 19from ..utils import (
edf3e38e 20 compat_chr,
c5e8d7af
PH
21 compat_http_client,
22 compat_parse_qs,
23 compat_urllib_error,
24 compat_urllib_parse,
25 compat_urllib_request,
26 compat_str,
27
28 clean_html,
29 get_element_by_id,
30 ExtractorError,
31 unescapeHTML,
32 unified_strdate,
04cc9617 33 orderedSet,
edf3e38e 34 write_json_file,
c5e8d7af
PH
35)
36
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors

    Gathers the session setup shared by the YouTube extractors: forcing the
    interface language, logging in and confirming the viewer's age.
    """
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def _set_language(self):
        """Request _LANG_URL so that later pages are served in English.

        Returns True on success.  A network error is reported as a warning
        and False is returned.
        """
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return False
        return True

    def _login(self):
        """Log in with the configured credentials.

        Returns True when the login form was accepted and False when no
        credentials are configured or the login failed (failures are
        reported as warnings).

        Raises ExtractorError when no username is available and
        _LOGIN_REQUIRED is set.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return False

        # Hidden form fields that have to be echoed back with the credentials
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'dsh': dsh,
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode.
        # Fields whose value is None (a hidden input that was not found on
        # the login page) are left out: None has no encode() and would
        # otherwise abort the whole login with an AttributeError.
        login_form = dict(
            (k.encode('utf-8'), v.encode('utf-8'))
            for k, v in login_form_strs.items()
            if v is not None)
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # Being served the login form again means the login was rejected
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return False
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return False
        return True

    def _confirm_age(self):
        """Submit the age confirmation form.

        Returns True on success; raises ExtractorError on a network error.
        """
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        # POST data has to be bytes on Python 3, same as in _login
        age_data = compat_urllib_parse.urlencode(age_form).encode('ascii')
        request = compat_urllib_request.Request(self._AGE_URL, age_data)
        try:
            self.report_age_confirmation()
            compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
        return True

    def _real_initialize(self):
        """Set the language, log in and confirm age, stopping at the first
        step that does not succeed.  Does nothing without a downloader."""
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
c5e8d7af 143
8377574c 144
de7f3446 145class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
0f818663 146 IE_DESC = u'YouTube.com'
c5e8d7af
PH
147 _VALID_URL = r"""^
148 (
149 (?:https?://)? # http(s):// (optional)
f4b05232 150 (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
e69ae5b9
JMF
151 tube\.majestyc\.net/|
152 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
153 (?:.*?\#/)? # handle anchor (#/) redirect urls
154 (?: # the various things that can precede the ID:
155 (?:(?:v|embed|e)/) # v/ or embed/ or e/
156 |(?: # or the v= param in all its forms
d741e55a 157 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
158 (?:\?|\#!?) # the params delimiter ? or # or #!
159 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
160 v=
161 )
f4b05232
JMF
162 ))
163 |youtu\.be/ # just youtu.be/xxxx
164 )
c5e8d7af 165 )? # all until now is optional -> you can pass the naked ID
8963d9c2 166 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
c5e8d7af
PH
167 (?(1).+)? # if we found the ID, everything can follow
168 $"""
c5e8d7af 169 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
c5e8d7af 170 # Listed in order of quality
bdc6b3fc 171 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
96fb5605 172 # Apple HTTP Live Streaming
bdc6b3fc 173 '96', '95', '94', '93', '92', '132', '151',
939fbd26
JMF
174 # 3D
175 '85', '84', '102', '83', '101', '82', '100',
176 # Dash video
177 '138', '137', '248', '136', '247', '135', '246',
178 '245', '244', '134', '243', '133', '242', '160',
179 # Dash audio
180 '141', '172', '140', '171', '139',
1d043b93 181 ]
bdc6b3fc 182 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
96fb5605 183 # Apple HTTP Live Streaming
bdc6b3fc
AZ
184 '96', '95', '94', '93', '92', '132', '151',
185 # 3D
86fe61c8 186 '85', '102', '84', '101', '83', '100', '82',
939fbd26
JMF
187 # Dash video
188 '138', '248', '137', '247', '136', '246', '245',
189 '244', '135', '243', '134', '242', '133', '160',
190 # Dash audio
191 '172', '141', '171', '140', '139',
1d043b93 192 ]
bdc6b3fc
AZ
193 _video_formats_map = {
194 'flv': ['35', '34', '6', '5'],
195 '3gp': ['36', '17', '13'],
196 'mp4': ['38', '37', '22', '18'],
197 'webm': ['46', '45', '44', '43'],
198 }
c5e8d7af
PH
199 _video_extensions = {
200 '13': '3gp',
bdc6b3fc 201 '17': '3gp',
c5e8d7af
PH
202 '18': 'mp4',
203 '22': 'mp4',
bdc6b3fc 204 '36': '3gp',
c5e8d7af 205 '37': 'mp4',
d69cf69a 206 '38': 'mp4',
c5e8d7af
PH
207 '43': 'webm',
208 '44': 'webm',
209 '45': 'webm',
210 '46': 'webm',
1d043b93 211
86fe61c8
AZ
212 # 3d videos
213 '82': 'mp4',
214 '83': 'mp4',
215 '84': 'mp4',
216 '85': 'mp4',
217 '100': 'webm',
218 '101': 'webm',
219 '102': 'webm',
836a086c 220
96fb5605 221 # Apple HTTP Live Streaming
1d043b93
JMF
222 '92': 'mp4',
223 '93': 'mp4',
224 '94': 'mp4',
225 '95': 'mp4',
226 '96': 'mp4',
227 '132': 'mp4',
228 '151': 'mp4',
836a086c
AZ
229
230 # Dash mp4
231 '133': 'mp4',
232 '134': 'mp4',
233 '135': 'mp4',
234 '136': 'mp4',
235 '137': 'mp4',
236 '138': 'mp4',
237 '139': 'mp4',
238 '140': 'mp4',
239 '141': 'mp4',
240 '160': 'mp4',
241
242 # Dash webm
243 '171': 'webm',
244 '172': 'webm',
245 '242': 'webm',
246 '243': 'webm',
247 '244': 'webm',
248 '245': 'webm',
249 '246': 'webm',
250 '247': 'webm',
251 '248': 'webm',
c5e8d7af
PH
252 }
253 _video_dimensions = {
254 '5': '240x400',
255 '6': '???',
256 '13': '???',
257 '17': '144x176',
258 '18': '360x640',
259 '22': '720x1280',
260 '34': '360x640',
261 '35': '480x854',
bdc6b3fc 262 '36': '240x320',
c5e8d7af
PH
263 '37': '1080x1920',
264 '38': '3072x4096',
265 '43': '360x640',
266 '44': '480x854',
267 '45': '720x1280',
268 '46': '1080x1920',
86fe61c8
AZ
269 '82': '360p',
270 '83': '480p',
271 '84': '720p',
272 '85': '1080p',
1d043b93
JMF
273 '92': '240p',
274 '93': '360p',
275 '94': '480p',
276 '95': '720p',
277 '96': '1080p',
86fe61c8
AZ
278 '100': '360p',
279 '101': '480p',
836a086c 280 '102': '720p',
1d043b93
JMF
281 '132': '240p',
282 '151': '72p',
836a086c
AZ
283 '133': '240p',
284 '134': '360p',
285 '135': '480p',
286 '136': '720p',
287 '137': '1080p',
288 '138': '>1080p',
289 '139': '48k',
290 '140': '128k',
291 '141': '256k',
292 '160': '192p',
293 '171': '128k',
294 '172': '256k',
295 '242': '240p',
296 '243': '360p',
297 '244': '480p',
298 '245': '480p',
299 '246': '480p',
300 '247': '720p',
301 '248': '1080p',
c5e8d7af 302 }
836a086c
AZ
303 _special_itags = {
304 '82': '3D',
305 '83': '3D',
306 '84': '3D',
307 '85': '3D',
308 '100': '3D',
309 '101': '3D',
310 '102': '3D',
311 '133': 'DASH Video',
312 '134': 'DASH Video',
313 '135': 'DASH Video',
314 '136': 'DASH Video',
315 '137': 'DASH Video',
316 '138': 'DASH Video',
317 '139': 'DASH Audio',
318 '140': 'DASH Audio',
319 '141': 'DASH Audio',
320 '160': 'DASH Video',
321 '171': 'DASH Audio',
322 '172': 'DASH Audio',
323 '242': 'DASH Video',
324 '243': 'DASH Video',
325 '244': 'DASH Video',
326 '245': 'DASH Video',
327 '246': 'DASH Video',
328 '247': 'DASH Video',
329 '248': 'DASH Video',
c5e8d7af 330 }
836a086c 331
c5e8d7af 332 IE_NAME = u'youtube'
2eb88d95
PH
333 _TESTS = [
334 {
0e853ca4
PH
335 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
336 u"file": u"BaW_jenozKc.mp4",
337 u"info_dict": {
338 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
339 u"uploader": u"Philipp Hagemeister",
340 u"uploader_id": u"phihag",
341 u"upload_date": u"20121002",
342 u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
2eb88d95 343 }
0e853ca4
PH
344 },
345 {
346 u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
347 u"file": u"1ltcDfZMA3U.flv",
348 u"note": u"Test VEVO video (#897)",
349 u"info_dict": {
350 u"upload_date": u"20070518",
351 u"title": u"Maps - It Will Find You",
352 u"description": u"Music video by Maps performing It Will Find You.",
353 u"uploader": u"MuteUSA",
354 u"uploader_id": u"MuteUSA"
2eb88d95 355 }
0e853ca4
PH
356 },
357 {
358 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
359 u"file": u"UxxajLWwzqY.mp4",
360 u"note": u"Test generic use_cipher_signature video (#897)",
361 u"info_dict": {
362 u"upload_date": u"20120506",
363 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
c7bf7366 364 u"description": u"md5:3e2666e0a55044490499ea45fe9037b7",
45ed795c 365 u"uploader": u"Icona Pop",
0e853ca4 366 u"uploader_id": u"IconaPop"
2eb88d95 367 }
c108eb73
JMF
368 },
369 {
370 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
371 u"file": u"07FYdnEawAQ.mp4",
372 u"note": u"Test VEVO video with age protection (#956)",
373 u"info_dict": {
374 u"upload_date": u"20130703",
375 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
376 u"description": u"md5:64249768eec3bc4276236606ea996373",
377 u"uploader": u"justintimberlakeVEVO",
378 u"uploader_id": u"justintimberlakeVEVO"
379 }
380 },
1d043b93
JMF
381 {
382 u'url': u'https://www.youtube.com/watch?v=TGi3HqYrWHE',
383 u'file': u'TGi3HqYrWHE.mp4',
384 u'note': u'm3u8 video',
385 u'info_dict': {
386 u'title': u'Triathlon - Men - London 2012 Olympic Games',
387 u'description': u'- Men - TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games',
388 u'uploader': u'olympic',
389 u'upload_date': u'20120807',
390 u'uploader_id': u'olympic',
391 },
392 u'params': {
393 u'skip_download': True,
394 },
395 },
2eb88d95
PH
396 ]
397
c5e8d7af
PH
398
@classmethod
def suitable(cls, url):
    """Tell whether this extractor should handle *url*.

    URLs that YoutubePlaylistIE accepts are refused here; anything else is
    suitable when it matches _VALID_URL (compiled in verbose mode).
    """
    claimed_by_playlist = YoutubePlaylistIE.suitable(url)
    if claimed_by_playlist:
        return False
    matched = re.match(cls._VALID_URL, url, re.VERBOSE)
    return matched is not None
404
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Initialise the parent extractor and start with an empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Looked up by player URL when decrypting signatures
    self._player_cache = dict()
e0df6211 408
c5e8d7af
PH
def report_video_webpage_download(self, video_id):
    """Announce that the video webpage of *video_id* is being downloaded."""
    message = u'%s: Downloading video webpage' % video_id
    self.to_screen(message)
412
def report_video_info_webpage_download(self, video_id):
    """Announce that the video info webpage of *video_id* is being downloaded."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
416
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Announce that video information for *video_id* is being extracted."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
420
def report_unavailable_format(self, video_id, format):
    """Announce that the requested *format* is not available for *video_id*."""
    message = u'%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
424
def report_rtmp_download(self):
    """Announce that the download will go through the RTMP protocol."""
    message = u'RTMP download detected'
    self.to_screen(message)
428
c4417ddb
PH
def _extract_signature_function(self, video_id, player_url, slen):
    """Return a function that deciphers signatures of length *slen*.

    The function is loaded from the on-disk cache when possible.  Otherwise
    the player at *player_url* is downloaded and parsed (JavaScript or SWF,
    chosen by the URL's file extension) and the result is written back to
    the cache as a list of source indices.

    video_id is only used for progress messages.  The 'cachedir' downloader
    parameter selects the cache location; the value u'NONE' disables
    caching.

    Raises ExtractorError when the player id and type cannot be read from
    player_url.
    """
    id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
                    player_url)
    if id_m is None:
        raise ExtractorError(u'Cannot identify player %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%d' % (player_type, player_id, slen)
    # func_id becomes a file name, so it must not contain path separators
    assert os.path.basename(func_id) == func_id
    cache_dir = self._downloader.params.get('cachedir',
                                            u'~/.youtube-dl/cache')

    cache_enabled = cache_dir != u'NONE'
    if cache_enabled:
        cache_fn = os.path.join(os.path.expanduser(cache_dir),
                                u'youtube-sigfuncs',
                                func_id + '.json')
        try:
            with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                cache_spec = json.load(cachef)
            return lambda s: u''.join(s[i] for i in cache_spec)
        except (IOError, ValueError):
            # No cache available, or the file does not hold valid JSON;
            # either way fall through and rebuild it from the player
            pass

    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        assert False, 'Invalid player type %r' % player_type

    if cache_enabled:
        try:
            # Feed the function a string whose i-th character has code
            # point i; the code points of the output then tell which input
            # position every output character came from.  It has to be a
            # real string because the interpreters operate on strings.
            test_string = u''.join(map(compat_chr, range(slen)))
            cache_res = res(test_string)
            cache_spec = [ord(c) for c in cache_res]
            try:
                os.makedirs(os.path.dirname(cache_fn))
            except OSError as ose:
                if ose.errno != errno.EEXIST:
                    raise
            write_json_file(cache_spec, cache_fn)
        except Exception:
            # Caching is best effort: report the problem, keep the function
            tb = traceback.format_exc()
            self._downloader.report_warning(
                u'Writing cache to %r failed: %s' % (cache_fn, tb))

    return res
485
edf3e38e
PH
def _print_sig_code(self, func, slen):
    """Print Python code equivalent to *func* for signatures of length *slen*.

    func is probed with a string whose i-th character has code point i, so
    the output reveals which input index ends up at each output position.
    Runs of consecutive indices (ascending or descending) are printed as
    slices, everything else as single subscripts.
    """
    def gen_sig_code(idxs):
        def _genslice(start, end, step):
            starts = u'' if start == 0 else str(start)
            # A descending run that ends at index 0 needs an open stop:
            # a literal -1 would mean "the last element" and give an
            # empty slice.
            ends = (u':%d' % (end + step)) if end + step >= 0 else u':'
            steps = u'' if step == 1 else (u':%d' % step)
            return u's[%s%s%s]' % (starts, ends, steps)

        if not idxs:
            return
        last = idxs[-1]

        step = None
        start = '(Never used)'  # Quelch pyflakes warnings - start will be
                                # set as soon as step is set
        for i, prev in zip(idxs[1:], idxs[:-1]):
            if step is not None:
                if i - prev == step:
                    continue
                # The run stopped at prev; i is dealt with as the next prev
                yield _genslice(start, prev, step)
                step = None
                continue
            if i - prev in [-1, 1]:
                step = i - prev
                start = prev
                continue
            else:
                yield u's[%d]' % prev
        # The final index is never emitted inside the loop
        if step is None:
            yield u's[%d]' % last
        else:
            yield _genslice(start, last, step)

    test_string = u''.join(map(compat_chr, range(slen)))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    expr_code = u' + '.join(gen_sig_code(cache_spec))
    code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
    self.to_screen(u'Extracted signature function:\n' + code)
edf3e38e 520
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Build the signature function from the JavaScript player source.

    The name of the entry function is taken from the first
    ``signature=<name>`` occurrence in *jscode*.  That function, and any
    function it calls, is run by a small interpreter that understands only
    the constructs handled below: assignments (optionally indexed),
    ``return``, integer literals, variables, ``split("")``, ``join("")``,
    ``length``, ``reverse()``, ``slice(n)``, indexing, ``%`` and calls to
    other functions of the player.

    Returns a callable taking the scrambled signature and returning the
    result of the entry function.  Raises ExtractorError for unsupported
    code or when a referenced function cannot be found.
    """
    funcname = self._search_regex(
        r'signature=([a-zA-Z]+)', jscode,
        u'Initial JS player signature function name')

    # Interpreted functions, keyed by their JS name
    functions = {}

    def interpret_statement(stmt, local_vars, allow_recursion=20):
        if allow_recursion < 0:
            raise ExtractorError(u'Recursion limit reached')

        if stmt.startswith(u'var '):
            stmt = stmt[len(u'var '):]
        ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
                         r'=(?P<expr>.*)$', stmt)
        if ass_m:
            if ass_m.groupdict().get('index'):
                # out[index] = expr
                def assign(val):
                    lvar = local_vars[ass_m.group('out')]
                    idx = interpret_expression(ass_m.group('index'),
                                               local_vars, allow_recursion)
                    assert isinstance(idx, int)
                    lvar[idx] = val
                    return val
                expr = ass_m.group('expr')
            else:
                # out = expr
                def assign(val):
                    local_vars[ass_m.group('out')] = val
                    return val
                expr = ass_m.group('expr')
        elif stmt.startswith(u'return '):
            assign = lambda v: v
            expr = stmt[len(u'return '):]
        else:
            raise ExtractorError(
                u'Cannot determine left side of statement in %r' % stmt)

        v = interpret_expression(expr, local_vars, allow_recursion)
        return assign(v)

    def interpret_expression(expr, local_vars, allow_recursion):
        if expr.isdigit():
            return int(expr)

        if expr.isalpha():
            return local_vars[expr]

        # variable.member
        m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
        if m:
            member = m.group('member')
            val = local_vars[m.group('in')]
            if member == 'split("")':
                return list(val)
            if member == 'join("")':
                return u''.join(val)
            if member == 'length':
                return len(val)
            if member == 'reverse()':
                return val[::-1]
            slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
            if slice_m:
                idx = interpret_expression(
                    slice_m.group('idx'), local_vars, allow_recursion-1)
                return val[idx:]

        # variable[index]
        m = re.match(
            r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
        if m:
            val = local_vars[m.group('in')]
            idx = interpret_expression(m.group('idx'), local_vars,
                                       allow_recursion-1)
            return val[idx]

        # a % b
        m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
        if m:
            a = interpret_expression(m.group('a'),
                                     local_vars, allow_recursion)
            b = interpret_expression(m.group('b'),
                                     local_vars, allow_recursion)
            return a % b

        # func(arg, ...) where every argument is a variable or an integer
        m = re.match(
            r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
        if m:
            fname = m.group('func')
            if fname not in functions:
                functions[fname] = extract_function(fname)
            argvals = [int(v) if v.isdigit() else local_vars[v]
                       for v in m.group('args').split(',')]
            return functions[fname](argvals)
        raise ExtractorError(u'Unsupported JS expression %r' % expr)

    def extract_function(funcname):
        func_m = re.search(
            r'function ' + re.escape(funcname) +
            r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
            jscode)
        if func_m is None:
            raise ExtractorError(
                u'Could not find JS function %r' % funcname)
        argnames = func_m.group('args').split(',')

        def resf(args):
            local_vars = dict(zip(argnames, args))
            # The value of the last statement is the function's result
            for stmt in func_m.group('code').split(';'):
                res = interpret_statement(stmt, local_vars)
            return res
        return resf

    initial_function = extract_function(funcname)
    return lambda s: initial_function([s])
632
def _parse_sig_swf(self, file_contents):
    """Build the signature function from a Flash (SWF) player.

    file_contents is the raw player file as bytes; only zlib-compressed
    files (first byte ``C``) are supported.  The ABC (AVM2 bytecode) block
    of the first DoABC tag is parsed just far enough to find the methods of
    the class ``SignatureDecipher``; its ``decipher`` method is then run by
    a small interpreter implementing only the opcodes listed below.

    Returns a callable taking the scrambled signature and returning the
    result of ``decipher``.

    Raises ExtractorError when the data is not an SWF file, contains no
    DoABC tag or lacks the target class, and NotImplementedError for
    unsupported compression or bytecode.
    """
    if file_contents[1:3] != b'WS':
        raise ExtractorError(
            u'Not an SWF file; header is %r' % file_contents[:3])
    if file_contents[:1] == b'C':
        # Everything after the 8 byte file header is zlib-compressed
        content = zlib.decompress(file_contents[8:])
    else:
        raise NotImplementedError(u'Unsupported compression format %r' %
                                  file_contents[:1])

    # The decompressed data starts with the rest of the movie header: the
    # bit-packed FrameSize rectangle (a 5 bit field width followed by four
    # fields of that width, padded to whole bytes), the frame rate
    # (2 bytes) and the frame count (2 bytes).  Tags begin only after it.
    framesize_nbits = struct.unpack('!B', content[:1])[0] >> 3
    framesize_len = (5 + 4 * framesize_nbits + 7) // 8
    tags_start = framesize_len + 2 + 2

    def extract_tags(content, pos):
        while pos < len(content):
            header16 = struct.unpack('<H', content[pos:pos+2])[0]
            pos += 2
            tag_code = header16 >> 6
            tag_len = header16 & 0x3f
            if tag_len == 0x3f:
                # Long tag: the real length follows as a 32 bit integer
                tag_len = struct.unpack('<I', content[pos:pos+4])[0]
                pos += 4
            assert pos+tag_len <= len(content)
            yield (tag_code, content[pos:pos+tag_len])
            pos += tag_len

    # Tag 82 is DoABC
    code_tag = next(
        (tag
         for tag_code, tag in extract_tags(content, tags_start)
         if tag_code == 82),
        None)
    if code_tag is None:
        raise ExtractorError(u'No DoABC tag found in SWF player')
    # Skip the 4 byte flags field and the NUL-terminated name
    p = code_tag.index(b'\0', 4) + 1
    code_reader = io.BytesIO(code_tag[p:])

    # Parse ABC (AVM2 ByteCode)
    def read_int(reader=None):
        # Variable-length integer: 7 payload bits per byte, least
        # significant group first, high bit set on all but the last byte
        if reader is None:
            reader = code_reader
        res = 0
        shift = 0
        for _ in range(5):
            buf = reader.read(1)
            assert len(buf) == 1
            b = struct.unpack('<B', buf)[0]
            res = res | ((b & 0x7f) << shift)
            if b & 0x80 == 0:
                break
            shift += 7
        return res

    def u30(reader=None):
        res = read_int(reader)
        assert res & 0xf0000000 == 0
        return res
    u32 = read_int

    def s32(reader=None):
        v = read_int(reader)
        if v & 0x80000000 != 0:
            v = - ((v ^ 0xffffffff) + 1)
        return v

    def read_string(reader=None):
        if reader is None:
            reader = code_reader
        slen = u30(reader)
        resb = reader.read(slen)
        assert len(resb) == slen
        return resb.decode('utf-8')

    def read_bytes(count, reader=None):
        if reader is None:
            reader = code_reader
        resb = reader.read(count)
        assert len(resb) == count
        return resb

    def read_byte(reader=None):
        resb = read_bytes(1, reader=reader)
        res = struct.unpack('<B', resb)[0]
        return res

    # minor_version + major_version
    read_bytes(2 + 2)

    # Constant pool
    # Entry 0 of every pool is implicit, hence the ranges starting at 1
    int_count = u30()
    for _c in range(1, int_count):
        s32()
    uint_count = u30()
    for _c in range(1, uint_count):
        u32()
    double_count = u30()
    # A count of 0 also means "no entries"; never ask for a negative
    # number of bytes, which would consume the rest of the stream
    read_bytes(max(0, double_count - 1) * 8)
    string_count = u30()
    constant_strings = [u'']
    for _c in range(1, string_count):
        s = read_string()
        constant_strings.append(s)
    namespace_count = u30()
    for _c in range(1, namespace_count):
        read_bytes(1)  # kind
        u30()  # name
    ns_set_count = u30()
    for _c in range(1, ns_set_count):
        count = u30()
        for _c2 in range(count):
            u30()
    multiname_count = u30()
    # Number of u30 fields following the kind of each multiname
    MULTINAME_SIZES = {
        0x07: 2,  # QName
        0x0d: 2,  # QNameA
        0x0f: 1,  # RTQName
        0x10: 1,  # RTQNameA
        0x11: 0,  # RTQNameL
        0x12: 0,  # RTQNameLA
        0x09: 2,  # Multiname
        0x0e: 2,  # MultinameA
        0x1b: 1,  # MultinameL
        0x1c: 1,  # MultinameLA
    }
    multinames = [u'']
    for _c in range(1, multiname_count):
        kind = u30()
        assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
        if kind == 0x07:
            u30()  # namespace_idx
            name_idx = u30()
            multinames.append(constant_strings[name_idx])
        else:
            # Only plain QNames are resolved; others get a placeholder
            multinames.append('[MULTINAME kind: %d]' % kind)
            for _c2 in range(MULTINAME_SIZES[kind]):
                u30()

    # Methods
    method_count = u30()
    MethodInfo = collections.namedtuple(
        'MethodInfo',
        ['NEED_ARGUMENTS', 'NEED_REST'])
    method_infos = []
    for method_id in range(method_count):
        param_count = u30()
        u30()  # return type
        for _ in range(param_count):
            u30()  # param type
        u30()  # name index (always 0 for youtube)
        flags = read_byte()
        if flags & 0x08 != 0:
            # Options present
            option_count = u30()
            for c in range(option_count):
                u30()  # val
                read_bytes(1)  # kind
        if flags & 0x80 != 0:
            # Param names present
            for _ in range(param_count):
                u30()  # param name
        mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
        method_infos.append(mi)

    # Metadata
    metadata_count = u30()
    for _c in range(metadata_count):
        u30()  # name
        item_count = u30()
        for _c2 in range(item_count):
            u30()  # key
            u30()  # value

    def parse_traits_info():
        # Consume one trait and return the methods it declares
        trait_name_idx = u30()
        kind_full = read_byte()
        kind = kind_full & 0x0f
        attrs = kind_full >> 4
        methods = {}
        if kind in [0x00, 0x06]:  # Slot or Const
            u30()  # Slot id
            u30()  # type_name_idx
            vindex = u30()
            if vindex != 0:
                read_byte()  # vkind
        elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
            u30()  # disp_id
            method_idx = u30()
            methods[multinames[trait_name_idx]] = method_idx
        elif kind == 0x04:  # Class
            u30()  # slot_id
            u30()  # classi
        elif kind == 0x05:  # Function
            u30()  # slot_id
            function_idx = u30()
            # NOTE(review): stored index -> name, the reverse of the
            # method case above; left as is - confirm whether intended
            methods[function_idx] = multinames[trait_name_idx]
        else:
            raise ExtractorError(u'Unsupported trait kind %d' % kind)

        if attrs & 0x4 != 0:  # Metadata present
            metadata_count = u30()
            for _c3 in range(metadata_count):
                u30()  # metadata index

        return methods

    # Classes
    TARGET_CLASSNAME = u'SignatureDecipher'
    searched_idx = multinames.index(TARGET_CLASSNAME)
    searched_class_id = None
    class_count = u30()
    for class_id in range(class_count):
        name_idx = u30()
        if name_idx == searched_idx:
            # We found the class we're looking for!
            searched_class_id = class_id
        u30()  # super_name idx
        flags = read_byte()
        if flags & 0x08 != 0:  # Protected namespace is present
            u30()  # protected_ns_idx
        intrf_count = u30()
        for _c2 in range(intrf_count):
            u30()
        u30()  # iinit
        trait_count = u30()
        for _c2 in range(trait_count):
            parse_traits_info()

    if searched_class_id is None:
        raise ExtractorError(u'Target class %r not found' %
                             TARGET_CLASSNAME)

    # Method index -> method name, for the target class only
    method_idxs = {}
    for class_id in range(class_count):
        u30()  # cinit
        trait_count = u30()
        for _c2 in range(trait_count):
            trait_methods = parse_traits_info()
            if class_id == searched_class_id:
                method_idxs.update(dict(
                    (idx, name)
                    for name, idx in trait_methods.items()))

    # Scripts
    script_count = u30()
    for _c in range(script_count):
        u30()  # init
        trait_count = u30()
        for _c2 in range(trait_count):
            parse_traits_info()

    # Method bodies
    method_body_count = u30()
    Method = collections.namedtuple('Method', ['code', 'local_count'])
    methods = {}
    for _c in range(method_body_count):
        method_idx = u30()
        u30()  # max_stack
        local_count = u30()
        u30()  # init_scope_depth
        u30()  # max_scope_depth
        code_length = u30()
        code = read_bytes(code_length)
        if method_idx in method_idxs:
            m = Method(code, local_count)
            methods[method_idxs[method_idx]] = m
        exception_count = u30()
        for _c2 in range(exception_count):
            u30()  # from
            u30()  # to
            u30()  # target
            u30()  # exc_type
            u30()  # var_name
        trait_count = u30()
        for _c2 in range(trait_count):
            parse_traits_info()

    # The whole ABC block must have been consumed, and every method of the
    # target class must have a body
    assert p + code_reader.tell() == len(code_tag)
    assert len(methods) == len(method_idxs)

    # Interpreted methods, keyed by name
    method_pyfunctions = {}

    def extract_function(func_name):
        if func_name in method_pyfunctions:
            return method_pyfunctions[func_name]
        if func_name not in methods:
            raise ExtractorError(u'Cannot find function %r' % func_name)
        m = methods[func_name]

        def resfunc(args):
            # Register 0 is "this", followed by the arguments and locals
            registers = ['(this)'] + list(args) + [None] * m.local_count
            stack = []
            coder = io.BytesIO(m.code)
            while True:
                opcode = struct.unpack('!B', coder.read(1))[0]
                if opcode == 36:  # pushbyte
                    v = struct.unpack('!B', coder.read(1))[0]
                    stack.append(v)
                elif opcode == 44:  # pushstring
                    idx = u30(coder)
                    stack.append(constant_strings[idx])
                elif opcode == 48:  # pushscope
                    # We don't implement the scope register, so we'll just
                    # ignore the popped value
                    stack.pop()
                elif opcode == 70:  # callproperty
                    index = u30(coder)
                    mname = multinames[index]
                    arg_count = u30(coder)
                    # Arguments were pushed left to right, so popping
                    # yields them in reverse
                    args = list(reversed(
                        [stack.pop() for _ in range(arg_count)]))
                    obj = stack.pop()
                    if mname == u'split':
                        assert len(args) == 1
                        assert isinstance(args[0], compat_str)
                        assert isinstance(obj, compat_str)
                        if args[0] == u'':
                            res = list(obj)
                        else:
                            res = obj.split(args[0])
                        stack.append(res)
                    elif mname == u'slice':
                        assert len(args) == 1
                        assert isinstance(args[0], int)
                        assert isinstance(obj, list)
                        res = obj[args[0]:]
                        stack.append(res)
                    elif mname == u'join':
                        assert len(args) == 1
                        assert isinstance(args[0], compat_str)
                        assert isinstance(obj, list)
                        res = args[0].join(obj)
                        stack.append(res)
                    elif mname in method_pyfunctions:
                        stack.append(method_pyfunctions[mname](args))
                    else:
                        raise NotImplementedError(
                            u'Unsupported property %r on %r'
                            % (mname, obj))
                elif opcode == 72:  # returnvalue
                    res = stack.pop()
                    return res
                elif opcode == 79:  # callpropvoid
                    index = u30(coder)
                    mname = multinames[index]
                    arg_count = u30(coder)
                    args = list(reversed(
                        [stack.pop() for _ in range(arg_count)]))
                    obj = stack.pop()
                    if mname == u'reverse':
                        assert isinstance(obj, list)
                        obj.reverse()
                    else:
                        raise NotImplementedError(
                            u'Unsupported (void) property %r on %r'
                            % (mname, obj))
                elif opcode == 93:  # findpropstrict
                    index = u30(coder)
                    mname = multinames[index]
                    # Interpreting the method here registers it in
                    # method_pyfunctions for the callproperty that follows
                    res = extract_function(mname)
                    stack.append(res)
                elif opcode == 97:  # setproperty
                    index = u30(coder)
                    value = stack.pop()
                    idx = stack.pop()
                    obj = stack.pop()
                    assert isinstance(obj, list)
                    assert isinstance(idx, int)
                    obj[idx] = value
                elif opcode == 98:  # getlocal
                    index = u30(coder)
                    stack.append(registers[index])
                elif opcode == 99:  # setlocal
                    index = u30(coder)
                    value = stack.pop()
                    registers[index] = value
                elif opcode == 102:  # getproperty
                    index = u30(coder)
                    pname = multinames[index]
                    if pname == u'length':
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        stack.append(len(obj))
                    else:  # Assume attribute access
                        idx = stack.pop()
                        assert isinstance(idx, int)
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        stack.append(obj[idx])
                elif opcode == 128:  # coerce
                    u30(coder)
                elif opcode == 133:  # coerce_s
                    assert isinstance(stack[-1], (type(None), compat_str))
                elif opcode == 164:  # modulo
                    value2 = stack.pop()
                    value1 = stack.pop()
                    res = value1 % value2
                    stack.append(res)
                elif opcode == 208:  # getlocal_0
                    stack.append(registers[0])
                elif opcode == 209:  # getlocal_1
                    stack.append(registers[1])
                elif opcode == 210:  # getlocal_2
                    stack.append(registers[2])
                elif opcode == 211:  # getlocal_3
                    stack.append(registers[3])
                elif opcode == 214:  # setlocal_2
                    registers[2] = stack.pop()
                elif opcode == 215:  # setlocal_3
                    registers[3] = stack.pop()
                else:
                    raise NotImplementedError(
                        u'Unsupported opcode %d' % opcode)

        method_pyfunctions[func_name] = resfunc
        return resfunc

    initial_function = extract_function(u'decipher')
    return lambda s: initial_function([s])
1046
def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Turn the encrypted s field into a working signature.

    When a player URL is known, the signature function extracted from that
    player (cached per player URL in self._player_cache) is tried first.
    If that attempt raises, a warning with the traceback is reported and
    the static per-length algorithm is used instead.  When no player URL
    is available the static algorithm is used directly, without warning,
    because there was no alternative to fall back from.

    s          -- the encrypted signature
    video_id   -- id of the video, passed through to the helpers
    player_url -- URL of the player, or None if it could not be found
    age_gate   -- whether the video is age-gated (selects static variant)
    """

    if player_url is not None:
        try:
            if player_url not in self._player_cache:
                func = self._extract_signature_function(
                    video_id, player_url, len(s)
                )
                self._player_cache[player_url] = func
            func = self._player_cache[player_url]
            if self._downloader.params.get('youtube_print_sig_code'):
                self._print_sig_code(func, len(s))
            return func(s)
        except Exception:
            # Best effort: any failure of the automatic path is reported
            # and then handled by the static algorithm below.
            tb = traceback.format_exc()
            self._downloader.report_warning(
                u'Automatic signature extraction failed: ' + tb)

        # Only announce a fallback when the automatic path was actually
        # attempted; without a player URL nothing was tried.
        self._downloader.report_warning(
            u'Warning: Falling back to static signature algorithm')

    return self._static_decrypt_signature(
        s, video_id, player_url, age_gate)
e0df6211 1070
def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
    """Decrypt s with a hard-coded rearrangement chosen by its length.

    Age-gated videos use another player, so an 86 character signature has
    its own rule when age_gate is set; every other case goes through the
    common table.  Raises ExtractorError for an unsupported length.
    """
    sig_len = len(s)

    # The videos with age protection use another player, so the
    # algorithms can be different.
    if age_gate and sig_len == 86:
        return s[2:63] + s[82] + s[64:82] + s[63]

    # One rearrangement per supported length; evaluated lazily so that
    # only the matching rule ever indexes into s.
    rearrangements = {
        93: lambda: s[86:29:-1] + s[88] + s[28:5:-1],
        92: lambda: s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83],
        91: lambda: s[84:27:-1] + s[86] + s[26:5:-1],
        90: lambda: s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81],
        89: lambda: s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1],
        88: lambda: s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28],
        87: lambda: s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:],
        86: lambda: s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53],
        85: lambda: s[3:11] + s[0] + s[12:55] + s[84] + s[56:84],
        84: lambda: s[81:36:-1] + s[0] + s[35:2:-1],
        83: lambda: s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0],
        82: lambda: s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54],
        81: lambda: s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9],
        80: lambda: s[1:19] + s[0] + s[20:68] + s[19] + s[69:80],
        79: lambda: s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9],
    }

    rearrange = rearrangements.get(sig_len)
    if rearrange is None:
        raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
    return rearrange()
c5e8d7af 1111
def _get_available_subtitles(self, video_id):
    """Return a dict mapping language code to subtitle URL for video_id.

    The track list is downloaded from the timedtext service; an empty dict
    is returned (after a warning) when the download fails or when the list
    contains no tracks.
    """
    list_url = 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        sub_list = self._download_webpage(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
        return {}

    sub_lang_list = {}
    # Each match is a (name, lang_code) pair; only the code is needed.
    for _track_name, lang in re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list):
        query = compat_urllib_parse.urlencode({
            'lang': lang,
            'v': video_id,
            'fmt': self._downloader.params.get('subtitlesformat'),
        })
        sub_lang_list[lang] = u'http://www.youtube.com/api/timedtext?' + query

    if sub_lang_list:
        return sub_lang_list
    self._downloader.report_warning(u'video doesn\'t have subtitles')
    return {}
1136
def _get_available_automatic_caption(self, video_id, webpage):
    """Return a dict mapping language code to automatic caption URL.

    We need the webpage for getting the captions url, pass it as an
    argument to speed up the process.

    An empty dict is returned, after a warning, when the player config
    cannot be found in the webpage, when the caption list has no track or
    its first track is not of kind 'asr', or when a required key is
    missing or the list download fails.
    """
    sub_format = self._downloader.params.get('subtitlesformat')
    self.to_screen(u'%s: Looking for automatic captions' % video_id)
    mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
    err_msg = u'Couldn\'t find automatic captions for %s' % video_id
    if mobj is None:
        self._downloader.report_warning(err_msg)
        return {}
    player_config = json.loads(mobj.group(1))
    try:
        args = player_config[u'args']
        caption_url = args[u'ttsurl']
        timestamp = args[u'timestamp']
        # We get the available subtitles
        list_params = compat_urllib_parse.urlencode({
            'type': 'list',
            'tlangs': 1,
            'asrs': 1,
        })
        list_url = caption_url + '&' + list_params
        list_page = self._download_webpage(list_url, video_id)
        caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
        original_lang_node = caption_list.find('track')
        # find() returns None when the list has no <track> element at all;
        # treat that the same as "first track is not an automatic one".
        if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr':
            self._downloader.report_warning(u'Video doesn\'t have automatic captions')
            return {}
        original_lang = original_lang_node.attrib['lang_code']

        sub_lang_list = {}
        for lang_node in caption_list.findall('target'):
            sub_lang = lang_node.attrib['lang_code']
            params = compat_urllib_parse.urlencode({
                'lang': original_lang,
                'tlang': sub_lang,
                'fmt': sub_format,
                'ts': timestamp,
                'kind': 'asr',
            })
            sub_lang_list[sub_lang] = caption_url + '&' + params
        return sub_lang_list
    # An extractor error can be raise by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
1184
c5e8d7af
PH
def _print_formats(self, formats):
    """Print one line per format: itag, extension, dimensions, special note."""
    print('Available formats:')
    for itag in formats:
        extension = self._video_extensions.get(itag, 'flv')
        dimensions = self._video_dimensions.get(itag, '???')
        if itag in self._special_itags:
            note = ' (' + self._special_itags[itag] + ')'
        else:
            note = ''
        print('%s\t:\t%s\t[%s]%s' % (itag, extension, dimensions, note))
c5e8d7af
PH
1191
def _extract_id(self, url):
    """Return the second group of _VALID_URL matched against url.

    Raises ExtractorError when url does not match.
    """
    match = re.match(self._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError(u'Invalid URL: %s' % url)
1198
1d043b93
JMF
def _get_video_url_list(self, url_map):
    """
    Transform a dictionary in the format {itag:url} to a list of (itag, url)
    with the requested formats.

    Returns None after printing the formats when 'listformats' is set.
    Raises ExtractorError when none of the known formats is in url_map or
    when a specifically requested format is not available.
    """
    params = self._downloader.params
    req_format = params.get('format', None)
    format_limit = params.get('format_limit', None)
    if params.get('prefer_free_formats', False):
        available_formats = self._available_formats_prefer_free
    else:
        available_formats = self._available_formats

    # Drop every format that ranks above the requested limit.
    format_list = available_formats
    if format_limit is not None and format_limit in available_formats:
        format_list = available_formats[available_formats.index(format_limit):]

    existing_formats = [fmt for fmt in format_list if fmt in url_map]
    if not existing_formats:
        raise ExtractorError(u'no known formats available for video')

    if params.get('listformats', None):
        self._print_formats(existing_formats)
        return

    if req_format is None or req_format == 'best':
        best = existing_formats[0]
        return [(best, url_map[best])]
    if req_format == 'worst':
        worst = existing_formats[-1]
        return [(worst, url_map[worst])]
    if req_format in ('-1', 'all'):
        return [(fmt, url_map[fmt]) for fmt in existing_formats]

    # Specific formats. We pick the first in a slash-delimited sequence.
    # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
    # available in the specified format. For example,
    # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
    # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
    # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
    for wanted in req_format.split('/'):
        if wanted in url_map:
            return [(wanted, url_map[wanted])]
        if wanted in self._video_formats_map:
            for candidate in self._video_formats_map[wanted]:
                if candidate in url_map:
                    return [(candidate, url_map[candidate])]
    raise ExtractorError(u'requested format not available')
1247
def _extract_from_m3u8(self, manifest_url, video_id):
    """Download the formats manifest and return a dict {itag: url}.

    Empty lines and lines starting with '#' are skipped; the itag is taken
    from each remaining URL.
    """
    manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
    url_map = {}
    for line in manifest.split('\n'):
        if not line or line.startswith('#'):
            continue
        itag = self._search_regex(r'itag/(\d+?)/', line, 'itag')
        url_map[itag] = line
    return url_map
1261
def _real_extract(self, url):
    """Extract the information dictionaries for a single video URL.

    Downloads the watch page and the get_video_info data, gathers the
    metadata (uploader, title, thumbnail, upload date, description,
    subtitles, duration), builds the map of available stream URLs
    (decrypting signatures when needed) and returns a list with one info
    dict per selected format.

    Returns None when only the subtitle listing was requested or when the
    format selection yields nothing (for instance when formats are only
    listed).  Raises ExtractorError when mandatory information is missing.
    """
    if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
        self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self._extract_id(url)

    # Get video webpage
    self.report_video_webpage_download(video_id)
    url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
    request = compat_urllib_request.Request(url)
    try:
        video_webpage_bytes = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))

    video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        # Drop the backslash escapes of the embedded JSON string
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # Get video info
    self.report_video_info_webpage_download(video_id)
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        self.report_age_confirmation()
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        data = compat_urllib_parse.urlencode({'video_id': video_id,
                                              'el': 'embedded',
                                              'gl': 'US',
                                              'hl': 'en',
                                              'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                                              'asv': 3,
                                              'sts':'1588',
                                              })
        video_info_url = 'https://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(video_info_url, video_id,
                                note=False,
                                errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
    else:
        age_gate = False
        # Try the different 'el' values until one answer carries a token
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
    if 'token' not in video_info:
        if 'reason' in video_info:
            raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
        else:
            raise ExtractorError(u'"token" parameter not in video info for unknown reason')

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError(u'"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError(u'Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning(u'unable to extract uploader nickname')

    # title
    if 'title' not in video_info:
        raise ExtractorError(u'Unable to extract video title')
    video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning(u'unable to extract video thumbnail')
        video_thumbnail = ''
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = None
    mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
    if mobj is not None:
        # Normalize separators and whitespace before parsing the date
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        video_description = clean_html(video_description)
    else:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = u''

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)

    if self._downloader.params.get('listsubtitles', False):
        self._list_available_subtitles(video_id, video_webpage)
        return

    if 'length_seconds' not in video_info:
        self._downloader.report_warning(u'unable to extract video duration')
        video_duration = ''
    else:
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

    # Decide which formats to download

    try:
        mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
        if not mobj:
            raise ValueError('Could not find vevo ID')
        info = json.loads(mobj.group(1))
        args = info['args']
        # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
        # this signatures are encrypted
        # The player config may lack the stream map; look it up with a
        # default (like adaptive_fmts below) instead of raising KeyError,
        # which the except clause below would not catch.
        m_s = re.search(r'[&,]s=', args.get('url_encoded_fmt_stream_map', u''))
        if m_s is not None:
            self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
            video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
        m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
        if m_s is not None:
            if 'url_encoded_fmt_stream_map' in video_info:
                video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
            else:
                video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
        elif 'adaptive_fmts' in video_info:
            if 'url_encoded_fmt_stream_map' in video_info:
                video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
            else:
                video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']
    except ValueError:
        # No (parsable) player config: keep video_info as downloaded
        pass

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        video_url_list = [(None, video_info['conn'][0])]
    elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
        if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' in url_data and 'url' in url_data:
                url = url_data['url'][0]
                if 'sig' in url_data:
                    url += '&signature=' + url_data['sig'][0]
                elif 's' in url_data:
                    encrypted_sig = url_data['s'][0]
                    if self._downloader.params.get('verbose'):
                        if age_gate:
                            if player_url is None:
                                player_version = 'unknown'
                            else:
                                player_version = self._search_regex(
                                    r'-(.+)\.swf$', player_url,
                                    u'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-(.+?)\.js', video_webpage,
                                'html5 player', fatal=False)
                            player_desc = u'html5 player %s' % player_version

                        parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                        self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                            (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                    if not age_gate:
                        # Without age gate the JS player named in the
                        # webpage replaces the SWF player URL found above
                        jsplayer_url_json = self._search_regex(
                            r'"assets":.+?"js":\s*("[^"]+")',
                            video_webpage, u'JS player URL')
                        player_url = json.loads(jsplayer_url_json)

                    signature = self._decrypt_signature(
                        encrypted_sig, video_id, player_url, age_gate)
                    url += '&signature=' + signature
                if 'ratebypass' not in url:
                    url += '&ratebypass=yes'
                url_map[url_data['itag'][0]] = url
        video_url_list = self._get_video_url_list(url_map)
        if not video_url_list:
            return
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        video_url_list = self._get_video_url_list(url_map)
        if not video_url_list:
            return

    else:
        raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

    results = []
    for format_param, video_real_url in video_url_list:
        # Extension
        video_extension = self._video_extensions.get(format_param, 'flv')

        video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
                                          self._video_dimensions.get(format_param, '???'),
                                          ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')

        results.append({
            'id':       video_id,
            'url':      video_real_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'format':   video_format,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
            'player_url':   player_url,
            'subtitles':    video_subtitles,
            'duration':     video_duration
        })
    return results
1507
class YoutubePlaylistIE(InfoExtractor):
    """Information extractor for YouTube playlists, read through the gdata API."""
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # The pattern is written for verbose mode, so match it that way.
        matched = re.match(cls._VALID_URL, url, re.VERBOSE)
        return matched is not None

    def _real_extract(self, url):
        """Return a one-element list holding the playlist result for url."""
        # Extract playlist id
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = match.group(1) or match.group(2)

        # Download playlist videos from API, one page at a time,
        # collecting (position, watch URL) pairs.
        positioned_urls = []
        for page_num in itertools.count(1):
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            if start_index >= 1000:
                self._downloader.report_warning(u'Max number of results reached')
                break

            api_url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(api_url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            feed = response['feed']
            playlist_title = feed['title']['$t']
            if 'entry' not in feed:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            for entry in feed['entry']:
                position = entry['yt$position']['$t']
                if 'media$group' not in entry:
                    continue
                media_group = entry['media$group']
                if 'yt$videoid' not in media_group:
                    continue
                watch_url = 'https://www.youtube.com/watch?v=' + media_group['yt$videoid']['$t']
                positioned_urls.append((position, watch_url))

        # Order by playlist position, then keep only the URLs
        positioned_urls.sort()
        url_results = [self.url_result(watch_url, 'Youtube')
                       for _position, watch_url in positioned_urls]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1575
1576
class YoutubeChannelIE(InfoExtractor):
    """Information extractor for the video list of a YouTube channel."""
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from page, in order, without repeats."""
        found = []
        for link in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = link.group(1)
            if video_id in found:
                continue
            found.append(video_id)
        return found

    def _real_extract(self, url):
        """Return a one-element list holding the playlist result for the channel."""
        # Extract channel id
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        channel_id = match.group(1)

        # Download channel page
        pagenum = 1
        first_url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(first_url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        video_ids = []
        video_ids.extend(self.extract_videos_from_page(page))

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            for pagenum in itertools.count(1):
                more_url = self._MORE_PAGES_URL % (pagenum, channel_id)
                raw_page = self._download_webpage(more_url, channel_id,
                                                  u'Downloading page #%s' % pagenum)
                ajax_page = json.loads(raw_page)

                video_ids.extend(self.extract_videos_from_page(ajax_page['content_html']))

                if self._MORE_PAGES_INDICATOR not in ajax_page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [
            self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
            for video_id in video_ids]
        return [self.playlist_result(url_entries, channel_id)]
1631
1632
class YoutubeUserIE(InfoExtractor):
    """Information extractor for the uploads of a YouTube user."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        """Return True only if no other IE class of this module accepts url."""
        # Don't return True if the url can be extracted with another youtube
        # extractor: the regex is too permissive and it would match.
        other_ies = (klass for (name, klass) in globals().items()
                     if name.endswith('IE') and klass is not cls)
        claimed_elsewhere = any(ie.suitable(url) for ie in other_ies)
        if claimed_elsewhere:
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a one-element list holding the playlist of the user's uploads."""
        # Extract username
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        username = match.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        page_size = self._GDATA_PAGE_SIZE
        video_ids = []
        for pagenum in itertools.count(0):
            start_index = pagenum * page_size + 1

            gdata_url = self._GDATA_URL % (username, page_size, start_index)
            page = self._download_webpage(
                gdata_url, username,
                u'Downloading video ids from %d to %d' % (start_index, start_index + page_size))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._GDATA_PAGE_SIZE
                break

            # Extract video identifiers: the last path component of each entry id
            ids_in_page = [entry['id']['$t'].split('/')[-1]
                           for entry in response['feed']['entry']]
            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < page_size:
                break

        url_results = [
            self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
            for video_id in video_ids]
        return [self.playlist_result(url_results, playlist_title = username)]
b05654f0
PH
1697
class YoutubeSearchIE(SearchInfoExtractor):
    """Information extractor for YouTube search queries."""
    IE_DESC = u'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        message = u'[youtube] query "%s": Downloading page %s' % (query, pagenum)
        self._downloader.to_screen(message)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        collected_ids = []
        pagenum = 0
        limit = n

        # Pages hold 50 results each; stop once enough have been requested
        # or the API reports fewer total items.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                raw = compat_urllib_request.urlopen(request).read()
                data = raw.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            collected_ids += [video['id'] for video in api_response['items']]

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(collected_ids) > n:
            collected_ids = collected_ids[:n]
        videos = [
            self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
            for video_id in collected_ids]
        return self.playlist_result(videos, query)
75dff0ee
JMF
1739
1740
class YoutubeShowIE(InfoExtractor):
    """Information extractor for YouTube shows; yields one playlist URL per season."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        """Return a url_result for every season playlist linked from the show page."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
        results = []
        for season in m_seasons:
            playlist_url = 'https://www.youtube.com' + season.group(1)
            results.append(self.url_result(playlist_url, 'YoutubePlaylist'))
        return results
04cc9617
JMF
1754
1755
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Common logic for extractors that page through
    http://www.youtube.com/feed_ajax
    Concrete subclasses have to provide _FEED_NAME and _PLAYLIST_TITLE.
    """
    _LOGIN_REQUIRED = True
    # Amount added to the "paging" query value for each successive request.
    _PAGING_STEP = 30
    # When True, action_load_personal_feed is requested instead of
    # action_load_system_feed.
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template for the feed; still expects the paging offset via %."""
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Download feed pages until 'paging' is None and return a playlist."""
        feed_entries = []
        page_index = 0
        while True:
            offset = page_index * self._PAGING_STEP
            raw_page = self._download_webpage(
                self._FEED_TEMPLATE % offset,
                u'%s feed' % self._FEED_NAME,
                u'Downloading page %s' % page_index)
            info = json.loads(raw_page)
            feed_html = info['feed_html']

            # Collect the watch-link ids, dropping repeats but keeping order.
            matches = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            video_ids = orderedSet(match.group(1) for match in matches)
            for video_id in video_ids:
                feed_entries.append(self.url_result(video_id, 'Youtube'))

            # The server signals the last page with a null "paging" value.
            if info['paging'] is None:
                break
            page_index += 1
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1797
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    # Feed extractor configured for the "subscriptions" feed.
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
1803
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # Feed extractor configured for the "recommended" feed.
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
c626a3d9 1809
43ba5456
JMF
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    # Feed extractor configured for the "watch_later" feed, which is loaded
    # as a personal feed and paged in steps of 100.
    _FEED_NAME = 'watch_later'
    _PERSONAL_FEED = True
    _PAGING_STEP = 100
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
c626a3d9
JMF
1817
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the my_favorites page to the playlist that backs it."""
    _LOGIN_REQUIRED = True
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'

    def _real_extract(self, url):
        """Return a 'YoutubePlaylist' url result for the favourites list."""
        # The fixed my_favorites address is always fetched, whatever form
        # of URL or keyword matched.
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')