]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
Fix output of --youtube-print-sig-code when counting down to 0
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
e0df6211 3import collections
edf3e38e 4import errno
e0df6211 5import io
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af
PH
9import re
10import socket
e0df6211
PH
11import string
12import struct
13import traceback
0ca96d48 14import xml.etree.ElementTree
e0df6211 15import zlib
c5e8d7af 16
b05654f0 17from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 18from .subtitles import SubtitlesInfoExtractor
c5e8d7af 19from ..utils import (
edf3e38e 20 compat_chr,
c5e8d7af
PH
21 compat_http_client,
22 compat_parse_qs,
23 compat_urllib_error,
24 compat_urllib_parse,
25 compat_urllib_request,
26 compat_str,
27
28 clean_html,
29 get_element_by_id,
30 ExtractorError,
31 unescapeHTML,
32 unified_strdate,
04cc9617 33 orderedSet,
edf3e38e 34 write_json_file,
c5e8d7af
PH
35)
36
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors.

    Bundles the session setup shared by the YouTube extractors: forcing the
    site language, logging in and confirming the user's age.
    """
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def _set_language(self):
        """Request _LANG_URL so that later pages are served in English.

        Returns True on success.  On a network error a warning is reported
        through the downloader and False is returned.
        """
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return False
        return True

    def _login(self):
        """Log in with the credentials returned by self._get_login_info().

        Returns True when the login form was submitted and the response does
        not contain the login form again; returns False when no credentials
        are configured or when any step fails (a warning is reported).

        Raises ExtractorError when no credentials are available and
        _LOGIN_REQUIRED is set.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return False

        # Hidden form tokens; either stays None when the page lacks the input.
        galx = None
        dsh = None
        match = re.search(r'<input.+?name="GALX".+?value="(.+?)"', login_page, re.DOTALL)
        if match:
            galx = match.group(1)
        match = re.search(r'<input.+?name="dsh".+?value="(.+?)"', login_page, re.DOTALL)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'dsh': dsh,
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode.  Fields whose value is None (token not found on
        # the login page) are left out instead of crashing on None.encode().
        login_form = dict(
            (k.encode('utf-8'), v.encode('utf-8'))
            for k, v in login_form_strs.items()
            if v is not None)
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # Getting the login form back means the credentials were rejected
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return False
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return False
        return True

    def _confirm_age(self):
        """POST the age confirmation form to _AGE_URL.

        Returns True on success; raises ExtractorError on a network error.
        """
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        # The request body must be bytes on Python 3 (same as in _login)
        age_data = compat_urllib_parse.urlencode(age_form).encode('ascii')
        request = compat_urllib_request.Request(self._AGE_URL, age_data)
        try:
            self.report_age_confirmation()
            compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
        return True

    def _real_initialize(self):
        """Set the language, log in and confirm age, stopping at the first
        step that does not succeed.  Does nothing without a downloader."""
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
c5e8d7af 143
8377574c 144
de7f3446 145class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
0f818663 146 IE_DESC = u'YouTube.com'
c5e8d7af
PH
147 _VALID_URL = r"""^
148 (
149 (?:https?://)? # http(s):// (optional)
f4b05232 150 (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
e69ae5b9
JMF
151 tube\.majestyc\.net/|
152 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
153 (?:.*?\#/)? # handle anchor (#/) redirect urls
154 (?: # the various things that can precede the ID:
155 (?:(?:v|embed|e)/) # v/ or embed/ or e/
156 |(?: # or the v= param in all its forms
d741e55a 157 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
158 (?:\?|\#!?) # the params delimiter ? or # or #!
159 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
160 v=
161 )
f4b05232
JMF
162 ))
163 |youtu\.be/ # just youtu.be/xxxx
164 )
c5e8d7af 165 )? # all until now is optional -> you can pass the naked ID
8963d9c2 166 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
c5e8d7af
PH
167 (?(1).+)? # if we found the ID, everything can follow
168 $"""
c5e8d7af 169 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
c5e8d7af 170 # Listed in order of quality
bdc6b3fc 171 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
96fb5605 172 # Apple HTTP Live Streaming
bdc6b3fc 173 '96', '95', '94', '93', '92', '132', '151',
939fbd26
JMF
174 # 3D
175 '85', '84', '102', '83', '101', '82', '100',
176 # Dash video
177 '138', '137', '248', '136', '247', '135', '246',
178 '245', '244', '134', '243', '133', '242', '160',
179 # Dash audio
180 '141', '172', '140', '171', '139',
1d043b93 181 ]
bdc6b3fc 182 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
96fb5605 183 # Apple HTTP Live Streaming
bdc6b3fc
AZ
184 '96', '95', '94', '93', '92', '132', '151',
185 # 3D
86fe61c8 186 '85', '102', '84', '101', '83', '100', '82',
939fbd26
JMF
187 # Dash video
188 '138', '248', '137', '247', '136', '246', '245',
189 '244', '135', '243', '134', '242', '133', '160',
190 # Dash audio
191 '172', '141', '171', '140', '139',
1d043b93 192 ]
bdc6b3fc
AZ
193 _video_formats_map = {
194 'flv': ['35', '34', '6', '5'],
195 '3gp': ['36', '17', '13'],
196 'mp4': ['38', '37', '22', '18'],
197 'webm': ['46', '45', '44', '43'],
198 }
c5e8d7af
PH
199 _video_extensions = {
200 '13': '3gp',
bdc6b3fc 201 '17': '3gp',
c5e8d7af
PH
202 '18': 'mp4',
203 '22': 'mp4',
bdc6b3fc 204 '36': '3gp',
c5e8d7af 205 '37': 'mp4',
d69cf69a 206 '38': 'mp4',
c5e8d7af
PH
207 '43': 'webm',
208 '44': 'webm',
209 '45': 'webm',
210 '46': 'webm',
1d043b93 211
86fe61c8
AZ
212 # 3d videos
213 '82': 'mp4',
214 '83': 'mp4',
215 '84': 'mp4',
216 '85': 'mp4',
217 '100': 'webm',
218 '101': 'webm',
219 '102': 'webm',
836a086c 220
96fb5605 221 # Apple HTTP Live Streaming
1d043b93
JMF
222 '92': 'mp4',
223 '93': 'mp4',
224 '94': 'mp4',
225 '95': 'mp4',
226 '96': 'mp4',
227 '132': 'mp4',
228 '151': 'mp4',
836a086c
AZ
229
230 # Dash mp4
231 '133': 'mp4',
232 '134': 'mp4',
233 '135': 'mp4',
234 '136': 'mp4',
235 '137': 'mp4',
236 '138': 'mp4',
237 '139': 'mp4',
238 '140': 'mp4',
239 '141': 'mp4',
240 '160': 'mp4',
241
242 # Dash webm
243 '171': 'webm',
244 '172': 'webm',
245 '242': 'webm',
246 '243': 'webm',
247 '244': 'webm',
248 '245': 'webm',
249 '246': 'webm',
250 '247': 'webm',
251 '248': 'webm',
c5e8d7af
PH
252 }
253 _video_dimensions = {
254 '5': '240x400',
255 '6': '???',
256 '13': '???',
257 '17': '144x176',
258 '18': '360x640',
259 '22': '720x1280',
260 '34': '360x640',
261 '35': '480x854',
bdc6b3fc 262 '36': '240x320',
c5e8d7af
PH
263 '37': '1080x1920',
264 '38': '3072x4096',
265 '43': '360x640',
266 '44': '480x854',
267 '45': '720x1280',
268 '46': '1080x1920',
86fe61c8
AZ
269 '82': '360p',
270 '83': '480p',
271 '84': '720p',
272 '85': '1080p',
1d043b93
JMF
273 '92': '240p',
274 '93': '360p',
275 '94': '480p',
276 '95': '720p',
277 '96': '1080p',
86fe61c8
AZ
278 '100': '360p',
279 '101': '480p',
836a086c 280 '102': '720p',
1d043b93
JMF
281 '132': '240p',
282 '151': '72p',
836a086c
AZ
283 '133': '240p',
284 '134': '360p',
285 '135': '480p',
286 '136': '720p',
287 '137': '1080p',
288 '138': '>1080p',
289 '139': '48k',
290 '140': '128k',
291 '141': '256k',
292 '160': '192p',
293 '171': '128k',
294 '172': '256k',
295 '242': '240p',
296 '243': '360p',
297 '244': '480p',
298 '245': '480p',
299 '246': '480p',
300 '247': '720p',
301 '248': '1080p',
c5e8d7af 302 }
836a086c
AZ
303 _special_itags = {
304 '82': '3D',
305 '83': '3D',
306 '84': '3D',
307 '85': '3D',
308 '100': '3D',
309 '101': '3D',
310 '102': '3D',
311 '133': 'DASH Video',
312 '134': 'DASH Video',
313 '135': 'DASH Video',
314 '136': 'DASH Video',
315 '137': 'DASH Video',
316 '138': 'DASH Video',
317 '139': 'DASH Audio',
318 '140': 'DASH Audio',
319 '141': 'DASH Audio',
320 '160': 'DASH Video',
321 '171': 'DASH Audio',
322 '172': 'DASH Audio',
323 '242': 'DASH Video',
324 '243': 'DASH Video',
325 '244': 'DASH Video',
326 '245': 'DASH Video',
327 '246': 'DASH Video',
328 '247': 'DASH Video',
329 '248': 'DASH Video',
c5e8d7af 330 }
836a086c 331
c5e8d7af 332 IE_NAME = u'youtube'
2eb88d95
PH
333 _TESTS = [
334 {
0e853ca4
PH
335 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
336 u"file": u"BaW_jenozKc.mp4",
337 u"info_dict": {
338 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
339 u"uploader": u"Philipp Hagemeister",
340 u"uploader_id": u"phihag",
341 u"upload_date": u"20121002",
342 u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
2eb88d95 343 }
0e853ca4
PH
344 },
345 {
346 u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
347 u"file": u"1ltcDfZMA3U.flv",
348 u"note": u"Test VEVO video (#897)",
349 u"info_dict": {
350 u"upload_date": u"20070518",
351 u"title": u"Maps - It Will Find You",
352 u"description": u"Music video by Maps performing It Will Find You.",
353 u"uploader": u"MuteUSA",
354 u"uploader_id": u"MuteUSA"
2eb88d95 355 }
0e853ca4
PH
356 },
357 {
358 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
359 u"file": u"UxxajLWwzqY.mp4",
360 u"note": u"Test generic use_cipher_signature video (#897)",
361 u"info_dict": {
362 u"upload_date": u"20120506",
363 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
c7bf7366 364 u"description": u"md5:3e2666e0a55044490499ea45fe9037b7",
45ed795c 365 u"uploader": u"Icona Pop",
0e853ca4 366 u"uploader_id": u"IconaPop"
2eb88d95 367 }
c108eb73
JMF
368 },
369 {
370 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
371 u"file": u"07FYdnEawAQ.mp4",
372 u"note": u"Test VEVO video with age protection (#956)",
373 u"info_dict": {
374 u"upload_date": u"20130703",
375 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
376 u"description": u"md5:64249768eec3bc4276236606ea996373",
377 u"uploader": u"justintimberlakeVEVO",
378 u"uploader_id": u"justintimberlakeVEVO"
379 }
380 },
1d043b93
JMF
381 {
382 u'url': u'https://www.youtube.com/watch?v=TGi3HqYrWHE',
383 u'file': u'TGi3HqYrWHE.mp4',
384 u'note': u'm3u8 video',
385 u'info_dict': {
386 u'title': u'Triathlon - Men - London 2012 Olympic Games',
387 u'description': u'- Men - TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games',
388 u'uploader': u'olympic',
389 u'upload_date': u'20120807',
390 u'uploader_id': u'olympic',
391 },
392 u'params': {
393 u'skip_download': True,
394 },
395 },
2eb88d95
PH
396 ]
397
c5e8d7af
PH
398
399 @classmethod
400 def suitable(cls, url):
401 """Receives a URL and returns True if suitable for this IE."""
e3ea4790 402 if YoutubePlaylistIE.suitable(url): return False
c5e8d7af
PH
403 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
404
e0df6211
PH
405 def __init__(self, *args, **kwargs):
406 super(YoutubeIE, self).__init__(*args, **kwargs)
83799698 407 self._player_cache = {}
e0df6211 408
c5e8d7af
PH
409 def report_video_webpage_download(self, video_id):
410 """Report attempt to download video webpage."""
411 self.to_screen(u'%s: Downloading video webpage' % video_id)
412
413 def report_video_info_webpage_download(self, video_id):
414 """Report attempt to download video info webpage."""
415 self.to_screen(u'%s: Downloading video info webpage' % video_id)
416
c5e8d7af
PH
417 def report_information_extraction(self, video_id):
418 """Report attempt to extract video information."""
419 self.to_screen(u'%s: Extracting video information' % video_id)
420
421 def report_unavailable_format(self, video_id, format):
422 """Report extracted video URL."""
423 self.to_screen(u'%s: Format %s not available' % (video_id, format))
424
425 def report_rtmp_download(self):
426 """Indicate the download will use the RTMP protocol."""
427 self.to_screen(u'RTMP download detected')
428
c4417ddb
PH
429 def _extract_signature_function(self, video_id, player_url, slen):
430 id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
83799698 431 player_url)
e0df6211
PH
432 player_type = id_m.group('ext')
433 player_id = id_m.group('id')
434
c4417ddb
PH
435 # Read from filesystem cache
436 func_id = '%s_%s_%d' % (player_type, player_id, slen)
437 assert os.path.basename(func_id) == func_id
edf3e38e
PH
438 cache_dir = self._downloader.params.get('cachedir',
439 u'~/.youtube-dl/cache')
c4417ddb 440
c3c88a26 441 cache_enabled = cache_dir is not None
f8061589 442 if cache_enabled:
c4417ddb
PH
443 cache_fn = os.path.join(os.path.expanduser(cache_dir),
444 u'youtube-sigfuncs',
445 func_id + '.json')
446 try:
edf3e38e 447 with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
c4417ddb
PH
448 cache_spec = json.load(cachef)
449 return lambda s: u''.join(s[i] for i in cache_spec)
edf3e38e 450 except IOError:
c4417ddb 451 pass # No cache available
83799698 452
e0df6211
PH
453 if player_type == 'js':
454 code = self._download_webpage(
455 player_url, video_id,
83799698 456 note=u'Downloading %s player %s' % (player_type, player_id),
e0df6211 457 errnote=u'Download of %s failed' % player_url)
83799698 458 res = self._parse_sig_js(code)
c4417ddb 459 elif player_type == 'swf':
e0df6211
PH
460 urlh = self._request_webpage(
461 player_url, video_id,
83799698 462 note=u'Downloading %s player %s' % (player_type, player_id),
e0df6211
PH
463 errnote=u'Download of %s failed' % player_url)
464 code = urlh.read()
83799698 465 res = self._parse_sig_swf(code)
e0df6211
PH
466 else:
467 assert False, 'Invalid player type %r' % player_type
468
f8061589 469 if cache_enabled:
edf3e38e 470 try:
c705320f
PH
471 test_string = u''.join(map(compat_chr, range(slen)))
472 cache_res = res(test_string)
edf3e38e
PH
473 cache_spec = [ord(c) for c in cache_res]
474 try:
475 os.makedirs(os.path.dirname(cache_fn))
476 except OSError as ose:
477 if ose.errno != errno.EEXIST:
478 raise
479 write_json_file(cache_spec, cache_fn)
0ca96d48 480 except Exception:
edf3e38e
PH
481 tb = traceback.format_exc()
482 self._downloader.report_warning(
483 u'Writing cache to %r failed: %s' % (cache_fn, tb))
83799698
PH
484
485 return res
486
edf3e38e
PH
487 def _print_sig_code(self, func, slen):
488 def gen_sig_code(idxs):
489 def _genslice(start, end, step):
490 starts = u'' if start == 0 else str(start)
e35e4ddc
PH
491 ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
492 steps = u'' if step == 1 else (u':%d' % step)
edf3e38e
PH
493 return u's[%s%s%s]' % (starts, ends, steps)
494
495 step = None
0ca96d48
PH
496 start = '(Never used)' # Quelch pyflakes warnings - start will be
497 # set as soon as step is set
edf3e38e
PH
498 for i, prev in zip(idxs[1:], idxs[:-1]):
499 if step is not None:
500 if i - prev == step:
501 continue
502 yield _genslice(start, prev, step)
503 step = None
504 continue
505 if i - prev in [-1, 1]:
506 step = i - prev
507 start = prev
508 continue
509 else:
510 yield u's[%d]' % prev
511 if step is None:
512 yield u's[%d]' % i
513 else:
514 yield _genslice(start, i, step)
515
c705320f
PH
516 test_string = u''.join(map(compat_chr, range(slen)))
517 cache_res = func(test_string)
edf3e38e
PH
518 cache_spec = [ord(c) for c in cache_res]
519 expr_code = u' + '.join(gen_sig_code(cache_spec))
520 code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
f8061589 521 self.to_screen(u'Extracted signature function:\n' + code)
edf3e38e 522
e0df6211
PH
523 def _parse_sig_js(self, jscode):
524 funcname = self._search_regex(
525 r'signature=([a-zA-Z]+)', jscode,
526 u'Initial JS player signature function name')
527
528 functions = {}
529
530 def argidx(varname):
531 return string.lowercase.index(varname)
532
533 def interpret_statement(stmt, local_vars, allow_recursion=20):
534 if allow_recursion < 0:
0ca96d48 535 raise ExtractorError(u'Recursion limit reached')
e0df6211
PH
536
537 if stmt.startswith(u'var '):
538 stmt = stmt[len(u'var '):]
539 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
540 r'=(?P<expr>.*)$', stmt)
541 if ass_m:
542 if ass_m.groupdict().get('index'):
543 def assign(val):
544 lvar = local_vars[ass_m.group('out')]
545 idx = interpret_expression(ass_m.group('index'),
546 local_vars, allow_recursion)
547 assert isinstance(idx, int)
548 lvar[idx] = val
549 return val
550 expr = ass_m.group('expr')
551 else:
552 def assign(val):
553 local_vars[ass_m.group('out')] = val
554 return val
555 expr = ass_m.group('expr')
556 elif stmt.startswith(u'return '):
557 assign = lambda v: v
558 expr = stmt[len(u'return '):]
559 else:
560 raise ExtractorError(
561 u'Cannot determine left side of statement in %r' % stmt)
562
563 v = interpret_expression(expr, local_vars, allow_recursion)
564 return assign(v)
565
566 def interpret_expression(expr, local_vars, allow_recursion):
567 if expr.isdigit():
568 return int(expr)
569
570 if expr.isalpha():
571 return local_vars[expr]
572
573 m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
574 if m:
575 member = m.group('member')
576 val = local_vars[m.group('in')]
577 if member == 'split("")':
578 return list(val)
579 if member == 'join("")':
580 return u''.join(val)
581 if member == 'length':
582 return len(val)
583 if member == 'reverse()':
584 return val[::-1]
585 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
586 if slice_m:
587 idx = interpret_expression(
588 slice_m.group('idx'), local_vars, allow_recursion-1)
589 return val[idx:]
590
591 m = re.match(
592 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
593 if m:
594 val = local_vars[m.group('in')]
595 idx = interpret_expression(m.group('idx'), local_vars,
596 allow_recursion-1)
597 return val[idx]
598
599 m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
600 if m:
601 a = interpret_expression(m.group('a'),
602 local_vars, allow_recursion)
603 b = interpret_expression(m.group('b'),
604 local_vars, allow_recursion)
605 return a % b
606
607 m = re.match(
608 r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
609 if m:
610 fname = m.group('func')
611 if fname not in functions:
612 functions[fname] = extract_function(fname)
613 argvals = [int(v) if v.isdigit() else local_vars[v]
614 for v in m.group('args').split(',')]
615 return functions[fname](argvals)
616 raise ExtractorError(u'Unsupported JS expression %r' % expr)
617
618 def extract_function(funcname):
619 func_m = re.search(
620 r'function ' + re.escape(funcname) +
621 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
622 jscode)
623 argnames = func_m.group('args').split(',')
624
625 def resf(args):
626 local_vars = dict(zip(argnames, args))
627 for stmt in func_m.group('code').split(';'):
628 res = interpret_statement(stmt, local_vars)
629 return res
630 return resf
631
632 initial_function = extract_function(funcname)
633 return lambda s: initial_function([s])
634
635 def _parse_sig_swf(self, file_contents):
636 if file_contents[1:3] != b'WS':
637 raise ExtractorError(
638 u'Not an SWF file; header is %r' % file_contents[:3])
639 if file_contents[:1] == b'C':
640 content = zlib.decompress(file_contents[8:])
641 else:
642 raise NotImplementedError(u'Unsupported compression format %r' %
643 file_contents[:1])
644
645 def extract_tags(content):
646 pos = 0
647 while pos < len(content):
648 header16 = struct.unpack('<H', content[pos:pos+2])[0]
649 pos += 2
650 tag_code = header16 >> 6
651 tag_len = header16 & 0x3f
652 if tag_len == 0x3f:
653 tag_len = struct.unpack('<I', content[pos:pos+4])[0]
654 pos += 4
655 assert pos+tag_len <= len(content)
656 yield (tag_code, content[pos:pos+tag_len])
657 pos += tag_len
658
659 code_tag = next(tag
660 for tag_code, tag in extract_tags(content)
661 if tag_code == 82)
662 p = code_tag.index(b'\0', 4) + 1
ba552f54 663 code_reader = io.BytesIO(code_tag[p:])
e0df6211
PH
664
665 # Parse ABC (AVM2 ByteCode)
ba552f54
PH
666 def read_int(reader=None):
667 if reader is None:
668 reader = code_reader
e0df6211
PH
669 res = 0
670 shift = 0
671 for _ in range(5):
ba552f54
PH
672 buf = reader.read(1)
673 assert len(buf) == 1
674 b = struct.unpack('<B', buf)[0]
e0df6211
PH
675 res = res | ((b & 0x7f) << shift)
676 if b & 0x80 == 0:
677 break
678 shift += 7
ba552f54
PH
679 return res
680
681 def u30(reader=None):
682 res = read_int(reader)
683 assert res & 0xf0000000 == 0
e0df6211
PH
684 return res
685 u32 = read_int
686
ba552f54
PH
687 def s32(reader=None):
688 v = read_int(reader)
e0df6211
PH
689 if v & 0x80000000 != 0:
690 v = - ((v ^ 0xffffffff) + 1)
ba552f54
PH
691 return v
692
0ca96d48 693 def read_string(reader=None):
ba552f54
PH
694 if reader is None:
695 reader = code_reader
696 slen = u30(reader)
697 resb = reader.read(slen)
698 assert len(resb) == slen
699 return resb.decode('utf-8')
700
701 def read_bytes(count, reader=None):
702 if reader is None:
703 reader = code_reader
704 resb = reader.read(count)
705 assert len(resb) == count
706 return resb
707
708 def read_byte(reader=None):
709 resb = read_bytes(1, reader=reader)
710 res = struct.unpack('<B', resb)[0]
711 return res
e0df6211
PH
712
713 # minor_version + major_version
0ca96d48 714 read_bytes(2 + 2)
e0df6211
PH
715
716 # Constant pool
ba552f54 717 int_count = u30()
e0df6211 718 for _c in range(1, int_count):
0ca96d48 719 s32()
ba552f54 720 uint_count = u30()
e0df6211 721 for _c in range(1, uint_count):
0ca96d48 722 u32()
ba552f54 723 double_count = u30()
0ca96d48 724 read_bytes((double_count-1) * 8)
ba552f54 725 string_count = u30()
e0df6211
PH
726 constant_strings = [u'']
727 for _c in range(1, string_count):
0ca96d48 728 s = read_string()
e0df6211 729 constant_strings.append(s)
ba552f54 730 namespace_count = u30()
e0df6211 731 for _c in range(1, namespace_count):
0ca96d48
PH
732 read_bytes(1) # kind
733 u30() # name
ba552f54 734 ns_set_count = u30()
e0df6211 735 for _c in range(1, ns_set_count):
ba552f54 736 count = u30()
e0df6211 737 for _c2 in range(count):
0ca96d48 738 u30()
ba552f54 739 multiname_count = u30()
e0df6211
PH
740 MULTINAME_SIZES = {
741 0x07: 2, # QName
742 0x0d: 2, # QNameA
743 0x0f: 1, # RTQName
744 0x10: 1, # RTQNameA
745 0x11: 0, # RTQNameL
746 0x12: 0, # RTQNameLA
747 0x09: 2, # Multiname
748 0x0e: 2, # MultinameA
749 0x1b: 1, # MultinameL
750 0x1c: 1, # MultinameLA
751 }
752 multinames = [u'']
753 for _c in range(1, multiname_count):
ba552f54 754 kind = u30()
e0df6211
PH
755 assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
756 if kind == 0x07:
0ca96d48 757 u30() # namespace_idx
ba552f54 758 name_idx = u30()
e0df6211
PH
759 multinames.append(constant_strings[name_idx])
760 else:
761 multinames.append('[MULTINAME kind: %d]' % kind)
762 for _c2 in range(MULTINAME_SIZES[kind]):
0ca96d48 763 u30()
e0df6211
PH
764
765 # Methods
ba552f54 766 method_count = u30()
e0df6211
PH
767 MethodInfo = collections.namedtuple(
768 'MethodInfo',
769 ['NEED_ARGUMENTS', 'NEED_REST'])
770 method_infos = []
771 for method_id in range(method_count):
ba552f54 772 param_count = u30()
0ca96d48 773 u30() # return type
e0df6211 774 for _ in range(param_count):
0ca96d48
PH
775 u30() # param type
776 u30() # name index (always 0 for youtube)
ba552f54 777 flags = read_byte()
e0df6211
PH
778 if flags & 0x08 != 0:
779 # Options present
ba552f54 780 option_count = u30()
e0df6211 781 for c in range(option_count):
0ca96d48
PH
782 u30() # val
783 read_bytes(1) # kind
e0df6211
PH
784 if flags & 0x80 != 0:
785 # Param names present
786 for _ in range(param_count):
0ca96d48 787 u30() # param name
e0df6211
PH
788 mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
789 method_infos.append(mi)
790
791 # Metadata
ba552f54 792 metadata_count = u30()
e0df6211 793 for _c in range(metadata_count):
0ca96d48 794 u30() # name
ba552f54 795 item_count = u30()
e0df6211 796 for _c2 in range(item_count):
0ca96d48
PH
797 u30() # key
798 u30() # value
ba552f54
PH
799
800 def parse_traits_info():
801 trait_name_idx = u30()
802 kind_full = read_byte()
e0df6211
PH
803 kind = kind_full & 0x0f
804 attrs = kind_full >> 4
805 methods = {}
806 if kind in [0x00, 0x06]: # Slot or Const
0ca96d48
PH
807 u30() # Slot id
808 u30() # type_name_idx
ba552f54 809 vindex = u30()
e0df6211 810 if vindex != 0:
0ca96d48 811 read_byte() # vkind
e0df6211 812 elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
0ca96d48 813 u30() # disp_id
ba552f54 814 method_idx = u30()
e0df6211
PH
815 methods[multinames[trait_name_idx]] = method_idx
816 elif kind == 0x04: # Class
0ca96d48
PH
817 u30() # slot_id
818 u30() # classi
e0df6211 819 elif kind == 0x05: # Function
0ca96d48 820 u30() # slot_id
ba552f54 821 function_idx = u30()
e0df6211
PH
822 methods[function_idx] = multinames[trait_name_idx]
823 else:
824 raise ExtractorError(u'Unsupported trait kind %d' % kind)
825
826 if attrs & 0x4 != 0: # Metadata present
ba552f54 827 metadata_count = u30()
e0df6211 828 for _c3 in range(metadata_count):
0ca96d48 829 u30() # metadata index
e0df6211 830
ba552f54 831 return methods
e0df6211
PH
832
833 # Classes
834 TARGET_CLASSNAME = u'SignatureDecipher'
835 searched_idx = multinames.index(TARGET_CLASSNAME)
836 searched_class_id = None
ba552f54 837 class_count = u30()
e0df6211 838 for class_id in range(class_count):
ba552f54 839 name_idx = u30()
e0df6211
PH
840 if name_idx == searched_idx:
841 # We found the class we're looking for!
842 searched_class_id = class_id
0ca96d48 843 u30() # super_name idx
ba552f54 844 flags = read_byte()
e0df6211 845 if flags & 0x08 != 0: # Protected namespace is present
0ca96d48 846 u30() # protected_ns_idx
ba552f54 847 intrf_count = u30()
e0df6211 848 for _c2 in range(intrf_count):
0ca96d48
PH
849 u30()
850 u30() # iinit
ba552f54 851 trait_count = u30()
e0df6211 852 for _c2 in range(trait_count):
0ca96d48 853 parse_traits_info()
e0df6211
PH
854
855 if searched_class_id is None:
856 raise ExtractorError(u'Target class %r not found' %
857 TARGET_CLASSNAME)
858
859 method_names = {}
860 method_idxs = {}
861 for class_id in range(class_count):
0ca96d48 862 u30() # cinit
ba552f54 863 trait_count = u30()
e0df6211 864 for _c2 in range(trait_count):
ba552f54 865 trait_methods = parse_traits_info()
e0df6211
PH
866 if class_id == searched_class_id:
867 method_names.update(trait_methods.items())
868 method_idxs.update(dict(
869 (idx, name)
870 for name, idx in trait_methods.items()))
871
872 # Scripts
ba552f54 873 script_count = u30()
e0df6211 874 for _c in range(script_count):
0ca96d48 875 u30() # init
ba552f54 876 trait_count = u30()
e0df6211 877 for _c2 in range(trait_count):
0ca96d48 878 parse_traits_info()
e0df6211
PH
879
880 # Method bodies
ba552f54 881 method_body_count = u30()
e0df6211
PH
882 Method = collections.namedtuple('Method', ['code', 'local_count'])
883 methods = {}
884 for _c in range(method_body_count):
ba552f54 885 method_idx = u30()
0ca96d48 886 u30() # max_stack
ba552f54 887 local_count = u30()
0ca96d48
PH
888 u30() # init_scope_depth
889 u30() # max_scope_depth
ba552f54
PH
890 code_length = u30()
891 code = read_bytes(code_length)
e0df6211 892 if method_idx in method_idxs:
ba552f54 893 m = Method(code, local_count)
e0df6211 894 methods[method_idxs[method_idx]] = m
ba552f54 895 exception_count = u30()
e0df6211 896 for _c2 in range(exception_count):
0ca96d48
PH
897 u30() # from
898 u30() # to
899 u30() # target
900 u30() # exc_type
901 u30() # var_name
ba552f54 902 trait_count = u30()
e0df6211 903 for _c2 in range(trait_count):
0ca96d48 904 parse_traits_info()
e0df6211 905
ba552f54 906 assert p + code_reader.tell() == len(code_tag)
e0df6211
PH
907 assert len(methods) == len(method_idxs)
908
909 method_pyfunctions = {}
910
911 def extract_function(func_name):
912 if func_name in method_pyfunctions:
913 return method_pyfunctions[func_name]
914 if func_name not in methods:
915 raise ExtractorError(u'Cannot find function %r' % func_name)
916 m = methods[func_name]
917
918 def resfunc(args):
e0df6211
PH
919 registers = ['(this)'] + list(args) + [None] * m.local_count
920 stack = []
921 coder = io.BytesIO(m.code)
922 while True:
923 opcode = struct.unpack('!B', coder.read(1))[0]
a7177865 924 if opcode == 36: # pushbyte
e0df6211
PH
925 v = struct.unpack('!B', coder.read(1))[0]
926 stack.append(v)
927 elif opcode == 44: # pushstring
928 idx = u30(coder)
929 stack.append(constant_strings[idx])
930 elif opcode == 48: # pushscope
931 # We don't implement the scope register, so we'll just
932 # ignore the popped value
933 stack.pop()
934 elif opcode == 70: # callproperty
935 index = u30(coder)
936 mname = multinames[index]
937 arg_count = u30(coder)
938 args = list(reversed(
939 [stack.pop() for _ in range(arg_count)]))
940 obj = stack.pop()
941 if mname == u'split':
942 assert len(args) == 1
943 assert isinstance(args[0], compat_str)
944 assert isinstance(obj, compat_str)
945 if args[0] == u'':
946 res = list(obj)
947 else:
948 res = obj.split(args[0])
949 stack.append(res)
a7177865
PH
950 elif mname == u'slice':
951 assert len(args) == 1
952 assert isinstance(args[0], int)
953 assert isinstance(obj, list)
954 res = obj[args[0]:]
955 stack.append(res)
956 elif mname == u'join':
957 assert len(args) == 1
958 assert isinstance(args[0], compat_str)
959 assert isinstance(obj, list)
960 res = args[0].join(obj)
961 stack.append(res)
e0df6211
PH
962 elif mname in method_pyfunctions:
963 stack.append(method_pyfunctions[mname](args))
964 else:
965 raise NotImplementedError(
966 u'Unsupported property %r on %r'
967 % (mname, obj))
a7177865
PH
968 elif opcode == 72: # returnvalue
969 res = stack.pop()
970 return res
971 elif opcode == 79: # callpropvoid
972 index = u30(coder)
973 mname = multinames[index]
974 arg_count = u30(coder)
975 args = list(reversed(
976 [stack.pop() for _ in range(arg_count)]))
977 obj = stack.pop()
978 if mname == u'reverse':
979 assert isinstance(obj, list)
980 obj.reverse()
981 else:
982 raise NotImplementedError(
983 u'Unsupported (void) property %r on %r'
984 % (mname, obj))
e0df6211
PH
985 elif opcode == 93: # findpropstrict
986 index = u30(coder)
987 mname = multinames[index]
988 res = extract_function(mname)
989 stack.append(res)
990 elif opcode == 97: # setproperty
991 index = u30(coder)
992 value = stack.pop()
993 idx = stack.pop()
994 obj = stack.pop()
995 assert isinstance(obj, list)
996 assert isinstance(idx, int)
997 obj[idx] = value
998 elif opcode == 98: # getlocal
999 index = u30(coder)
1000 stack.append(registers[index])
1001 elif opcode == 99: # setlocal
1002 index = u30(coder)
1003 value = stack.pop()
1004 registers[index] = value
1005 elif opcode == 102: # getproperty
1006 index = u30(coder)
1007 pname = multinames[index]
1008 if pname == u'length':
1009 obj = stack.pop()
1010 assert isinstance(obj, list)
1011 stack.append(len(obj))
1012 else: # Assume attribute access
1013 idx = stack.pop()
1014 assert isinstance(idx, int)
1015 obj = stack.pop()
1016 assert isinstance(obj, list)
1017 stack.append(obj[idx])
1018 elif opcode == 128: # coerce
0ca96d48 1019 u30(coder)
e0df6211
PH
1020 elif opcode == 133: # coerce_s
1021 assert isinstance(stack[-1], (type(None), compat_str))
1022 elif opcode == 164: # modulo
1023 value2 = stack.pop()
1024 value1 = stack.pop()
1025 res = value1 % value2
1026 stack.append(res)
a7177865
PH
1027 elif opcode == 208: # getlocal_0
1028 stack.append(registers[0])
1029 elif opcode == 209: # getlocal_1
1030 stack.append(registers[1])
1031 elif opcode == 210: # getlocal_2
1032 stack.append(registers[2])
1033 elif opcode == 211: # getlocal_3
1034 stack.append(registers[3])
e0df6211
PH
1035 elif opcode == 214: # setlocal_2
1036 registers[2] = stack.pop()
1037 elif opcode == 215: # setlocal_3
1038 registers[3] = stack.pop()
1039 else:
1040 raise NotImplementedError(
1041 u'Unsupported opcode %d' % opcode)
1042
1043 method_pyfunctions[func_name] = resfunc
1044 return resfunc
1045
1046 initial_function = extract_function(u'decipher')
1047 return lambda s: initial_function([s])
1048
83799698 1049 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 1050 """Turn the encrypted s field into a working signature"""
6b37f0be 1051
83799698 1052 if player_url is not None:
e0df6211 1053 try:
83799698
PH
1054 if player_url not in self._player_cache:
1055 func = self._extract_signature_function(
c4417ddb 1056 video_id, player_url, len(s)
e0df6211 1057 )
83799698 1058 self._player_cache[player_url] = func
edf3e38e
PH
1059 func = self._player_cache[player_url]
1060 if self._downloader.params.get('youtube_print_sig_code'):
1061 self._print_sig_code(func, len(s))
1062 return func(s)
0ca96d48 1063 except Exception:
e0df6211 1064 tb = traceback.format_exc()
83799698
PH
1065 self._downloader.report_warning(
1066 u'Automatic signature extraction failed: ' + tb)
e0df6211 1067
d2d8f895
PH
1068 self._downloader.report_warning(
1069 u'Warning: Falling back to static signature algorithm')
2f2ffea9
PH
1070 return self._static_decrypt_signature(
1071 s, video_id, player_url, age_gate)
e0df6211 1072
2f2ffea9 1073 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
e0df6211
PH
1074 if age_gate:
1075 # The videos with age protection use another player, so the
1076 # algorithms can be different.
1077 if len(s) == 86:
1078 return s[2:63] + s[82] + s[64:82] + s[63]
1079
4ba146f3
PH
1080 if len(s) == 93:
1081 return s[86:29:-1] + s[88] + s[28:5:-1]
1082 elif len(s) == 92:
444b1165 1083 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
4ba146f3
PH
1084 elif len(s) == 91:
1085 return s[84:27:-1] + s[86] + s[26:5:-1]
444b1165
JMF
1086 elif len(s) == 90:
1087 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
8a9d86a2 1088 elif len(s) == 89:
1089 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
444b1165 1090 elif len(s) == 88:
3e223834 1091 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
be547e1d 1092 elif len(s) == 87:
3a725669 1093 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
be547e1d 1094 elif len(s) == 86:
1cf911bc 1095 return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53]
be547e1d 1096 elif len(s) == 85:
6ae8ee3f 1097 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
be547e1d 1098 elif len(s) == 84:
23b00bc0 1099 return s[81:36:-1] + s[0] + s[35:2:-1]
be547e1d 1100 elif len(s) == 83:
e1842025 1101 return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]
be547e1d 1102 elif len(s) == 82:
ce85f022 1103 return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54]
be547e1d 1104 elif len(s) == 81:
aedd6bb9 1105 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
066090dd
JMF
1106 elif len(s) == 80:
1107 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
5c468ca8
JMF
1108 elif len(s) == 79:
1109 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
be547e1d
PH
1110
1111 else:
1112 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
c5e8d7af 1113
def _get_available_subtitles(self, video_id):
    """Return a ``{lang_code: url}`` dict for the subtitle tracks of a video.

    The track listing is fetched from the timedtext service.  An empty
    dict is returned (after a warning) when the listing cannot be
    downloaded or contains no track.
    """
    list_url = 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        listing = self._download_webpage(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
        return {}

    # Each match is a (name, lang_code) pair; only the code is used.
    tracks = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', listing)

    urls_by_lang = {}
    for _name, lang_code in tracks:
        query = compat_urllib_parse.urlencode({
            'lang': lang_code,
            'v': video_id,
            'fmt': self._downloader.params.get('subtitlesformat'),
        })
        urls_by_lang[lang_code] = u'http://www.youtube.com/api/timedtext?' + query

    if urls_by_lang:
        return urls_by_lang
    self._downloader.report_warning(u'video doesn\'t have subtitles')
    return {}
1138
055e6f36 1139 def _get_available_automatic_caption(self, video_id, webpage):
de7f3446
JMF
1140 """We need the webpage for getting the captions url, pass it as an
1141 argument to speed up the process."""
de7f3446
JMF
1142 sub_format = self._downloader.params.get('subtitlesformat')
1143 self.to_screen(u'%s: Looking for automatic captions' % video_id)
1144 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
055e6f36 1145 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
de7f3446
JMF
1146 if mobj is None:
1147 self._downloader.report_warning(err_msg)
1148 return {}
1149 player_config = json.loads(mobj.group(1))
1150 try:
1151 args = player_config[u'args']
1152 caption_url = args[u'ttsurl']
1153 timestamp = args[u'timestamp']
055e6f36
JMF
1154 # We get the available subtitles
1155 list_params = compat_urllib_parse.urlencode({
1156 'type': 'list',
1157 'tlangs': 1,
1158 'asrs': 1,
de7f3446 1159 })
055e6f36
JMF
1160 list_url = caption_url + '&' + list_params
1161 list_page = self._download_webpage(list_url, video_id)
1162 caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
e3dc22ca
JMF
1163 original_lang_node = caption_list.find('track')
1164 if original_lang_node.attrib.get('kind') != 'asr' :
1165 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
1166 return {}
1167 original_lang = original_lang_node.attrib['lang_code']
055e6f36
JMF
1168
1169 sub_lang_list = {}
1170 for lang_node in caption_list.findall('target'):
1171 sub_lang = lang_node.attrib['lang_code']
1172 params = compat_urllib_parse.urlencode({
1173 'lang': original_lang,
1174 'tlang': sub_lang,
1175 'fmt': sub_format,
1176 'ts': timestamp,
1177 'kind': 'asr',
1178 })
1179 sub_lang_list[sub_lang] = caption_url + '&' + params
1180 return sub_lang_list
de7f3446
JMF
1181 # An extractor error can be raise by the download process if there are
1182 # no automatic captions but there are subtitles
1183 except (KeyError, ExtractorError):
1184 self._downloader.report_warning(err_msg)
1185 return {}
1186
c5e8d7af
PH
1187 def _print_formats(self, formats):
1188 print('Available formats:')
1189 for x in formats:
03cc7c20
JMF
1190 print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
1191 self._video_dimensions.get(x, '???'),
836a086c 1192 ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
c5e8d7af
PH
1193
1194 def _extract_id(self, url):
1195 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1196 if mobj is None:
1197 raise ExtractorError(u'Invalid URL: %s' % url)
1198 video_id = mobj.group(2)
1199 return video_id
1200
1d043b93
JMF
1201 def _get_video_url_list(self, url_map):
1202 """
1203 Transform a dictionary in the format {itag:url} to a list of (itag, url)
1204 with the requested formats.
1205 """
1206 req_format = self._downloader.params.get('format', None)
1207 format_limit = self._downloader.params.get('format_limit', None)
1208 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1209 if format_limit is not None and format_limit in available_formats:
1210 format_list = available_formats[available_formats.index(format_limit):]
1211 else:
1212 format_list = available_formats
1213 existing_formats = [x for x in format_list if x in url_map]
1214 if len(existing_formats) == 0:
1215 raise ExtractorError(u'no known formats available for video')
1216 if self._downloader.params.get('listformats', None):
1217 self._print_formats(existing_formats)
1218 return
1219 if req_format is None or req_format == 'best':
1220 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1221 elif req_format == 'worst':
1222 video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
1223 elif req_format in ('-1', 'all'):
1224 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1225 else:
1226 # Specific formats. We pick the first in a slash-delimeted sequence.
bdc6b3fc
AZ
1227 # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
1228 # available in the specified format. For example,
1229 # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1230 # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
1231 # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
1d043b93
JMF
1232 req_formats = req_format.split('/')
1233 video_url_list = None
1234 for rf in req_formats:
1235 if rf in url_map:
1236 video_url_list = [(rf, url_map[rf])]
1237 break
bdc6b3fc
AZ
1238 if rf in self._video_formats_map:
1239 for srf in self._video_formats_map[rf]:
1240 if srf in url_map:
1241 video_url_list = [(srf, url_map[srf])]
1242 break
1243 else:
1244 continue
1245 break
1d043b93
JMF
1246 if video_url_list is None:
1247 raise ExtractorError(u'requested format not available')
1248 return video_url_list
1249
1250 def _extract_from_m3u8(self, manifest_url, video_id):
1251 url_map = {}
1252 def _get_urls(_manifest):
1253 lines = _manifest.split('\n')
1254 urls = filter(lambda l: l and not l.startswith('#'),
1255 lines)
1256 return urls
1257 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1258 formats_urls = _get_urls(manifest)
1259 for format_url in formats_urls:
890f62e8 1260 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1d043b93
JMF
1261 url_map[itag] = format_url
1262 return url_map
1263
def _real_extract(self, url):
    """Extract the metadata and download URL(s) of a single video.

    Steps: resolve the video ID, download the watch page and the
    get_video_info data, collect metadata (uploader, title, thumbnail,
    upload date, description, subtitles, duration), build an
    {itag: url} map (deciphering encrypted signatures where needed) and
    select the requested format(s).

    Returns a list with one info dict per selected format, or None when
    only a listing was requested (subtitles or formats).
    Raises ExtractorError when a required piece of information is missing.
    """
    if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
        self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self._extract_id(url)

    # Get video webpage
    self.report_video_webpage_download(video_id)
    url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
    request = compat_urllib_request.Request(url)
    try:
        video_webpage_bytes = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))

    video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        # The URL is embedded in JSON: drop the backslash escapes
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # Get video info
    self.report_video_info_webpage_download(video_id)
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        self.report_age_confirmation()
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        data = compat_urllib_parse.urlencode({'video_id': video_id,
                                              'el': 'embedded',
                                              'gl': 'US',
                                              'hl': 'en',
                                              'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                                              'asv': 3,
                                              'sts':'1588',
                                              })
        video_info_url = 'https://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(video_info_url, video_id,
                                note=False,
                                errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
    else:
        age_gate = False
        # Try the different 'el' variants until one of them yields a token
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
    if 'token' not in video_info:
        if 'reason' in video_info:
            raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
        else:
            raise ExtractorError(u'"token" parameter not in video info for unknown reason')

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError(u'"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError(u'Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning(u'unable to extract uploader nickname')

    # title
    if 'title' not in video_info:
        raise ExtractorError(u'Unable to extract video title')
    video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning(u'unable to extract video thumbnail')
        video_thumbnail = ''
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = None
    mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
    if mobj is not None:
        # Normalize separators and whitespace before parsing the date
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        video_description = clean_html(video_description)
    else:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = u''

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)

    if self._downloader.params.get('listsubtitles', False):
        self._list_available_subtitles(video_id, video_webpage)
        return

    if 'length_seconds' not in video_info:
        self._downloader.report_warning(u'unable to extract video duration')
        video_duration = ''
    else:
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

    # Decide which formats to download

    # Prefer the stream maps of the player config embedded in the watch
    # page when they carry encrypted signatures; any ValueError raised in
    # this section just means "keep what get_video_info returned".
    try:
        mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
        if not mobj:
            raise ValueError('Could not find vevo ID')
        info = json.loads(mobj.group(1))
        args = info['args']
        # Some player configs carry no stream map at all; bail out through
        # the ValueError handler instead of failing with a KeyError below.
        if 'url_encoded_fmt_stream_map' not in args:
            raise ValueError(u'No stream_map present')  # caught below
        # Easy way to know if the 's' value is in url_encoded_fmt_stream_map:
        # if it is, the signatures are encrypted
        m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
        if m_s is not None:
            self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
            video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
        m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
        if m_s is not None:
            if 'url_encoded_fmt_stream_map' in video_info:
                video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
            else:
                video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
        elif 'adaptive_fmts' in video_info:
            if 'url_encoded_fmt_stream_map' in video_info:
                video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
            else:
                video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']
    except ValueError:
        pass

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        video_url_list = [(None, video_info['conn'][0])]
    elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
        if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' in url_data and 'url' in url_data:
                url = url_data['url'][0]
                if 'sig' in url_data:
                    # Plain signature, usable as is
                    url += '&signature=' + url_data['sig'][0]
                elif 's' in url_data:
                    # Encrypted signature, must be deciphered first
                    encrypted_sig = url_data['s'][0]
                    if self._downloader.params.get('verbose'):
                        if age_gate:
                            if player_url is None:
                                player_version = 'unknown'
                            else:
                                player_version = self._search_regex(
                                    r'-(.+)\.swf$', player_url,
                                    u'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-(.+?)\.js', video_webpage,
                                'html5 player', fatal=False)
                            player_desc = u'html5 player %s' % player_version

                        parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                        self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                            (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                    if not age_gate:
                        # Outside the age gate the JS player is used instead
                        # of the SWF player found above
                        jsplayer_url_json = self._search_regex(
                            r'"assets":.+?"js":\s*("[^"]+")',
                            video_webpage, u'JS player URL')
                        player_url = json.loads(jsplayer_url_json)

                    signature = self._decrypt_signature(
                        encrypted_sig, video_id, player_url, age_gate)
                    url += '&signature=' + signature
                if 'ratebypass' not in url:
                    url += '&ratebypass=yes'
                url_map[url_data['itag'][0]] = url
        video_url_list = self._get_video_url_list(url_map)
        if not video_url_list:
            return
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        video_url_list = self._get_video_url_list(url_map)
        if not video_url_list:
            return

    else:
        raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

    results = []
    for format_param, video_real_url in video_url_list:
        # Extension
        video_extension = self._video_extensions.get(format_param, 'flv')

        video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
                                          self._video_dimensions.get(format_param, '???'),
                                          ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')

        results.append({
            'id':       video_id,
            'url':      video_real_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'format':   video_format,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
            'player_url':   player_url,
            'subtitles':    video_subtitles,
            'duration':     video_duration
        })
    return results
1509
class YoutubePlaylistIE(InfoExtractor):
    """Extract the videos of a playlist through the GData playlists feed."""
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose syntax, hence the explicit flag
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return a one-element list holding the playlist result for ``url``.

        The feed is fetched page by page (``_MAX_RESULTS`` entries each)
        until a page has no entries or the start index reaches 1000.
        """
        # Extract playlist id
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = match.group(1) or match.group(2)

        # Download playlist videos from API
        positioned_urls = []
        page_num = 0
        while True:
            page_num += 1
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            if start_index >= 1000:
                self._downloader.report_warning(u'Max number of results reached')
                break

            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            feed = response['feed']
            playlist_title = feed['title']['$t']
            if 'entry' not in feed:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            for entry in feed['entry']:
                position = entry['yt$position']['$t']
                # Entries without a video id are skipped
                if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
                    video_url = 'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']
                    positioned_urls.append((position, video_url))

        # Order by playlist position, then keep only the URLs
        ordered_urls = [pair[1] for pair in sorted(positioned_urls)]

        url_results = [self.url_result(video_url, 'Youtube') for video_url in ordered_urls]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1577
1578
class YoutubeChannelIE(InfoExtractor):
    """Extract the videos of a channel by scraping its paged video list."""
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from ``page``, without duplicates,
        in order of first appearance."""
        seen = set()
        ids_in_page = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = match.group(1)
            if video_id in seen:
                continue
            seen.add(video_id)
            ids_in_page.append(video_id)
        return ids_in_page

    def _real_extract(self, url):
        """Return a one-element list holding the playlist result for the channel."""
        # Extract channel id
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        channel_id = match.group(1)

        # Download channel page
        first_page_num = 1
        url = self._TEMPLATE_URL % (channel_id, first_page_num)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % first_page_num)

        # Extract video identifiers
        video_ids = []
        video_ids.extend(self.extract_videos_from_page(page))

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            pagenum = 0
            while True:
                pagenum += 1
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                raw_page = self._download_webpage(url, channel_id,
                                                  u'Downloading page #%s' % pagenum)
                page = json.loads(raw_page)

                video_ids.extend(self.extract_videos_from_page(page['content_html']))

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [
            self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
            for video_id in video_ids]
        return [self.playlist_result(url_entries, channel_id)]
1633
1634
class YoutubeUserIE(InfoExtractor):
    """Extract the uploads of a user through the GData uploads feed."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        """Return True only if no other '...IE' class of this module accepts ``url``
        and the URL matches this extractor."""
        # Don't return True if the url can be extracted with another youtube
        # extractor: this regex is too permissive and would match as well.
        for name, klass in globals().items():
            if name.endswith('IE') and klass is not cls and klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a one-element list holding the playlist result for the user."""
        # Extract username
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        username = match.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        page_size = self._GDATA_PAGE_SIZE
        video_ids = []
        pagenum = 0
        while True:
            start_index = pagenum * page_size + 1
            pagenum += 1

            gdata_url = self._GDATA_URL % (username, page_size, start_index)
            note = u'Downloading video ids from %d to %d' % (start_index, start_index + page_size)
            page = self._download_webpage(gdata_url, username, note)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            feed = response['feed']
            if 'entry' not in feed:
                # Number of videos is a multiple of self._GDATA_PAGE_SIZE
                break

            # Extract video identifiers: last path component of each entry id
            ids_in_page = [entry['id']['$t'].split('/')[-1] for entry in feed['entry']]
            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < page_size:
                break

        url_results = [
            self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
            for video_id in video_ids]
        return [self.playlist_result(url_results, playlist_title=username)]
b05654f0
PH
1699
class YoutubeSearchIE(SearchInfoExtractor):
    """Run a search through the GData videos feed ('ytsearch' key)."""
    IE_DESC = u'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        message = u'[youtube] query "%s": Downloading page %s' % (query, pagenum)
        self._downloader.to_screen(message)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        found_ids = []
        limit = n
        page_index = 0

        # Each API page holds up to 50 results
        while (50 * page_index) < limit:
            self.report_download_page(query, page_index + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50 * page_index) + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            found_ids += [video['id'] for video in api_response['items']]

            # Never ask for more than the API says exists
            limit = min(n, api_response['totalItems'])
            page_index += 1

        if len(found_ids) > n:
            found_ids = found_ids[:n]
        videos = [
            self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
            for video_id in found_ids]
        return self.playlist_result(videos, query)
75dff0ee
JMF
1741
1742
class YoutubeShowIE(InfoExtractor):
    """Turn a YouTube show page into one playlist result per season."""
    IE_NAME = u'youtube:show'
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'

    def _real_extract(self, url):
        """Return a list of 'YoutubePlaylist' url results, one for each playlist link on the show page."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # Every season of the show is published as a separate playlist
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_paths)))
        season_results = []
        for path in season_paths:
            season_results.append(self.url_result('https://www.youtube.com' + path, 'YoutubePlaylist'))
        return season_results
04cc9617
JMF
1756
1757
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Shared implementation for extractors that read their videos from
    http://www.youtube.com/feed_ajax
    Concrete subclasses have to provide _FEED_NAME and _PLAYLIST_TITLE.
    """
    _LOGIN_REQUIRED = True
    # Amount added to the paging offset for every further request
    _PAGING_STEP = 30
    # When True, action_load_personal_feed is requested in place of
    # action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template of a feed page; its single remaining %s takes the paging offset."""
        action = 'action_load_personal_feed' if self._PERSONAL_FEED else 'action_load_system_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Download feed pages until the response reports no further paging and return them as one playlist."""
        feed_entries = []
        page_number = 0
        while True:
            offset = page_number * self._PAGING_STEP
            raw_page = self._download_webpage(
                self._FEED_TEMPLATE % offset,
                u'%s feed' % self._FEED_NAME,
                u'Downloading page %s' % page_number)
            info = json.loads(raw_page)
            feed_html = info['feed_html']
            # Keep the first occurrence of every video id, in page order
            video_ids = orderedSet(re.findall(r'"/watch\?v=(.*?)["&]', feed_html))
            for video_id in video_ids:
                feed_entries.append(self.url_result(video_id, 'Youtube'))
            if info['paging'] is None:
                break
            page_number += 1
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1799
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the 'subscriptions' feed."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
1805
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
c626a3d9 1811
43ba5456
JMF
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the 'watch_later' feed, loaded as a personal feed."""
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    # Overrides of the paging step and feed kind inherited from the base class
    _PERSONAL_FEED = True
    _PAGING_STEP = 100
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
c626a3d9
JMF
1819
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the favourites page to the playlist it links to."""
    _LOGIN_REQUIRED = True
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    IE_NAME = u'youtube:favorites'

    def _real_extract(self, url):
        """Return a 'YoutubePlaylist' url result for the playlist id found on the favourites page."""
        # The page is always fetched from the fixed address, whatever form
        # of ``url`` matched (including the bare keyword).
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites',
            'Youtube Favourites videos')
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')