]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[youtube] Improve source code quality
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
e0df6211 3import collections
edf3e38e 4import errno
e0df6211 5import io
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af
PH
9import re
10import socket
e0df6211
PH
11import string
12import struct
13import traceback
0ca96d48 14import xml.etree.ElementTree
e0df6211 15import zlib
c5e8d7af 16
b05654f0 17from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 18from .subtitles import SubtitlesInfoExtractor
c5e8d7af 19from ..utils import (
edf3e38e 20 compat_chr,
c5e8d7af
PH
21 compat_http_client,
22 compat_parse_qs,
23 compat_urllib_error,
24 compat_urllib_parse,
25 compat_urllib_request,
26 compat_str,
27
28 clean_html,
29 get_element_by_id,
30 ExtractorError,
31 unescapeHTML,
32 unified_strdate,
04cc9617 33 orderedSet,
edf3e38e 34 write_json_file,
c5e8d7af
PH
35)
36
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def _set_language(self):
        """Request _LANG_URL so that later pages are served in English.

        Returns True on success; on a network error a warning is reported
        and False is returned.
        """
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return False
        return True

    def _login(self):
        """Log in with the credentials from _get_login_info().

        Returns True when the login POST succeeded, False when there are no
        credentials, when a request failed, or when the login form is shown
        again (bad username or password).

        Raises ExtractorError if no credentials are available and
        _LOGIN_REQUIRED is set.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return False

        # Hidden inputs of the login form that are sent back with the
        # credentials; either may be absent from the page (-> None).
        hidden = {}
        for field in (u'GALX', u'dsh'):
            match = re.search(
                r'<input.+?name="%s".+?value="(.+?)"' % field,
                login_page, re.DOTALL)
            hidden[field] = match.group(1) if match else None

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': hidden[u'GALX'],
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'dsh': hidden[u'dsh'],
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode.  Fields without a value are left out rather
        # than crashing on None.encode().
        login_form = dict(
            (k.encode('utf-8'), v.encode('utf-8'))
            for k, v in login_form_strs.items()
            if v is not None)
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # Being served the login form again means the login was refused
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return False
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return False
        return True

    def _confirm_age(self):
        """POST the age confirmation form to _AGE_URL.

        Returns True on success; raises ExtractorError on a network error.
        """
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        # The request body has to be bytes on Python 3 (same as in _login)
        age_data = compat_urllib_parse.urlencode(age_form).encode('ascii')
        request = compat_urllib_request.Request(self._AGE_URL, age_data)
        try:
            self.report_age_confirmation()
            compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
        return True

    def _real_initialize(self):
        """Set the language, log in and confirm age, stopping at the first step that fails."""
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
c5e8d7af 143
8377574c 144
de7f3446 145class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
IE_DESC = u'YouTube.com'
# Verbose-mode pattern: it has to be matched with re.VERBOSE (as suitable()
# does).  Group 2 is the 11-character video ID.
_VALID_URL = r"""^
                 (
                     (?:https?://)? # http(s):// (optional)
                     (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                        tube\.majestyc\.net/|
                        youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
                     (?:.*?\#/)? # handle anchor (#/) redirect urls
                     (?: # the various things that can precede the ID:
                         (?:(?:v|embed|e)/) # v/ or embed/ or e/
                         |(?: # or the v= param in all its forms
                             (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                             (?:\?|\#!?) # the params delimiter ? or # or #!
                             (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
                             v=
                         )
                     ))
                     |youtu\.be/ # just youtu.be/xxxx
                     )
                 )? # all until now is optional -> you can pass the naked ID
                 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
                 (?(1).+)? # if we found the ID, everything can follow
                 $"""
# Captures the value of a next_url query parameter
_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
# Listed in order of quality
_available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
                      # Apple HTTP Live Streaming
                      '96', '95', '94', '93', '92', '132', '151',
                      # 3D
                      '85', '84', '102', '83', '101', '82', '100',
                      # Dash video
                      '138', '137', '248', '136', '247', '135', '246',
                      '245', '244', '134', '243', '133', '242', '160',
                      # Dash audio
                      '141', '172', '140', '171', '139',
                      ]
# Same itags as _available_formats in a different order: webm itags are
# moved ahead of the neighbouring mp4/flv ones.
# NOTE(review): presumably used when free formats are preferred - confirm
# against the code that reads this list.
_available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
                                  # Apple HTTP Live Streaming
                                  '96', '95', '94', '93', '92', '132', '151',
                                  # 3D
                                  '85', '102', '84', '101', '83', '100', '82',
                                  # Dash video
                                  '138', '248', '137', '247', '136', '246', '245',
                                  '244', '135', '243', '134', '242', '133', '160',
                                  # Dash audio
                                  '172', '141', '171', '140', '139',
                                  ]
# Container name -> itags delivered in that container
_video_formats_map = {
    'flv': ['35', '34', '6', '5'],
    '3gp': ['36', '17', '13'],
    'mp4': ['38', '37', '22', '18'],
    'webm': ['46', '45', '44', '43'],
}
# itag -> file extension
_video_extensions = {
    '13': '3gp',
    '17': '3gp',
    '18': 'mp4',
    '22': 'mp4',
    '36': '3gp',
    '37': 'mp4',
    '38': 'mp4',
    '43': 'webm',
    '44': 'webm',
    '45': 'webm',
    '46': 'webm',

    # 3d videos
    '82': 'mp4',
    '83': 'mp4',
    '84': 'mp4',
    '85': 'mp4',
    '100': 'webm',
    '101': 'webm',
    '102': 'webm',

    # Apple HTTP Live Streaming
    '92': 'mp4',
    '93': 'mp4',
    '94': 'mp4',
    '95': 'mp4',
    '96': 'mp4',
    '132': 'mp4',
    '151': 'mp4',

    # Dash mp4
    '133': 'mp4',
    '134': 'mp4',
    '135': 'mp4',
    '136': 'mp4',
    '137': 'mp4',
    '138': 'mp4',
    '139': 'mp4',
    '140': 'mp4',
    '141': 'mp4',
    '160': 'mp4',

    # Dash webm
    '171': 'webm',
    '172': 'webm',
    '242': 'webm',
    '243': 'webm',
    '244': 'webm',
    '245': 'webm',
    '246': 'webm',
    '247': 'webm',
    '248': 'webm',
}
# itag -> display label: a resolution for video itags, a bitrate ("48k")
# for the DASH audio itags, '???' where unknown
_video_dimensions = {
    '5': '240x400',
    '6': '???',
    '13': '???',
    '17': '144x176',
    '18': '360x640',
    '22': '720x1280',
    '34': '360x640',
    '35': '480x854',
    '36': '240x320',
    '37': '1080x1920',
    '38': '3072x4096',
    '43': '360x640',
    '44': '480x854',
    '45': '720x1280',
    '46': '1080x1920',
    '82': '360p',
    '83': '480p',
    '84': '720p',
    '85': '1080p',
    '92': '240p',
    '93': '360p',
    '94': '480p',
    '95': '720p',
    '96': '1080p',
    '100': '360p',
    '101': '480p',
    '102': '720p',
    '132': '240p',
    '151': '72p',
    '133': '240p',
    '134': '360p',
    '135': '480p',
    '136': '720p',
    '137': '1080p',
    '138': '>1080p',
    '139': '48k',
    '140': '128k',
    '141': '256k',
    '160': '192p',
    '171': '128k',
    '172': '256k',
    '242': '240p',
    '243': '360p',
    '244': '480p',
    '245': '480p',
    '246': '480p',
    '247': '720p',
    '248': '1080p',
}
# itag -> extra note for formats that are not plain progressive video
_special_itags = {
    '82': '3D',
    '83': '3D',
    '84': '3D',
    '85': '3D',
    '100': '3D',
    '101': '3D',
    '102': '3D',
    '133': 'DASH Video',
    '134': 'DASH Video',
    '135': 'DASH Video',
    '136': 'DASH Video',
    '137': 'DASH Video',
    '138': 'DASH Video',
    '139': 'DASH Audio',
    '140': 'DASH Audio',
    '141': 'DASH Audio',
    '160': 'DASH Video',
    '171': 'DASH Audio',
    '172': 'DASH Audio',
    '242': 'DASH Video',
    '243': 'DASH Video',
    '244': 'DASH Video',
    '245': 'DASH Video',
    '246': 'DASH Video',
    '247': 'DASH Video',
    '248': 'DASH Video',
}

IE_NAME = u'youtube'
# Test cases: each entry names a URL, the expected output file and the
# expected info_dict fields; "description" may be given as an md5: digest.
# NOTE(review): the consumer of _TESTS is outside this file - confirm the
# exact semantics of "note" and "params" there.
_TESTS = [
    {
        u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
        u"file": u"BaW_jenozKc.mp4",
        u"info_dict": {
            u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
            u"uploader": u"Philipp Hagemeister",
            u"uploader_id": u"phihag",
            u"upload_date": u"20121002",
            u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
        }
    },
    {
        u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
        u"file": u"1ltcDfZMA3U.flv",
        u"note": u"Test VEVO video (#897)",
        u"info_dict": {
            u"upload_date": u"20070518",
            u"title": u"Maps - It Will Find You",
            u"description": u"Music video by Maps performing It Will Find You.",
            u"uploader": u"MuteUSA",
            u"uploader_id": u"MuteUSA"
        }
    },
    {
        u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
        u"file": u"UxxajLWwzqY.mp4",
        u"note": u"Test generic use_cipher_signature video (#897)",
        u"info_dict": {
            u"upload_date": u"20120506",
            u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
            u"description": u"md5:3e2666e0a55044490499ea45fe9037b7",
            u"uploader": u"Icona Pop",
            u"uploader_id": u"IconaPop"
        }
    },
    {
        u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
        u"file": u"07FYdnEawAQ.mp4",
        u"note": u"Test VEVO video with age protection (#956)",
        u"info_dict": {
            u"upload_date": u"20130703",
            u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
            u"description": u"md5:64249768eec3bc4276236606ea996373",
            u"uploader": u"justintimberlakeVEVO",
            u"uploader_id": u"justintimberlakeVEVO"
        }
    },
    {
        u'url': u'https://www.youtube.com/watch?v=TGi3HqYrWHE',
        u'file': u'TGi3HqYrWHE.mp4',
        u'note': u'm3u8 video',
        u'info_dict': {
            u'title': u'Triathlon - Men - London 2012 Olympic Games',
            u'description': u'- Men - TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games',
            u'uploader': u'olympic',
            u'upload_date': u'20120807',
            u'uploader_id': u'olympic',
        },
        u'params': {
            u'skip_download': True,
        },
    },
]
397
c5e8d7af
PH
398
@classmethod
def suitable(cls, url):
    """Tell whether this extractor should handle *url*.

    URLs that YoutubePlaylistIE accepts are always left to it; anything
    else is accepted when it matches _VALID_URL (verbose mode).
    """
    if YoutubePlaylistIE.suitable(url):
        return False
    matched = re.match(cls._VALID_URL, url, re.VERBOSE)
    return matched is not None
404
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Initialise the base extractors and start with an empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # player URL -> signature function; filled in by _decrypt_signature
    self._player_cache = {}
e0df6211 408
c5e8d7af
PH
def report_video_webpage_download(self, video_id):
    """Print a notice that the webpage of video_id is being downloaded."""
    message = u'%s: Downloading video webpage' % video_id
    self.to_screen(message)
412
def report_video_info_webpage_download(self, video_id):
    """Print a notice that the video info webpage of video_id is being downloaded."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
416
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Print a notice that information about video_id is being extracted."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
420
def report_unavailable_format(self, video_id, format):
    """Print a notice that the requested format is not available for video_id."""
    message = u'%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
424
def report_rtmp_download(self):
    """Print a notice that the download is going to use the RTMP protocol."""
    message = u'RTMP download detected'
    self.to_screen(message)
428
c4417ddb
PH
429 def _extract_signature_function(self, video_id, player_url, slen):
430 id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
83799698 431 player_url)
e0df6211
PH
432 player_type = id_m.group('ext')
433 player_id = id_m.group('id')
434
c4417ddb
PH
435 # Read from filesystem cache
436 func_id = '%s_%s_%d' % (player_type, player_id, slen)
437 assert os.path.basename(func_id) == func_id
edf3e38e
PH
438 cache_dir = self._downloader.params.get('cachedir',
439 u'~/.youtube-dl/cache')
c4417ddb 440
edf3e38e 441 if cache_dir != u'NONE':
c4417ddb
PH
442 cache_fn = os.path.join(os.path.expanduser(cache_dir),
443 u'youtube-sigfuncs',
444 func_id + '.json')
445 try:
edf3e38e 446 with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
c4417ddb
PH
447 cache_spec = json.load(cachef)
448 return lambda s: u''.join(s[i] for i in cache_spec)
edf3e38e 449 except IOError:
c4417ddb 450 pass # No cache available
83799698 451
e0df6211
PH
452 if player_type == 'js':
453 code = self._download_webpage(
454 player_url, video_id,
83799698 455 note=u'Downloading %s player %s' % (player_type, player_id),
e0df6211 456 errnote=u'Download of %s failed' % player_url)
83799698 457 res = self._parse_sig_js(code)
c4417ddb 458 elif player_type == 'swf':
e0df6211
PH
459 urlh = self._request_webpage(
460 player_url, video_id,
83799698 461 note=u'Downloading %s player %s' % (player_type, player_id),
e0df6211
PH
462 errnote=u'Download of %s failed' % player_url)
463 code = urlh.read()
83799698 464 res = self._parse_sig_swf(code)
e0df6211
PH
465 else:
466 assert False, 'Invalid player type %r' % player_type
467
c4417ddb 468 if cache_dir is not False:
edf3e38e
PH
469 try:
470 cache_res = res(map(compat_chr, range(slen)))
471 cache_spec = [ord(c) for c in cache_res]
472 try:
473 os.makedirs(os.path.dirname(cache_fn))
474 except OSError as ose:
475 if ose.errno != errno.EEXIST:
476 raise
477 write_json_file(cache_spec, cache_fn)
0ca96d48 478 except Exception:
edf3e38e
PH
479 tb = traceback.format_exc()
480 self._downloader.report_warning(
481 u'Writing cache to %r failed: %s' % (cache_fn, tb))
83799698
PH
482
483 return res
484
edf3e38e
PH
def _print_sig_code(self, func, slen):
    """Print Python code equivalent to func for signatures of length slen.

    func is run on a string of slen distinct characters; the resulting
    permutation is written as a sum of indexing and slicing expressions on
    s and shown with to_screen().
    """
    def gen_sig_code(idxs):
        """Yield expressions on s that together reproduce the index list."""
        def _genslice(start, end, step):
            starts = u'' if start == 0 else u'%d' % start
            # A descending run that ends at index 0 would get the stop
            # value -1, which Python reads as "the last element" and so
            # yields an empty slice; leave the stop open instead.
            ends = (u':%d' % (end + step)) if end + step >= 0 else u':'
            steps = u'' if step == 1 else (u':%d' % step)
            return u's[%s%s%s]' % (starts, ends, steps)

        if not idxs:
            return
        step = None
        start = None  # Set as soon as step is set
        i = idxs[0]  # Keeps the final yield valid for a one-element list
        for i, prev in zip(idxs[1:], idxs[:-1]):
            if step is not None:
                if i - prev == step:
                    continue
                # The run ended at prev; i starts over in the next round
                yield _genslice(start, prev, step)
                step = None
                continue
            if i - prev in [-1, 1]:
                step = i - prev
                start = prev
                continue
            else:
                yield u's[%d]' % prev
        if step is None:
            yield u's[%d]' % i
        else:
            yield _genslice(start, i, step)

    # A real string (not a map object) so that it can be indexed on
    # Python 3 as well
    test_string = u''.join(map(compat_chr, range(slen)))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    expr_code = u' + '.join(gen_sig_code(cache_spec))
    code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
    self.to_screen(u'Extracted signature:\n' + code)
519
e0df6211
PH
520 def _parse_sig_js(self, jscode):
521 funcname = self._search_regex(
522 r'signature=([a-zA-Z]+)', jscode,
523 u'Initial JS player signature function name')
524
525 functions = {}
526
527 def argidx(varname):
528 return string.lowercase.index(varname)
529
530 def interpret_statement(stmt, local_vars, allow_recursion=20):
531 if allow_recursion < 0:
0ca96d48 532 raise ExtractorError(u'Recursion limit reached')
e0df6211
PH
533
534 if stmt.startswith(u'var '):
535 stmt = stmt[len(u'var '):]
536 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
537 r'=(?P<expr>.*)$', stmt)
538 if ass_m:
539 if ass_m.groupdict().get('index'):
540 def assign(val):
541 lvar = local_vars[ass_m.group('out')]
542 idx = interpret_expression(ass_m.group('index'),
543 local_vars, allow_recursion)
544 assert isinstance(idx, int)
545 lvar[idx] = val
546 return val
547 expr = ass_m.group('expr')
548 else:
549 def assign(val):
550 local_vars[ass_m.group('out')] = val
551 return val
552 expr = ass_m.group('expr')
553 elif stmt.startswith(u'return '):
554 assign = lambda v: v
555 expr = stmt[len(u'return '):]
556 else:
557 raise ExtractorError(
558 u'Cannot determine left side of statement in %r' % stmt)
559
560 v = interpret_expression(expr, local_vars, allow_recursion)
561 return assign(v)
562
563 def interpret_expression(expr, local_vars, allow_recursion):
564 if expr.isdigit():
565 return int(expr)
566
567 if expr.isalpha():
568 return local_vars[expr]
569
570 m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
571 if m:
572 member = m.group('member')
573 val = local_vars[m.group('in')]
574 if member == 'split("")':
575 return list(val)
576 if member == 'join("")':
577 return u''.join(val)
578 if member == 'length':
579 return len(val)
580 if member == 'reverse()':
581 return val[::-1]
582 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
583 if slice_m:
584 idx = interpret_expression(
585 slice_m.group('idx'), local_vars, allow_recursion-1)
586 return val[idx:]
587
588 m = re.match(
589 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
590 if m:
591 val = local_vars[m.group('in')]
592 idx = interpret_expression(m.group('idx'), local_vars,
593 allow_recursion-1)
594 return val[idx]
595
596 m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
597 if m:
598 a = interpret_expression(m.group('a'),
599 local_vars, allow_recursion)
600 b = interpret_expression(m.group('b'),
601 local_vars, allow_recursion)
602 return a % b
603
604 m = re.match(
605 r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
606 if m:
607 fname = m.group('func')
608 if fname not in functions:
609 functions[fname] = extract_function(fname)
610 argvals = [int(v) if v.isdigit() else local_vars[v]
611 for v in m.group('args').split(',')]
612 return functions[fname](argvals)
613 raise ExtractorError(u'Unsupported JS expression %r' % expr)
614
615 def extract_function(funcname):
616 func_m = re.search(
617 r'function ' + re.escape(funcname) +
618 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
619 jscode)
620 argnames = func_m.group('args').split(',')
621
622 def resf(args):
623 local_vars = dict(zip(argnames, args))
624 for stmt in func_m.group('code').split(';'):
625 res = interpret_statement(stmt, local_vars)
626 return res
627 return resf
628
629 initial_function = extract_function(funcname)
630 return lambda s: initial_function([s])
631
632 def _parse_sig_swf(self, file_contents):
633 if file_contents[1:3] != b'WS':
634 raise ExtractorError(
635 u'Not an SWF file; header is %r' % file_contents[:3])
636 if file_contents[:1] == b'C':
637 content = zlib.decompress(file_contents[8:])
638 else:
639 raise NotImplementedError(u'Unsupported compression format %r' %
640 file_contents[:1])
641
642 def extract_tags(content):
643 pos = 0
644 while pos < len(content):
645 header16 = struct.unpack('<H', content[pos:pos+2])[0]
646 pos += 2
647 tag_code = header16 >> 6
648 tag_len = header16 & 0x3f
649 if tag_len == 0x3f:
650 tag_len = struct.unpack('<I', content[pos:pos+4])[0]
651 pos += 4
652 assert pos+tag_len <= len(content)
653 yield (tag_code, content[pos:pos+tag_len])
654 pos += tag_len
655
656 code_tag = next(tag
657 for tag_code, tag in extract_tags(content)
658 if tag_code == 82)
659 p = code_tag.index(b'\0', 4) + 1
ba552f54 660 code_reader = io.BytesIO(code_tag[p:])
e0df6211
PH
661
662 # Parse ABC (AVM2 ByteCode)
ba552f54
PH
663 def read_int(reader=None):
664 if reader is None:
665 reader = code_reader
e0df6211
PH
666 res = 0
667 shift = 0
668 for _ in range(5):
ba552f54
PH
669 buf = reader.read(1)
670 assert len(buf) == 1
671 b = struct.unpack('<B', buf)[0]
e0df6211
PH
672 res = res | ((b & 0x7f) << shift)
673 if b & 0x80 == 0:
674 break
675 shift += 7
ba552f54
PH
676 return res
677
678 def u30(reader=None):
679 res = read_int(reader)
680 assert res & 0xf0000000 == 0
e0df6211
PH
681 return res
682 u32 = read_int
683
ba552f54
PH
684 def s32(reader=None):
685 v = read_int(reader)
e0df6211
PH
686 if v & 0x80000000 != 0:
687 v = - ((v ^ 0xffffffff) + 1)
ba552f54
PH
688 return v
689
0ca96d48 690 def read_string(reader=None):
ba552f54
PH
691 if reader is None:
692 reader = code_reader
693 slen = u30(reader)
694 resb = reader.read(slen)
695 assert len(resb) == slen
696 return resb.decode('utf-8')
697
698 def read_bytes(count, reader=None):
699 if reader is None:
700 reader = code_reader
701 resb = reader.read(count)
702 assert len(resb) == count
703 return resb
704
705 def read_byte(reader=None):
706 resb = read_bytes(1, reader=reader)
707 res = struct.unpack('<B', resb)[0]
708 return res
e0df6211
PH
709
710 # minor_version + major_version
0ca96d48 711 read_bytes(2 + 2)
e0df6211
PH
712
713 # Constant pool
ba552f54 714 int_count = u30()
e0df6211 715 for _c in range(1, int_count):
0ca96d48 716 s32()
ba552f54 717 uint_count = u30()
e0df6211 718 for _c in range(1, uint_count):
0ca96d48 719 u32()
ba552f54 720 double_count = u30()
0ca96d48 721 read_bytes((double_count-1) * 8)
ba552f54 722 string_count = u30()
e0df6211
PH
723 constant_strings = [u'']
724 for _c in range(1, string_count):
0ca96d48 725 s = read_string()
e0df6211 726 constant_strings.append(s)
ba552f54 727 namespace_count = u30()
e0df6211 728 for _c in range(1, namespace_count):
0ca96d48
PH
729 read_bytes(1) # kind
730 u30() # name
ba552f54 731 ns_set_count = u30()
e0df6211 732 for _c in range(1, ns_set_count):
ba552f54 733 count = u30()
e0df6211 734 for _c2 in range(count):
0ca96d48 735 u30()
ba552f54 736 multiname_count = u30()
e0df6211
PH
737 MULTINAME_SIZES = {
738 0x07: 2, # QName
739 0x0d: 2, # QNameA
740 0x0f: 1, # RTQName
741 0x10: 1, # RTQNameA
742 0x11: 0, # RTQNameL
743 0x12: 0, # RTQNameLA
744 0x09: 2, # Multiname
745 0x0e: 2, # MultinameA
746 0x1b: 1, # MultinameL
747 0x1c: 1, # MultinameLA
748 }
749 multinames = [u'']
750 for _c in range(1, multiname_count):
ba552f54 751 kind = u30()
e0df6211
PH
752 assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
753 if kind == 0x07:
0ca96d48 754 u30() # namespace_idx
ba552f54 755 name_idx = u30()
e0df6211
PH
756 multinames.append(constant_strings[name_idx])
757 else:
758 multinames.append('[MULTINAME kind: %d]' % kind)
759 for _c2 in range(MULTINAME_SIZES[kind]):
0ca96d48 760 u30()
e0df6211
PH
761
762 # Methods
ba552f54 763 method_count = u30()
e0df6211
PH
764 MethodInfo = collections.namedtuple(
765 'MethodInfo',
766 ['NEED_ARGUMENTS', 'NEED_REST'])
767 method_infos = []
768 for method_id in range(method_count):
ba552f54 769 param_count = u30()
0ca96d48 770 u30() # return type
e0df6211 771 for _ in range(param_count):
0ca96d48
PH
772 u30() # param type
773 u30() # name index (always 0 for youtube)
ba552f54 774 flags = read_byte()
e0df6211
PH
775 if flags & 0x08 != 0:
776 # Options present
ba552f54 777 option_count = u30()
e0df6211 778 for c in range(option_count):
0ca96d48
PH
779 u30() # val
780 read_bytes(1) # kind
e0df6211
PH
781 if flags & 0x80 != 0:
782 # Param names present
783 for _ in range(param_count):
0ca96d48 784 u30() # param name
e0df6211
PH
785 mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
786 method_infos.append(mi)
787
788 # Metadata
ba552f54 789 metadata_count = u30()
e0df6211 790 for _c in range(metadata_count):
0ca96d48 791 u30() # name
ba552f54 792 item_count = u30()
e0df6211 793 for _c2 in range(item_count):
0ca96d48
PH
794 u30() # key
795 u30() # value
ba552f54
PH
796
797 def parse_traits_info():
798 trait_name_idx = u30()
799 kind_full = read_byte()
e0df6211
PH
800 kind = kind_full & 0x0f
801 attrs = kind_full >> 4
802 methods = {}
803 if kind in [0x00, 0x06]: # Slot or Const
0ca96d48
PH
804 u30() # Slot id
805 u30() # type_name_idx
ba552f54 806 vindex = u30()
e0df6211 807 if vindex != 0:
0ca96d48 808 read_byte() # vkind
e0df6211 809 elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
0ca96d48 810 u30() # disp_id
ba552f54 811 method_idx = u30()
e0df6211
PH
812 methods[multinames[trait_name_idx]] = method_idx
813 elif kind == 0x04: # Class
0ca96d48
PH
814 u30() # slot_id
815 u30() # classi
e0df6211 816 elif kind == 0x05: # Function
0ca96d48 817 u30() # slot_id
ba552f54 818 function_idx = u30()
e0df6211
PH
819 methods[function_idx] = multinames[trait_name_idx]
820 else:
821 raise ExtractorError(u'Unsupported trait kind %d' % kind)
822
823 if attrs & 0x4 != 0: # Metadata present
ba552f54 824 metadata_count = u30()
e0df6211 825 for _c3 in range(metadata_count):
0ca96d48 826 u30() # metadata index
e0df6211 827
ba552f54 828 return methods
e0df6211
PH
829
830 # Classes
831 TARGET_CLASSNAME = u'SignatureDecipher'
832 searched_idx = multinames.index(TARGET_CLASSNAME)
833 searched_class_id = None
ba552f54 834 class_count = u30()
e0df6211 835 for class_id in range(class_count):
ba552f54 836 name_idx = u30()
e0df6211
PH
837 if name_idx == searched_idx:
838 # We found the class we're looking for!
839 searched_class_id = class_id
0ca96d48 840 u30() # super_name idx
ba552f54 841 flags = read_byte()
e0df6211 842 if flags & 0x08 != 0: # Protected namespace is present
0ca96d48 843 u30() # protected_ns_idx
ba552f54 844 intrf_count = u30()
e0df6211 845 for _c2 in range(intrf_count):
0ca96d48
PH
846 u30()
847 u30() # iinit
ba552f54 848 trait_count = u30()
e0df6211 849 for _c2 in range(trait_count):
0ca96d48 850 parse_traits_info()
e0df6211
PH
851
852 if searched_class_id is None:
853 raise ExtractorError(u'Target class %r not found' %
854 TARGET_CLASSNAME)
855
856 method_names = {}
857 method_idxs = {}
858 for class_id in range(class_count):
0ca96d48 859 u30() # cinit
ba552f54 860 trait_count = u30()
e0df6211 861 for _c2 in range(trait_count):
ba552f54 862 trait_methods = parse_traits_info()
e0df6211
PH
863 if class_id == searched_class_id:
864 method_names.update(trait_methods.items())
865 method_idxs.update(dict(
866 (idx, name)
867 for name, idx in trait_methods.items()))
868
869 # Scripts
ba552f54 870 script_count = u30()
e0df6211 871 for _c in range(script_count):
0ca96d48 872 u30() # init
ba552f54 873 trait_count = u30()
e0df6211 874 for _c2 in range(trait_count):
0ca96d48 875 parse_traits_info()
e0df6211
PH
876
877 # Method bodies
ba552f54 878 method_body_count = u30()
e0df6211
PH
879 Method = collections.namedtuple('Method', ['code', 'local_count'])
880 methods = {}
881 for _c in range(method_body_count):
ba552f54 882 method_idx = u30()
0ca96d48 883 u30() # max_stack
ba552f54 884 local_count = u30()
0ca96d48
PH
885 u30() # init_scope_depth
886 u30() # max_scope_depth
ba552f54
PH
887 code_length = u30()
888 code = read_bytes(code_length)
e0df6211 889 if method_idx in method_idxs:
ba552f54 890 m = Method(code, local_count)
e0df6211 891 methods[method_idxs[method_idx]] = m
ba552f54 892 exception_count = u30()
e0df6211 893 for _c2 in range(exception_count):
0ca96d48
PH
894 u30() # from
895 u30() # to
896 u30() # target
897 u30() # exc_type
898 u30() # var_name
ba552f54 899 trait_count = u30()
e0df6211 900 for _c2 in range(trait_count):
0ca96d48 901 parse_traits_info()
e0df6211 902
ba552f54 903 assert p + code_reader.tell() == len(code_tag)
e0df6211
PH
904 assert len(methods) == len(method_idxs)
905
906 method_pyfunctions = {}
907
908 def extract_function(func_name):
909 if func_name in method_pyfunctions:
910 return method_pyfunctions[func_name]
911 if func_name not in methods:
912 raise ExtractorError(u'Cannot find function %r' % func_name)
913 m = methods[func_name]
914
915 def resfunc(args):
e0df6211
PH
916 registers = ['(this)'] + list(args) + [None] * m.local_count
917 stack = []
918 coder = io.BytesIO(m.code)
919 while True:
920 opcode = struct.unpack('!B', coder.read(1))[0]
a7177865 921 if opcode == 36: # pushbyte
e0df6211
PH
922 v = struct.unpack('!B', coder.read(1))[0]
923 stack.append(v)
924 elif opcode == 44: # pushstring
925 idx = u30(coder)
926 stack.append(constant_strings[idx])
927 elif opcode == 48: # pushscope
928 # We don't implement the scope register, so we'll just
929 # ignore the popped value
930 stack.pop()
931 elif opcode == 70: # callproperty
932 index = u30(coder)
933 mname = multinames[index]
934 arg_count = u30(coder)
935 args = list(reversed(
936 [stack.pop() for _ in range(arg_count)]))
937 obj = stack.pop()
938 if mname == u'split':
939 assert len(args) == 1
940 assert isinstance(args[0], compat_str)
941 assert isinstance(obj, compat_str)
942 if args[0] == u'':
943 res = list(obj)
944 else:
945 res = obj.split(args[0])
946 stack.append(res)
a7177865
PH
947 elif mname == u'slice':
948 assert len(args) == 1
949 assert isinstance(args[0], int)
950 assert isinstance(obj, list)
951 res = obj[args[0]:]
952 stack.append(res)
953 elif mname == u'join':
954 assert len(args) == 1
955 assert isinstance(args[0], compat_str)
956 assert isinstance(obj, list)
957 res = args[0].join(obj)
958 stack.append(res)
e0df6211
PH
959 elif mname in method_pyfunctions:
960 stack.append(method_pyfunctions[mname](args))
961 else:
962 raise NotImplementedError(
963 u'Unsupported property %r on %r'
964 % (mname, obj))
a7177865
PH
965 elif opcode == 72: # returnvalue
966 res = stack.pop()
967 return res
968 elif opcode == 79: # callpropvoid
969 index = u30(coder)
970 mname = multinames[index]
971 arg_count = u30(coder)
972 args = list(reversed(
973 [stack.pop() for _ in range(arg_count)]))
974 obj = stack.pop()
975 if mname == u'reverse':
976 assert isinstance(obj, list)
977 obj.reverse()
978 else:
979 raise NotImplementedError(
980 u'Unsupported (void) property %r on %r'
981 % (mname, obj))
e0df6211
PH
982 elif opcode == 93: # findpropstrict
983 index = u30(coder)
984 mname = multinames[index]
985 res = extract_function(mname)
986 stack.append(res)
987 elif opcode == 97: # setproperty
988 index = u30(coder)
989 value = stack.pop()
990 idx = stack.pop()
991 obj = stack.pop()
992 assert isinstance(obj, list)
993 assert isinstance(idx, int)
994 obj[idx] = value
995 elif opcode == 98: # getlocal
996 index = u30(coder)
997 stack.append(registers[index])
998 elif opcode == 99: # setlocal
999 index = u30(coder)
1000 value = stack.pop()
1001 registers[index] = value
1002 elif opcode == 102: # getproperty
1003 index = u30(coder)
1004 pname = multinames[index]
1005 if pname == u'length':
1006 obj = stack.pop()
1007 assert isinstance(obj, list)
1008 stack.append(len(obj))
1009 else: # Assume attribute access
1010 idx = stack.pop()
1011 assert isinstance(idx, int)
1012 obj = stack.pop()
1013 assert isinstance(obj, list)
1014 stack.append(obj[idx])
1015 elif opcode == 128: # coerce
0ca96d48 1016 u30(coder)
e0df6211
PH
1017 elif opcode == 133: # coerce_s
1018 assert isinstance(stack[-1], (type(None), compat_str))
1019 elif opcode == 164: # modulo
1020 value2 = stack.pop()
1021 value1 = stack.pop()
1022 res = value1 % value2
1023 stack.append(res)
a7177865
PH
1024 elif opcode == 208: # getlocal_0
1025 stack.append(registers[0])
1026 elif opcode == 209: # getlocal_1
1027 stack.append(registers[1])
1028 elif opcode == 210: # getlocal_2
1029 stack.append(registers[2])
1030 elif opcode == 211: # getlocal_3
1031 stack.append(registers[3])
e0df6211
PH
1032 elif opcode == 214: # setlocal_2
1033 registers[2] = stack.pop()
1034 elif opcode == 215: # setlocal_3
1035 registers[3] = stack.pop()
1036 else:
1037 raise NotImplementedError(
1038 u'Unsupported opcode %d' % opcode)
1039
1040 method_pyfunctions[func_name] = resfunc
1041 return resfunc
1042
1043 initial_function = extract_function(u'decipher')
1044 return lambda s: initial_function([s])
1045
83799698 1046 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 1047 """Turn the encrypted s field into a working signature"""
6b37f0be 1048
83799698 1049 if player_url is not None:
e0df6211 1050 try:
83799698
PH
1051 if player_url not in self._player_cache:
1052 func = self._extract_signature_function(
c4417ddb 1053 video_id, player_url, len(s)
e0df6211 1054 )
83799698 1055 self._player_cache[player_url] = func
edf3e38e
PH
1056 func = self._player_cache[player_url]
1057 if self._downloader.params.get('youtube_print_sig_code'):
1058 self._print_sig_code(func, len(s))
1059 return func(s)
0ca96d48 1060 except Exception:
e0df6211 1061 tb = traceback.format_exc()
83799698
PH
1062 self._downloader.report_warning(
1063 u'Automatic signature extraction failed: ' + tb)
e0df6211 1064
83799698
PH
1065 self._downloader.report_warning(
1066 u'Warning: Falling back to static signature algorithm')
2f2ffea9
PH
1067 return self._static_decrypt_signature(
1068 s, video_id, player_url, age_gate)
e0df6211 1069
2f2ffea9 1070 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
e0df6211
PH
1071 if age_gate:
1072 # The videos with age protection use another player, so the
1073 # algorithms can be different.
1074 if len(s) == 86:
1075 return s[2:63] + s[82] + s[64:82] + s[63]
1076
4ba146f3
PH
1077 if len(s) == 93:
1078 return s[86:29:-1] + s[88] + s[28:5:-1]
1079 elif len(s) == 92:
444b1165 1080 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
4ba146f3
PH
1081 elif len(s) == 91:
1082 return s[84:27:-1] + s[86] + s[26:5:-1]
444b1165
JMF
1083 elif len(s) == 90:
1084 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
8a9d86a2 1085 elif len(s) == 89:
1086 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
444b1165 1087 elif len(s) == 88:
3e223834 1088 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
be547e1d 1089 elif len(s) == 87:
3a725669 1090 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
be547e1d 1091 elif len(s) == 86:
1cf911bc 1092 return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53]
be547e1d 1093 elif len(s) == 85:
6ae8ee3f 1094 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
be547e1d 1095 elif len(s) == 84:
23b00bc0 1096 return s[81:36:-1] + s[0] + s[35:2:-1]
be547e1d 1097 elif len(s) == 83:
e1842025 1098 return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]
be547e1d 1099 elif len(s) == 82:
ce85f022 1100 return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54]
be547e1d 1101 elif len(s) == 81:
aedd6bb9 1102 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
066090dd
JMF
1103 elif len(s) == 80:
1104 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
5c468ca8
JMF
1105 elif len(s) == 79:
1106 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
be547e1d
PH
1107
1108 else:
1109 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
c5e8d7af 1110
75952c6e
JMF
1111 def _decrypt_signature_age_gate(self, s):
1112 # The videos with age protection use another player, so the algorithms
1113 # can be different.
1114 if len(s) == 86:
1115 return s[2:63] + s[82] + s[64:82] + s[63]
1116 else:
1117 # Fallback to the other algortihms
b072a9de 1118 return self._decrypt_signature(s)
c5e8d7af 1119
def _get_available_subtitles(self, video_id):
    """Return a dictionary mapping language code to subtitle URL.

    The track listing is downloaded from the timedtext service.  An empty
    dictionary is returned (after a warning) when the listing cannot be
    downloaded or when it contains no tracks.
    """
    list_url = 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        track_listing = self._download_webpage(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
        return {}

    tracks = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', track_listing)
    if not tracks:
        self._downloader.report_warning(u'video doesn\'t have subtitles')
        return {}

    sub_lang_list = {}
    # Only the language code (second group) is needed; a later track with
    # the same code replaces an earlier one.
    for _track_name, lang_code in tracks:
        query = compat_urllib_parse.urlencode({
            'lang': lang_code,
            'v': video_id,
            'fmt': self._downloader.params.get('subtitlesformat'),
        })
        sub_lang_list[lang_code] = u'http://www.youtube.com/api/timedtext?' + query
    return sub_lang_list
1144
def _get_available_automatic_caption(self, video_id, webpage):
    """Return a dictionary mapping language code to automatic caption URL.

    We need the webpage for getting the captions url, pass it as an
    argument to speed up the process.

    An empty dictionary is returned (after a warning) when the player
    configuration or the caption data cannot be found, or when the video
    has no automatic (speech recognition) track.
    """
    sub_format = self._downloader.params.get('subtitlesformat')
    self.to_screen(u'%s: Looking for automatic captions' % video_id)
    mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
    err_msg = u'Couldn\'t find automatic captions for %s' % video_id
    if mobj is None:
        self._downloader.report_warning(err_msg)
        return {}
    player_config = json.loads(mobj.group(1))
    try:
        args = player_config[u'args']
        caption_url = args[u'ttsurl']
        timestamp = args[u'timestamp']
        # We get the available subtitles
        list_params = compat_urllib_parse.urlencode({
            'type': 'list',
            'tlangs': 1,
            'asrs': 1,
        })
        list_url = caption_url + '&' + list_params
        list_page = self._download_webpage(list_url, video_id)
        caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
        original_lang_node = caption_list.find('track')
        # find() returns None when the listing has no <track> element at
        # all; treat that the same as "no automatic track" instead of
        # failing with an AttributeError.
        if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr':
            self._downloader.report_warning(u'Video doesn\'t have automatic captions')
            return {}
        original_lang = original_lang_node.attrib['lang_code']

        sub_lang_list = {}
        for lang_node in caption_list.findall('target'):
            sub_lang = lang_node.attrib['lang_code']
            params = compat_urllib_parse.urlencode({
                'lang': original_lang,
                'tlang': sub_lang,
                'fmt': sub_format,
                'ts': timestamp,
                'kind': 'asr',
            })
            sub_lang_list[sub_lang] = caption_url + '&' + params
        return sub_lang_list
    # An extractor error can be raised by the download process if there
    # are no automatic captions but there are subtitles
    except (KeyError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
1192
c5e8d7af
PH
def _print_formats(self, formats):
    """Print one line per itag: itag, extension, dimensions and note."""
    print('Available formats:')
    for itag in formats:
        extension = self._video_extensions.get(itag, 'flv')
        dimensions = self._video_dimensions.get(itag, '???')
        if itag in self._special_itags:
            note = ' (' + self._special_itags[itag] + ')'
        else:
            note = ''
        print('%s\t:\t%s\t[%s]%s' % (itag, extension, dimensions, note))
c5e8d7af
PH
1199
1200 def _extract_id(self, url):
1201 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1202 if mobj is None:
1203 raise ExtractorError(u'Invalid URL: %s' % url)
1204 video_id = mobj.group(2)
1205 return video_id
1206
1d043b93
JMF
1207 def _get_video_url_list(self, url_map):
1208 """
1209 Transform a dictionary in the format {itag:url} to a list of (itag, url)
1210 with the requested formats.
1211 """
1212 req_format = self._downloader.params.get('format', None)
1213 format_limit = self._downloader.params.get('format_limit', None)
1214 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1215 if format_limit is not None and format_limit in available_formats:
1216 format_list = available_formats[available_formats.index(format_limit):]
1217 else:
1218 format_list = available_formats
1219 existing_formats = [x for x in format_list if x in url_map]
1220 if len(existing_formats) == 0:
1221 raise ExtractorError(u'no known formats available for video')
1222 if self._downloader.params.get('listformats', None):
1223 self._print_formats(existing_formats)
1224 return
1225 if req_format is None or req_format == 'best':
1226 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1227 elif req_format == 'worst':
1228 video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
1229 elif req_format in ('-1', 'all'):
1230 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1231 else:
1232 # Specific formats. We pick the first in a slash-delimeted sequence.
bdc6b3fc
AZ
1233 # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
1234 # available in the specified format. For example,
1235 # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1236 # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
1237 # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
1d043b93
JMF
1238 req_formats = req_format.split('/')
1239 video_url_list = None
1240 for rf in req_formats:
1241 if rf in url_map:
1242 video_url_list = [(rf, url_map[rf])]
1243 break
bdc6b3fc
AZ
1244 if rf in self._video_formats_map:
1245 for srf in self._video_formats_map[rf]:
1246 if srf in url_map:
1247 video_url_list = [(srf, url_map[srf])]
1248 break
1249 else:
1250 continue
1251 break
1d043b93
JMF
1252 if video_url_list is None:
1253 raise ExtractorError(u'requested format not available')
1254 return video_url_list
1255
def _extract_from_m3u8(self, manifest_url, video_id):
    """Download an m3u8 manifest and return its {itag: url} mapping.

    Every non-empty manifest line that does not start with '#' is taken
    as a format URL; its itag is read from the 'itag/<digits>/' part.
    """
    manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
    url_map = {}
    for line in manifest.split('\n'):
        # Skip blank lines and m3u8 directives/comments.
        if not line or line.startswith('#'):
            continue
        itag = self._search_regex(r'itag/(\d+?)/', line, 'itag')
        url_map[itag] = line
    return url_map
1269
def _real_extract(self, url):
    """Extract metadata and download URL(s) for a single video.

    Downloads the watch page and the get_video_info data for the video
    identified by url, collects uploader, title, thumbnail, upload date,
    description, subtitles and duration, then builds the itag -> URL map
    (decrypting signatures where needed) and selects the formats to
    download.

    Returns a list with one info dictionary per selected format.
    Returns None when the 'listsubtitles' option is set or when no
    format list is returned by _get_video_url_list.

    Raises ExtractorError when the webpage cannot be downloaded, when
    the video info lacks the token, author or title, for rental videos,
    for rtmpe streams, and when no stream information is found.
    """
    # A URL that ends right after the "feature" parameter was most likely
    # cut at an unquoted '&' by the shell.
    if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
        self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self._extract_id(url)

    # Get video webpage
    self.report_video_webpage_download(video_id)
    url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
    request = compat_urllib_request.Request(url)
    try:
        video_webpage_bytes = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))

    video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        # Drop the backslashes escaping the URL inside the page.
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # Get video info
    self.report_video_info_webpage_download(video_id)
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        self.report_age_confirmation()
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        data = compat_urllib_parse.urlencode({'video_id': video_id,
                                              'el': 'embedded',
                                              'gl': 'US',
                                              'hl': 'en',
                                              'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                                              'asv': 3,
                                              'sts':'1588',
                                              })
        video_info_url = 'https://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(video_info_url, video_id,
                                note=False,
                                errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
    else:
        age_gate = False
        # Try the different "el" values in turn and stop at the first
        # response that carries a token.
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
    if 'token' not in video_info:
        if 'reason' in video_info:
            raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
        else:
            raise ExtractorError(u'"token" parameter not in video info for unknown reason')

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError(u'"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError(u'Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning(u'unable to extract uploader nickname')

    # title
    if 'title' not in video_info:
        raise ExtractorError(u'Unable to extract video title')
    video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning(u'unable to extract video thumbnail')
        video_thumbnail = ''
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = None
    mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
    if mobj is not None:
        # Replace '/', ',' and '-' by spaces and collapse whitespace
        # before handing the text to unified_strdate.
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        video_description = clean_html(video_description)
    else:
        # Fall back to the description meta tag of the page.
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = u''

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)

    if self._downloader.params.get('listsubtitles', False):
        self._list_available_subtitles(video_id, video_webpage)
        return

    if 'length_seconds' not in video_info:
        self._downloader.report_warning(u'unable to extract video duration')
        video_duration = ''
    else:
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

    # Decide which formats to download

    # Look at the player configuration embedded in the watch page and,
    # where applicable, merge its stream maps into video_info.
    try:
        mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
        if not mobj:
            raise ValueError('Could not find vevo ID')
        info = json.loads(mobj.group(1))
        args = info['args']
        # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
        # this signatures are encrypted
        m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
        if m_s is not None:
            self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
            video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
        m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
        if m_s is not None:
            if 'url_encoded_fmt_stream_map' in video_info:
                video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
            else:
                video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
        elif 'adaptive_fmts' in video_info:
            if 'url_encoded_fmt_stream_map' in video_info:
                video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
            else:
                video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']
    except ValueError:
        # Covers both the missing configuration raised above and
        # invalid JSON; video_info is then used unchanged.
        pass

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        video_url_list = [(None, video_info['conn'][0])]
    elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
        if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' in url_data and 'url' in url_data:
                # NOTE: this reuses (and overwrites) the url parameter.
                url = url_data['url'][0]
                if 'sig' in url_data:
                    url += '&signature=' + url_data['sig'][0]
                elif 's' in url_data:
                    # Encrypted signature: it has to be deciphered first.
                    encrypted_sig = url_data['s'][0]
                    if self._downloader.params.get('verbose'):
                        if age_gate:
                            player_version = self._search_regex(
                                r'-(.+)\.swf$',
                                player_url if player_url else None,
                                'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-(.+?)\.js', video_webpage,
                                'html5 player', fatal=False)
                            player_desc = u'html5 player %s' % player_version

                        parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                        self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                            (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                    if not age_gate:
                        # Replace the SWF player URL found earlier by the
                        # JS player URL taken from the page.
                        jsplayer_url_json = self._search_regex(
                            r'"assets":.+?"js":\s*("[^"]+")',
                            video_webpage, u'JS player URL')
                        player_url = json.loads(jsplayer_url_json)

                    signature = self._decrypt_signature(
                        encrypted_sig, video_id, player_url, age_gate)
                    url += '&signature=' + signature
                if 'ratebypass' not in url:
                    url += '&ratebypass=yes'
                url_map[url_data['itag'][0]] = url
        video_url_list = self._get_video_url_list(url_map)
        if not video_url_list:
            return
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        video_url_list = self._get_video_url_list(url_map)
        if not video_url_list:
            return

    else:
        raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

    # Build one result dictionary per selected (itag, url) pair.
    results = []
    for format_param, video_real_url in video_url_list:
        # Extension
        video_extension = self._video_extensions.get(format_param, 'flv')

        video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
                                          self._video_dimensions.get(format_param, '???'),
                                          ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')

        results.append({
            'id':       video_id,
            'url':      video_real_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'format':   video_format,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
            'player_url':   player_url,
            'subtitles':    video_subtitles,
            'duration':     video_duration
        })
    return results
1513
class YoutubePlaylistIE(InfoExtractor):
    """Extractor for playlists, read page by page from the GData feed."""
    IE_NAME = u'youtube:playlist'
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    _MAX_RESULTS = 50

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose syntax.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return a one-element list holding the playlist result for url."""
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The id is in the first group for full URLs, in the second one
        # for bare playlist ids.
        playlist_id = match.group(1) or match.group(2)
        indexed_urls = []

        # Download playlist videos from API
        for page_num in itertools.count(1):
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            if start_index >= 1000:
                self._downloader.report_warning(u'Max number of results reached')
                break
            api_url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(api_url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            feed = response['feed']
            playlist_title = feed['title']['$t']
            if 'entry' not in feed:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            for entry in feed['entry']:
                position = entry['yt$position']['$t']
                media_group = entry.get('media$group') if 'media$group' in entry else None
                if media_group is not None and 'yt$videoid' in media_group:
                    watch_url = 'https://www.youtube.com/watch?v=' + media_group['yt$videoid']['$t']
                    indexed_urls.append((position, watch_url))

        # Order the videos by their position in the playlist.
        ordered_urls = [pair[1] for pair in sorted(indexed_urls)]

        url_results = [self.url_result(vurl, 'Youtube') for vurl in ordered_urls]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1581
1582
class YoutubeChannelIE(InfoExtractor):
    """Extractor for the videos listed on a channel page."""
    IE_NAME = u'youtube:channel'
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from page, without duplicates and
        in order of first appearance."""
        unique_ids = []
        for link in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            candidate = link.group(1)
            if candidate in unique_ids:
                continue
            unique_ids.append(candidate)
        return unique_ids

    def _real_extract(self, url):
        """Return a one-element list holding the playlist result for the
        channel in url."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        channel_id = match.group(1)

        # Download the first channel page and collect its video ids.
        first_page_num = 1
        page = self._download_webpage(
            self._TEMPLATE_URL % (channel_id, first_page_num), channel_id,
            u'Downloading page #%s' % first_page_num)
        video_ids = list(self.extract_videos_from_page(page))

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            for pagenum in itertools.count(1):
                ajax_url = self._MORE_PAGES_URL % (pagenum, channel_id)
                raw_page = self._download_webpage(ajax_url, channel_id,
                                                  u'Downloading page #%s' % pagenum)
                ajax_page = json.loads(raw_page)

                video_ids.extend(self.extract_videos_from_page(ajax_page['content_html']))

                if self._MORE_PAGES_INDICATOR not in ajax_page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [
            self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
            for video_id in video_ids]
        return [self.playlist_result(url_entries, channel_id)]
1637
1638
class YoutubeUserIE(InfoExtractor):
    """Extractor for all the uploads of a user, read from the GData feed."""
    IE_NAME = u'youtube:user'
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'

    @classmethod
    def suitable(cls, url):
        """Return True only if no other extractor of this module wants url."""
        # Don't return True if the url can be extracted with another
        # youtube extractor: the regex is too permissive and it would
        # match their URLs as well.
        for name, klass in globals().items():
            if not name.endswith('IE') or klass is cls:
                continue
            if klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a one-element list holding the playlist result for the
        user in url."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        username = match.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        page_size = self._GDATA_PAGE_SIZE
        video_ids = []

        for pagenum in itertools.count(0):
            start_index = pagenum * page_size + 1

            gdata_url = self._GDATA_URL % (username, page_size, start_index)
            note = u'Downloading video ids from %d to %d' % (start_index, start_index + page_size)
            page = self._download_webpage(gdata_url, username, note)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            feed = response['feed']
            if 'entry' not in feed:
                # Number of videos is a multiple of the page size
                break

            # Extract video identifiers: the last path component of the
            # entry id.
            ids_in_page = [entry['id']['$t'].split('/')[-1] for entry in feed['entry']]
            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < page_size:
                break

        url_results = [
            self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
            for video_id in video_ids]
        return [self.playlist_result(url_results, playlist_title=username)]
b05654f0
PH
1703
class YoutubeSearchIE(SearchInfoExtractor):
    """Extractor for search queries, backed by the GData videos feed."""
    IE_NAME = u'youtube:search'
    IE_DESC = u'YouTube.com searches'
    _SEARCH_KEY = 'ytsearch'
    _MAX_RESULTS = 1000
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        message = u'[youtube] query "%s": Downloading page %s' % (query, pagenum)
        self._downloader.to_screen(message)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        collected_ids = []
        limit = n
        page_index = 0

        # Each API page holds up to 50 results.
        while 50 * page_index < limit:
            self.report_download_page(query, page_index + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50 * page_index) + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            collected_ids += [video['id'] for video in api_response['items']]

            # Never ask for more than the API says is available.
            limit = min(n, api_response['totalItems'])
            page_index += 1

        videos = [
            self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
            for video_id in collected_ids[:n]]
        return self.playlist_result(videos, query)
75dff0ee
JMF
1745
1746
class YoutubeShowIE(InfoExtractor):
    """Extractor for show pages: emits one playlist result per playlist link on the page."""
    IE_NAME = u'youtube:show'
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'

    def _real_extract(self, url):
        """Download the show page and return a list of YoutubePlaylist url results."""
        # Everything after /show/ in the URL is used as the show name.
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_paths)))
        return [self.url_result('https://www.youtube.com' + path, 'YoutubePlaylist')
                for path in season_paths]
04cc9617
JMF
1760
1761
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Shared implementation for extractors that read their entries from
    http://www.youtube.com/feed_ajax
    Subclasses must provide _FEED_NAME and _PLAYLIST_TITLE.
    """
    _LOGIN_REQUIRED = True
    _PAGING_STEP = 30
    # When True, request action_load_personal_feed rather than action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template for one feed page; the remaining %s takes the paging offset."""
        action = 'action_load_personal_feed' if self._PERSONAL_FEED else 'action_load_system_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Walk the feed page by page and return all found videos as one playlist."""
        entries = []
        # itertools.count only accepts a step from Python 2.7 on, so the
        # offset is derived from a plain counter instead.
        for page_index in itertools.count(0):
            offset = page_index * self._PAGING_STEP
            raw_page = self._download_webpage(self._FEED_TEMPLATE % offset,
                                              u'%s feed' % self._FEED_NAME,
                                              u'Downloading page %s' % page_index)
            page = json.loads(raw_page)
            matches = re.finditer(r'"/watch\?v=(.*?)["&]', page['feed_html'])
            video_ids = orderedSet(match.group(1) for match in matches)
            entries.extend(self.url_result(video_id, 'Youtube') for video_id in video_ids)
            # A null 'paging' value marks the last page.
            if page['paging'] is None:
                break
        return self.playlist_result(entries, playlist_title=self._PLAYLIST_TITLE)
1803
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'subscriptions' feed."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
1809
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
c626a3d9 1815
43ba5456
JMF
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'watch_later' personal feed."""
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    # Overrides of the base-class defaults for this feed.
    _PERSONAL_FEED = True
    _PAGING_STEP = 100
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
c626a3d9
JMF
1823
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the my_favorites page to the playlist it links to."""
    _LOGIN_REQUIRED = True
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'

    def _real_extract(self, url):
        """Fetch the favourites page and hand its playlist id to YoutubePlaylist."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The first list=... value found in the page is used as the playlist id.
        return self.url_result(
            self._search_regex(r'list=(.+?)["&]', favourites_page, u'favourites playlist id'),
            'YoutubePlaylist')