youtube_dl/extractor/youtube.py
c5e8d7af 1# coding: utf-8
c5e8d7af 2
e0df6211 3import collections
edf3e38e 4import errno
e0df6211 5import io
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
9import re
10import socket
11import string
12import struct
13import traceback
055e6f36 14import xml.etree.ElementTree
e0df6211 15import zlib
c5e8d7af 16
b05654f0 17from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 18from .subtitles import SubtitlesInfoExtractor
c5e8d7af 19from ..utils import (
edf3e38e 20 compat_chr,
21 compat_http_client,
22 compat_parse_qs,
23 compat_urllib_error,
24 compat_urllib_parse,
25 compat_urllib_request,
7c61bd36 26 compat_urlparse,
27 compat_str,
28
29 clean_html,
c38b1e77 30 get_cachedir,
31 get_element_by_id,
32 ExtractorError,
33 unescapeHTML,
34 unified_strdate,
04cc9617 35 orderedSet,
edf3e38e 36 write_json_file,
37)
38
de7f3446 39class YoutubeBaseInfoExtractor(InfoExtractor):
40 """Provide base functions for Youtube extractors"""
41 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
42 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
43 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
44 _NETRC_MACHINE = 'youtube'
45 # If True it will raise an error if no login info is provided
46 _LOGIN_REQUIRED = False
47
48 def report_lang(self):
49 """Report attempt to set language."""
50 self.to_screen(u'Setting language')
51
52 def _set_language(self):
53 request = compat_urllib_request.Request(self._LANG_URL)
54 try:
55 self.report_lang()
56 compat_urllib_request.urlopen(request).read()
57 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
58 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
59 return False
60 return True
61
62 def _login(self):
63 (username, password) = self._get_login_info()
64 # No authentication to be performed
65 if username is None:
66 if self._LOGIN_REQUIRED:
67 raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
68 return False
69
70 request = compat_urllib_request.Request(self._LOGIN_URL)
71 try:
72 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
73 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
74 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
75 return False
76
77 galx = None
78 dsh = None
79 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
80 if match:
81 galx = match.group(1)
82 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
83 if match:
84 dsh = match.group(1)
c5e8d7af 85
86 # Log in
87 login_form_strs = {
88 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
89 u'Email': username,
90 u'GALX': galx,
91 u'Passwd': password,
92 u'PersistentCookie': u'yes',
93 u'_utf8': u'霱',
94 u'bgresponse': u'js_disabled',
95 u'checkConnection': u'',
96 u'checkedDomains': u'youtube',
97 u'dnConn': u'',
98 u'dsh': dsh,
99 u'pstMsg': u'0',
100 u'rmShown': u'1',
101 u'secTok': u'',
102 u'signIn': u'Sign in',
103 u'timeStmp': u'',
104 u'service': u'youtube',
105 u'uilel': u'3',
106 u'hl': u'en_US',
107 }
108 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
109 # chokes on unicode
110 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
111 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
112 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
113 try:
114 self.report_login()
115 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
116 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
117 self._downloader.report_warning(u'unable to log in: bad username or password')
118 return False
119 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
120 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
121 return False
122 return True
123
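    # Note on the encode step above (illustrative, Python 2 semantics): the
    # stock urlencode() stringifies every value, so a form dict that still
    # holds unicode such as u'_utf8': u'霱' would raise UnicodeEncodeError;
    # the dict((k.encode('utf-8'), v.encode('utf-8')) ...) pass avoids that.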
124 def _confirm_age(self):
125 age_form = {
126 'next_url': '/',
127 'action_confirm': 'Confirm',
128 }
129 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
130 try:
131 self.report_age_confirmation()
132 compat_urllib_request.urlopen(request).read().decode('utf-8')
133 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
134 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
135 return True
136
137 def _real_initialize(self):
138 if self._downloader is None:
139 return
140 if not self._set_language():
141 return
142 if not self._login():
143 return
144 self._confirm_age()
c5e8d7af 145
8377574c 146
de7f3446 147class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
0f818663 148 IE_DESC = u'YouTube.com'
149 _VALID_URL = r"""^
150 (
151 (?:https?://)? # http(s):// (optional)
f4b05232 152 (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
153 tube\.majestyc\.net/|
154 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
155 (?:.*?\#/)? # handle anchor (#/) redirect urls
156 (?: # the various things that can precede the ID:
157 (?:(?:v|embed|e)/) # v/ or embed/ or e/
158 |(?: # or the v= param in all its forms
d741e55a 159 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
160 (?:\?|\#!?) # the params delimiter ? or # or #!
161 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
162 v=
163 )
164 ))
165 |youtu\.be/ # just youtu.be/xxxx
166 )
c5e8d7af 167 )? # all until now is optional -> you can pass the naked ID
8963d9c2 168 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
169 (?(1).+)? # if we found the ID, everything can follow
170 $"""
c5e8d7af 171 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
c5e8d7af 172 # Listed in order of quality
bdc6b3fc 173 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
96fb5605 174 # Apple HTTP Live Streaming
bdc6b3fc 175 '96', '95', '94', '93', '92', '132', '151',
176 # 3D
177 '85', '84', '102', '83', '101', '82', '100',
178 # Dash video
179 '138', '137', '248', '136', '247', '135', '246',
180 '245', '244', '134', '243', '133', '242', '160',
181 # Dash audio
182 '141', '172', '140', '171', '139',
1d043b93 183 ]
bdc6b3fc 184 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
96fb5605 185 # Apple HTTP Live Streaming
186 '96', '95', '94', '93', '92', '132', '151',
187 # 3D
86fe61c8 188 '85', '102', '84', '101', '83', '100', '82',
189 # Dash video
190 '138', '248', '137', '247', '136', '246', '245',
191 '244', '135', '243', '134', '242', '133', '160',
192 # Dash audio
193 '172', '141', '171', '140', '139',
1d043b93 194 ]
195 _video_formats_map = {
196 'flv': ['35', '34', '6', '5'],
197 '3gp': ['36', '17', '13'],
198 'mp4': ['38', '37', '22', '18'],
199 'webm': ['46', '45', '44', '43'],
200 }
201 _video_extensions = {
202 '13': '3gp',
bdc6b3fc 203 '17': '3gp',
204 '18': 'mp4',
205 '22': 'mp4',
bdc6b3fc 206 '36': '3gp',
c5e8d7af 207 '37': 'mp4',
d69cf69a 208 '38': 'mp4',
209 '43': 'webm',
210 '44': 'webm',
211 '45': 'webm',
212 '46': 'webm',
1d043b93 213
214 # 3d videos
215 '82': 'mp4',
216 '83': 'mp4',
217 '84': 'mp4',
218 '85': 'mp4',
219 '100': 'webm',
220 '101': 'webm',
221 '102': 'webm',
836a086c 222
96fb5605 223 # Apple HTTP Live Streaming
224 '92': 'mp4',
225 '93': 'mp4',
226 '94': 'mp4',
227 '95': 'mp4',
228 '96': 'mp4',
229 '132': 'mp4',
230 '151': 'mp4',
231
232 # Dash mp4
233 '133': 'mp4',
234 '134': 'mp4',
235 '135': 'mp4',
236 '136': 'mp4',
237 '137': 'mp4',
238 '138': 'mp4',
239 '160': 'mp4',
240
241 # Dash mp4 audio
242 '139': 'm4a',
243 '140': 'm4a',
244 '141': 'm4a',
245
246 # Dash webm
247 '171': 'webm',
248 '172': 'webm',
249 '242': 'webm',
250 '243': 'webm',
251 '244': 'webm',
252 '245': 'webm',
253 '246': 'webm',
254 '247': 'webm',
255 '248': 'webm',
256 }
257 _video_dimensions = {
258 '5': '240x400',
259 '6': '???',
260 '13': '???',
261 '17': '144x176',
262 '18': '360x640',
263 '22': '720x1280',
264 '34': '360x640',
265 '35': '480x854',
bdc6b3fc 266 '36': '240x320',
267 '37': '1080x1920',
268 '38': '3072x4096',
269 '43': '360x640',
270 '44': '480x854',
271 '45': '720x1280',
272 '46': '1080x1920',
273 '82': '360p',
274 '83': '480p',
275 '84': '720p',
276 '85': '1080p',
277 '92': '240p',
278 '93': '360p',
279 '94': '480p',
280 '95': '720p',
281 '96': '1080p',
282 '100': '360p',
283 '101': '480p',
836a086c 284 '102': '720p',
285 '132': '240p',
286 '151': '72p',
287 '133': '240p',
288 '134': '360p',
289 '135': '480p',
290 '136': '720p',
291 '137': '1080p',
292 '138': '>1080p',
293 '139': '48k',
294 '140': '128k',
295 '141': '256k',
296 '160': '192p',
297 '171': '128k',
298 '172': '256k',
299 '242': '240p',
300 '243': '360p',
301 '244': '480p',
302 '245': '480p',
303 '246': '480p',
304 '247': '720p',
305 '248': '1080p',
c5e8d7af 306 }
307 _special_itags = {
308 '82': '3D',
309 '83': '3D',
310 '84': '3D',
311 '85': '3D',
312 '100': '3D',
313 '101': '3D',
314 '102': '3D',
315 '133': 'DASH Video',
316 '134': 'DASH Video',
317 '135': 'DASH Video',
318 '136': 'DASH Video',
319 '137': 'DASH Video',
320 '138': 'DASH Video',
321 '139': 'DASH Audio',
322 '140': 'DASH Audio',
323 '141': 'DASH Audio',
324 '160': 'DASH Video',
325 '171': 'DASH Audio',
326 '172': 'DASH Audio',
327 '242': 'DASH Video',
328 '243': 'DASH Video',
329 '244': 'DASH Video',
330 '245': 'DASH Video',
331 '246': 'DASH Video',
332 '247': 'DASH Video',
333 '248': 'DASH Video',
c5e8d7af 334 }
836a086c 335
c5e8d7af 336 IE_NAME = u'youtube'
337 _TESTS = [
338 {
339 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
340 u"file": u"BaW_jenozKc.mp4",
341 u"info_dict": {
342 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
343 u"uploader": u"Philipp Hagemeister",
344 u"uploader_id": u"phihag",
345 u"upload_date": u"20121002",
346 u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
2eb88d95 347 }
348 },
349 {
350 u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
351 u"file": u"1ltcDfZMA3U.flv",
352 u"note": u"Test VEVO video (#897)",
353 u"info_dict": {
354 u"upload_date": u"20070518",
355 u"title": u"Maps - It Will Find You",
356 u"description": u"Music video by Maps performing It Will Find You.",
357 u"uploader": u"MuteUSA",
358 u"uploader_id": u"MuteUSA"
2eb88d95 359 }
360 },
361 {
362 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
363 u"file": u"UxxajLWwzqY.mp4",
364 u"note": u"Test generic use_cipher_signature video (#897)",
365 u"info_dict": {
366 u"upload_date": u"20120506",
367 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
2dc59299 368 u"description": u"md5:5b292926389560516e384ac437c0ec07",
45ed795c 369 u"uploader": u"Icona Pop",
0e853ca4 370 u"uploader_id": u"IconaPop"
2eb88d95 371 }
372 },
373 {
374 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
375 u"file": u"07FYdnEawAQ.mp4",
376 u"note": u"Test VEVO video with age protection (#956)",
377 u"info_dict": {
378 u"upload_date": u"20130703",
379 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
380 u"description": u"md5:64249768eec3bc4276236606ea996373",
381 u"uploader": u"justintimberlakeVEVO",
382 u"uploader_id": u"justintimberlakeVEVO"
383 }
384 },
385 ]
386
387
388 @classmethod
389 def suitable(cls, url):
390 """Receives a URL and returns True if suitable for this IE."""
e3ea4790 391 if YoutubePlaylistIE.suitable(url): return False
392 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
393
394 def __init__(self, *args, **kwargs):
395 super(YoutubeIE, self).__init__(*args, **kwargs)
83799698 396 self._player_cache = {}
e0df6211 397
398 def report_video_webpage_download(self, video_id):
399 """Report attempt to download video webpage."""
400 self.to_screen(u'%s: Downloading video webpage' % video_id)
401
402 def report_video_info_webpage_download(self, video_id):
403 """Report attempt to download video info webpage."""
404 self.to_screen(u'%s: Downloading video info webpage' % video_id)
405
406 def report_information_extraction(self, video_id):
407 """Report attempt to extract video information."""
408 self.to_screen(u'%s: Extracting video information' % video_id)
409
410 def report_unavailable_format(self, video_id, format):
411 """Report extracted video URL."""
412 self.to_screen(u'%s: Format %s not available' % (video_id, format))
413
414 def report_rtmp_download(self):
415 """Indicate the download will use the RTMP protocol."""
416 self.to_screen(u'RTMP download detected')
417
418 def _extract_signature_function(self, video_id, player_url, slen):
419 id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
83799698 420 player_url)
421 player_type = id_m.group('ext')
422 player_id = id_m.group('id')
423
424 # Read from filesystem cache
425 func_id = '%s_%s_%d' % (player_type, player_id, slen)
426 assert os.path.basename(func_id) == func_id
c38b1e77 427 cache_dir = get_cachedir(self._downloader.params)
c4417ddb 428
c3c88a26 429 cache_enabled = cache_dir is not None
f8061589 430 if cache_enabled:
431 cache_fn = os.path.join(os.path.expanduser(cache_dir),
432 u'youtube-sigfuncs',
433 func_id + '.json')
434 try:
edf3e38e 435 with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
436 cache_spec = json.load(cachef)
437 return lambda s: u''.join(s[i] for i in cache_spec)
edf3e38e 438 except IOError:
c4417ddb 439 pass # No cache available
83799698 440
441 if player_type == 'js':
442 code = self._download_webpage(
443 player_url, video_id,
83799698 444 note=u'Downloading %s player %s' % (player_type, player_id),
e0df6211 445 errnote=u'Download of %s failed' % player_url)
83799698 446 res = self._parse_sig_js(code)
c4417ddb 447 elif player_type == 'swf':
448 urlh = self._request_webpage(
449 player_url, video_id,
83799698 450 note=u'Downloading %s player %s' % (player_type, player_id),
451 errnote=u'Download of %s failed' % player_url)
452 code = urlh.read()
83799698 453 res = self._parse_sig_swf(code)
454 else:
455 assert False, 'Invalid player type %r' % player_type
456
f8061589 457 if cache_enabled:
edf3e38e 458 try:
459 test_string = u''.join(map(compat_chr, range(slen)))
460 cache_res = res(test_string)
461 cache_spec = [ord(c) for c in cache_res]
462 try:
463 os.makedirs(os.path.dirname(cache_fn))
464 except OSError as ose:
465 if ose.errno != errno.EEXIST:
466 raise
467 write_json_file(cache_spec, cache_fn)
0ca96d48 468 except Exception:
469 tb = traceback.format_exc()
470 self._downloader.report_warning(
471 u'Writing cache to %r failed: %s' % (cache_fn, tb))
472
473 return res
474
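    # The cached signature function is stored as a plain list of character
    # indices (cache_spec) and replayed as u''.join(s[i] for i in cache_spec).
    # Illustrative example: cache_spec == [2, 0, 1] turns u'abc' into u'cab'.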
475 def _print_sig_code(self, func, slen):
476 def gen_sig_code(idxs):
477 def _genslice(start, end, step):
478 starts = u'' if start == 0 else str(start)
479 ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
480 steps = u'' if step == 1 else (u':%d' % step)
481 return u's[%s%s%s]' % (starts, ends, steps)
482
483 step = None
484 start = '(Never used)' # Quelch pyflakes warnings - start will be
485 # set as soon as step is set
486 for i, prev in zip(idxs[1:], idxs[:-1]):
487 if step is not None:
488 if i - prev == step:
489 continue
490 yield _genslice(start, prev, step)
491 step = None
492 continue
493 if i - prev in [-1, 1]:
494 step = i - prev
495 start = prev
496 continue
497 else:
498 yield u's[%d]' % prev
499 if step is None:
500 yield u's[%d]' % i
501 else:
502 yield _genslice(start, i, step)
503
504 test_string = u''.join(map(compat_chr, range(slen)))
505 cache_res = func(test_string)
506 cache_spec = [ord(c) for c in cache_res]
507 expr_code = u' + '.join(gen_sig_code(cache_spec))
508 code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
f8061589 509 self.to_screen(u'Extracted signature function:\n' + code)
edf3e38e 510
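    # The snippet printed above has the same shape as the entries of
    # _static_decrypt_signature below, e.g. (illustrative):
    #   if len(s) == 87:
    #       return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]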
511 def _parse_sig_js(self, jscode):
512 funcname = self._search_regex(
513 r'signature=([a-zA-Z]+)', jscode,
514 u'Initial JS player signature function name')
515
516 functions = {}
517
518 def argidx(varname):
519 return string.lowercase.index(varname)
520
521 def interpret_statement(stmt, local_vars, allow_recursion=20):
522 if allow_recursion < 0:
0ca96d48 523 raise ExtractorError(u'Recursion limit reached')
524
525 if stmt.startswith(u'var '):
526 stmt = stmt[len(u'var '):]
527 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
528 r'=(?P<expr>.*)$', stmt)
529 if ass_m:
530 if ass_m.groupdict().get('index'):
531 def assign(val):
532 lvar = local_vars[ass_m.group('out')]
533 idx = interpret_expression(ass_m.group('index'),
534 local_vars, allow_recursion)
535 assert isinstance(idx, int)
536 lvar[idx] = val
537 return val
538 expr = ass_m.group('expr')
539 else:
540 def assign(val):
541 local_vars[ass_m.group('out')] = val
542 return val
543 expr = ass_m.group('expr')
544 elif stmt.startswith(u'return '):
545 assign = lambda v: v
546 expr = stmt[len(u'return '):]
547 else:
548 raise ExtractorError(
549 u'Cannot determine left side of statement in %r' % stmt)
550
551 v = interpret_expression(expr, local_vars, allow_recursion)
552 return assign(v)
553
554 def interpret_expression(expr, local_vars, allow_recursion):
555 if expr.isdigit():
556 return int(expr)
557
558 if expr.isalpha():
559 return local_vars[expr]
560
561 m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
562 if m:
563 member = m.group('member')
564 val = local_vars[m.group('in')]
565 if member == 'split("")':
566 return list(val)
567 if member == 'join("")':
568 return u''.join(val)
569 if member == 'length':
570 return len(val)
571 if member == 'reverse()':
572 return val[::-1]
573 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
574 if slice_m:
575 idx = interpret_expression(
576 slice_m.group('idx'), local_vars, allow_recursion-1)
577 return val[idx:]
578
579 m = re.match(
580 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
581 if m:
582 val = local_vars[m.group('in')]
583 idx = interpret_expression(m.group('idx'), local_vars,
584 allow_recursion-1)
585 return val[idx]
586
587 m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
588 if m:
589 a = interpret_expression(m.group('a'),
590 local_vars, allow_recursion)
591 b = interpret_expression(m.group('b'),
592 local_vars, allow_recursion)
593 return a % b
594
595 m = re.match(
596 r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
597 if m:
598 fname = m.group('func')
599 if fname not in functions:
600 functions[fname] = extract_function(fname)
601 argvals = [int(v) if v.isdigit() else local_vars[v]
602 for v in m.group('args').split(',')]
603 return functions[fname](argvals)
604 raise ExtractorError(u'Unsupported JS expression %r' % expr)
605
606 def extract_function(funcname):
607 func_m = re.search(
608 r'function ' + re.escape(funcname) +
609 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
610 jscode)
611 argnames = func_m.group('args').split(',')
612
613 def resf(args):
614 local_vars = dict(zip(argnames, args))
615 for stmt in func_m.group('code').split(';'):
616 res = interpret_statement(stmt, local_vars)
617 return res
618 return resf
619
620 initial_function = extract_function(funcname)
621 return lambda s: initial_function([s])
622
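    # _parse_sig_js copes with player code of roughly this shape
    # (illustrative, not an actual player excerpt):
    #   function zx(a){a=a.split("");a=a.reverse();a=a.slice(2);return a.join("")}
    #   ... signature=zx(...) ...
    # i.e. only split("")/join("")/reverse()/slice(n)/element swaps, simple
    # arithmetic and calls to equally simple helper functions are interpreted.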
623 def _parse_sig_swf(self, file_contents):
624 if file_contents[1:3] != b'WS':
625 raise ExtractorError(
626 u'Not an SWF file; header is %r' % file_contents[:3])
627 if file_contents[:1] == b'C':
628 content = zlib.decompress(file_contents[8:])
629 else:
630 raise NotImplementedError(u'Unsupported compression format %r' %
631 file_contents[:1])
632
633 def extract_tags(content):
634 pos = 0
635 while pos < len(content):
636 header16 = struct.unpack('<H', content[pos:pos+2])[0]
637 pos += 2
638 tag_code = header16 >> 6
639 tag_len = header16 & 0x3f
640 if tag_len == 0x3f:
641 tag_len = struct.unpack('<I', content[pos:pos+4])[0]
642 pos += 4
643 assert pos+tag_len <= len(content)
644 yield (tag_code, content[pos:pos+tag_len])
645 pos += tag_len
646
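        # SWF RECORDHEADERs pack the tag code into the upper 10 bits and a
        # short length into the lower 6 bits; 0x3f means a 32-bit length
        # follows. Worked example (illustrative): header16 == 0x14bf gives
        # tag_code 82 (DoABC) and tag_len 0x3f, so the real length is read
        # from the next four bytes.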
647 code_tag = next(tag
648 for tag_code, tag in extract_tags(content)
649 if tag_code == 82)
650 p = code_tag.index(b'\0', 4) + 1
ba552f54 651 code_reader = io.BytesIO(code_tag[p:])
652
653 # Parse ABC (AVM2 ByteCode)
654 def read_int(reader=None):
655 if reader is None:
656 reader = code_reader
657 res = 0
658 shift = 0
659 for _ in range(5):
660 buf = reader.read(1)
661 assert len(buf) == 1
662 b = struct.unpack('<B', buf)[0]
663 res = res | ((b & 0x7f) << shift)
664 if b & 0x80 == 0:
665 break
666 shift += 7
667 return res
668
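        # read_int decodes the ABC variable-length integer format: seven data
        # bits per byte, high bit set while more bytes follow. Illustrative
        # example: bytes e5 8e 26 -> 0x65 | (0x0e << 7) | (0x26 << 14) = 624485.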
669 def u30(reader=None):
670 res = read_int(reader)
671 assert res & 0xf0000000 == 0
672 return res
673 u32 = read_int
674
675 def s32(reader=None):
676 v = read_int(reader)
677 if v & 0x80000000 != 0:
678 v = - ((v ^ 0xffffffff) + 1)
679 return v
680
0ca96d48 681 def read_string(reader=None):
682 if reader is None:
683 reader = code_reader
684 slen = u30(reader)
685 resb = reader.read(slen)
686 assert len(resb) == slen
687 return resb.decode('utf-8')
688
689 def read_bytes(count, reader=None):
690 if reader is None:
691 reader = code_reader
692 resb = reader.read(count)
693 assert len(resb) == count
694 return resb
695
696 def read_byte(reader=None):
697 resb = read_bytes(1, reader=reader)
698 res = struct.unpack('<B', resb)[0]
699 return res
700
701 # minor_version + major_version
0ca96d48 702 read_bytes(2 + 2)
703
704 # Constant pool
ba552f54 705 int_count = u30()
e0df6211 706 for _c in range(1, int_count):
0ca96d48 707 s32()
ba552f54 708 uint_count = u30()
e0df6211 709 for _c in range(1, uint_count):
0ca96d48 710 u32()
ba552f54 711 double_count = u30()
0ca96d48 712 read_bytes((double_count-1) * 8)
ba552f54 713 string_count = u30()
714 constant_strings = [u'']
715 for _c in range(1, string_count):
0ca96d48 716 s = read_string()
e0df6211 717 constant_strings.append(s)
ba552f54 718 namespace_count = u30()
e0df6211 719 for _c in range(1, namespace_count):
720 read_bytes(1) # kind
721 u30() # name
ba552f54 722 ns_set_count = u30()
e0df6211 723 for _c in range(1, ns_set_count):
ba552f54 724 count = u30()
e0df6211 725 for _c2 in range(count):
0ca96d48 726 u30()
ba552f54 727 multiname_count = u30()
728 MULTINAME_SIZES = {
729 0x07: 2, # QName
730 0x0d: 2, # QNameA
731 0x0f: 1, # RTQName
732 0x10: 1, # RTQNameA
733 0x11: 0, # RTQNameL
734 0x12: 0, # RTQNameLA
735 0x09: 2, # Multiname
736 0x0e: 2, # MultinameA
737 0x1b: 1, # MultinameL
738 0x1c: 1, # MultinameLA
739 }
740 multinames = [u'']
741 for _c in range(1, multiname_count):
ba552f54 742 kind = u30()
743 assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
744 if kind == 0x07:
0ca96d48 745 u30() # namespace_idx
ba552f54 746 name_idx = u30()
747 multinames.append(constant_strings[name_idx])
748 else:
749 multinames.append('[MULTINAME kind: %d]' % kind)
750 for _c2 in range(MULTINAME_SIZES[kind]):
0ca96d48 751 u30()
752
753 # Methods
ba552f54 754 method_count = u30()
755 MethodInfo = collections.namedtuple(
756 'MethodInfo',
757 ['NEED_ARGUMENTS', 'NEED_REST'])
758 method_infos = []
759 for method_id in range(method_count):
ba552f54 760 param_count = u30()
0ca96d48 761 u30() # return type
e0df6211 762 for _ in range(param_count):
763 u30() # param type
764 u30() # name index (always 0 for youtube)
ba552f54 765 flags = read_byte()
766 if flags & 0x08 != 0:
767 # Options present
ba552f54 768 option_count = u30()
e0df6211 769 for c in range(option_count):
770 u30() # val
771 read_bytes(1) # kind
772 if flags & 0x80 != 0:
773 # Param names present
774 for _ in range(param_count):
0ca96d48 775 u30() # param name
776 mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
777 method_infos.append(mi)
778
779 # Metadata
ba552f54 780 metadata_count = u30()
e0df6211 781 for _c in range(metadata_count):
0ca96d48 782 u30() # name
ba552f54 783 item_count = u30()
e0df6211 784 for _c2 in range(item_count):
785 u30() # key
786 u30() # value
787
788 def parse_traits_info():
789 trait_name_idx = u30()
790 kind_full = read_byte()
791 kind = kind_full & 0x0f
792 attrs = kind_full >> 4
793 methods = {}
794 if kind in [0x00, 0x06]: # Slot or Const
795 u30() # Slot id
796 u30() # type_name_idx
ba552f54 797 vindex = u30()
e0df6211 798 if vindex != 0:
0ca96d48 799 read_byte() # vkind
e0df6211 800 elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
0ca96d48 801 u30() # disp_id
ba552f54 802 method_idx = u30()
803 methods[multinames[trait_name_idx]] = method_idx
804 elif kind == 0x04: # Class
805 u30() # slot_id
806 u30() # classi
e0df6211 807 elif kind == 0x05: # Function
0ca96d48 808 u30() # slot_id
ba552f54 809 function_idx = u30()
810 methods[function_idx] = multinames[trait_name_idx]
811 else:
812 raise ExtractorError(u'Unsupported trait kind %d' % kind)
813
814 if attrs & 0x4 != 0: # Metadata present
ba552f54 815 metadata_count = u30()
e0df6211 816 for _c3 in range(metadata_count):
0ca96d48 817 u30() # metadata index
e0df6211 818
ba552f54 819 return methods
820
821 # Classes
822 TARGET_CLASSNAME = u'SignatureDecipher'
823 searched_idx = multinames.index(TARGET_CLASSNAME)
824 searched_class_id = None
ba552f54 825 class_count = u30()
e0df6211 826 for class_id in range(class_count):
ba552f54 827 name_idx = u30()
828 if name_idx == searched_idx:
829 # We found the class we're looking for!
830 searched_class_id = class_id
0ca96d48 831 u30() # super_name idx
ba552f54 832 flags = read_byte()
e0df6211 833 if flags & 0x08 != 0: # Protected namespace is present
0ca96d48 834 u30() # protected_ns_idx
ba552f54 835 intrf_count = u30()
e0df6211 836 for _c2 in range(intrf_count):
837 u30()
838 u30() # iinit
ba552f54 839 trait_count = u30()
e0df6211 840 for _c2 in range(trait_count):
0ca96d48 841 parse_traits_info()
842
843 if searched_class_id is None:
844 raise ExtractorError(u'Target class %r not found' %
845 TARGET_CLASSNAME)
846
847 method_names = {}
848 method_idxs = {}
849 for class_id in range(class_count):
0ca96d48 850 u30() # cinit
ba552f54 851 trait_count = u30()
e0df6211 852 for _c2 in range(trait_count):
ba552f54 853 trait_methods = parse_traits_info()
854 if class_id == searched_class_id:
855 method_names.update(trait_methods.items())
856 method_idxs.update(dict(
857 (idx, name)
858 for name, idx in trait_methods.items()))
859
860 # Scripts
ba552f54 861 script_count = u30()
e0df6211 862 for _c in range(script_count):
0ca96d48 863 u30() # init
ba552f54 864 trait_count = u30()
e0df6211 865 for _c2 in range(trait_count):
0ca96d48 866 parse_traits_info()
867
868 # Method bodies
ba552f54 869 method_body_count = u30()
870 Method = collections.namedtuple('Method', ['code', 'local_count'])
871 methods = {}
872 for _c in range(method_body_count):
ba552f54 873 method_idx = u30()
0ca96d48 874 u30() # max_stack
ba552f54 875 local_count = u30()
876 u30() # init_scope_depth
877 u30() # max_scope_depth
878 code_length = u30()
879 code = read_bytes(code_length)
e0df6211 880 if method_idx in method_idxs:
ba552f54 881 m = Method(code, local_count)
e0df6211 882 methods[method_idxs[method_idx]] = m
ba552f54 883 exception_count = u30()
e0df6211 884 for _c2 in range(exception_count):
885 u30() # from
886 u30() # to
887 u30() # target
888 u30() # exc_type
889 u30() # var_name
ba552f54 890 trait_count = u30()
e0df6211 891 for _c2 in range(trait_count):
0ca96d48 892 parse_traits_info()
e0df6211 893
ba552f54 894 assert p + code_reader.tell() == len(code_tag)
895 assert len(methods) == len(method_idxs)
896
897 method_pyfunctions = {}
898
899 def extract_function(func_name):
900 if func_name in method_pyfunctions:
901 return method_pyfunctions[func_name]
902 if func_name not in methods:
903 raise ExtractorError(u'Cannot find function %r' % func_name)
904 m = methods[func_name]
905
906 def resfunc(args):
907 registers = ['(this)'] + list(args) + [None] * m.local_count
908 stack = []
909 coder = io.BytesIO(m.code)
910 while True:
911 opcode = struct.unpack('!B', coder.read(1))[0]
a7177865 912 if opcode == 36: # pushbyte
913 v = struct.unpack('!B', coder.read(1))[0]
914 stack.append(v)
915 elif opcode == 44: # pushstring
916 idx = u30(coder)
917 stack.append(constant_strings[idx])
918 elif opcode == 48: # pushscope
919 # We don't implement the scope register, so we'll just
920 # ignore the popped value
921 stack.pop()
922 elif opcode == 70: # callproperty
923 index = u30(coder)
924 mname = multinames[index]
925 arg_count = u30(coder)
926 args = list(reversed(
927 [stack.pop() for _ in range(arg_count)]))
928 obj = stack.pop()
929 if mname == u'split':
930 assert len(args) == 1
931 assert isinstance(args[0], compat_str)
932 assert isinstance(obj, compat_str)
933 if args[0] == u'':
934 res = list(obj)
935 else:
936 res = obj.split(args[0])
937 stack.append(res)
938 elif mname == u'slice':
939 assert len(args) == 1
940 assert isinstance(args[0], int)
941 assert isinstance(obj, list)
942 res = obj[args[0]:]
943 stack.append(res)
944 elif mname == u'join':
945 assert len(args) == 1
946 assert isinstance(args[0], compat_str)
947 assert isinstance(obj, list)
948 res = args[0].join(obj)
949 stack.append(res)
950 elif mname in method_pyfunctions:
951 stack.append(method_pyfunctions[mname](args))
952 else:
953 raise NotImplementedError(
954 u'Unsupported property %r on %r'
955 % (mname, obj))
956 elif opcode == 72: # returnvalue
957 res = stack.pop()
958 return res
959 elif opcode == 79: # callpropvoid
960 index = u30(coder)
961 mname = multinames[index]
962 arg_count = u30(coder)
963 args = list(reversed(
964 [stack.pop() for _ in range(arg_count)]))
965 obj = stack.pop()
966 if mname == u'reverse':
967 assert isinstance(obj, list)
968 obj.reverse()
969 else:
970 raise NotImplementedError(
971 u'Unsupported (void) property %r on %r'
972 % (mname, obj))
973 elif opcode == 93: # findpropstrict
974 index = u30(coder)
975 mname = multinames[index]
976 res = extract_function(mname)
977 stack.append(res)
978 elif opcode == 97: # setproperty
979 index = u30(coder)
980 value = stack.pop()
981 idx = stack.pop()
982 obj = stack.pop()
983 assert isinstance(obj, list)
984 assert isinstance(idx, int)
985 obj[idx] = value
986 elif opcode == 98: # getlocal
987 index = u30(coder)
988 stack.append(registers[index])
989 elif opcode == 99: # setlocal
990 index = u30(coder)
991 value = stack.pop()
992 registers[index] = value
993 elif opcode == 102: # getproperty
994 index = u30(coder)
995 pname = multinames[index]
996 if pname == u'length':
997 obj = stack.pop()
998 assert isinstance(obj, list)
999 stack.append(len(obj))
1000 else: # Assume attribute access
1001 idx = stack.pop()
1002 assert isinstance(idx, int)
1003 obj = stack.pop()
1004 assert isinstance(obj, list)
1005 stack.append(obj[idx])
1006 elif opcode == 128: # coerce
0ca96d48 1007 u30(coder)
1008 elif opcode == 133: # coerce_s
1009 assert isinstance(stack[-1], (type(None), compat_str))
1010 elif opcode == 164: # modulo
1011 value2 = stack.pop()
1012 value1 = stack.pop()
1013 res = value1 % value2
1014 stack.append(res)
1015 elif opcode == 208: # getlocal_0
1016 stack.append(registers[0])
1017 elif opcode == 209: # getlocal_1
1018 stack.append(registers[1])
1019 elif opcode == 210: # getlocal_2
1020 stack.append(registers[2])
1021 elif opcode == 211: # getlocal_3
1022 stack.append(registers[3])
1023 elif opcode == 214: # setlocal_2
1024 registers[2] = stack.pop()
1025 elif opcode == 215: # setlocal_3
1026 registers[3] = stack.pop()
1027 else:
1028 raise NotImplementedError(
1029 u'Unsupported opcode %d' % opcode)
1030
1031 method_pyfunctions[func_name] = resfunc
1032 return resfunc
1033
1034 initial_function = extract_function(u'decipher')
1035 return lambda s: initial_function([s])
1036
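    # Only the handful of AVM2 opcodes and properties that the player's
    # SignatureDecipher.decipher method actually uses are interpreted above;
    # anything else raises NotImplementedError, so a player update that needs
    # new opcodes fails loudly instead of producing a wrong signature.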
83799698 1037 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 1038 """Turn the encrypted s field into a working signature"""
6b37f0be 1039
83799698 1040 if player_url is not None:
e0df6211 1041 try:
1042 player_id = (player_url, len(s))
1043 if player_id not in self._player_cache:
83799698 1044 func = self._extract_signature_function(
c4417ddb 1045 video_id, player_url, len(s)
e0df6211 1046 )
1047 self._player_cache[player_id] = func
1048 func = self._player_cache[player_id]
1049 if self._downloader.params.get('youtube_print_sig_code'):
1050 self._print_sig_code(func, len(s))
1051 return func(s)
0ca96d48 1052 except Exception:
e0df6211 1053 tb = traceback.format_exc()
1054 self._downloader.report_warning(
1055 u'Automatic signature extraction failed: ' + tb)
e0df6211 1056
1057 self._downloader.report_warning(
1058 u'Warning: Falling back to static signature algorithm')
920de7a2 1059
1060 return self._static_decrypt_signature(
1061 s, video_id, player_url, age_gate)
e0df6211 1062
2f2ffea9 1063 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
1064 if age_gate:
1065 # The videos with age protection use another player, so the
1066 # algorithms can be different.
1067 if len(s) == 86:
1068 return s[2:63] + s[82] + s[64:82] + s[63]
1069
bc4b9008 1070 if len(s) == 93:
1071 return s[86:29:-1] + s[88] + s[28:5:-1]
1072 elif len(s) == 92:
444b1165 1073 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
1074 elif len(s) == 91:
1075 return s[84:27:-1] + s[86] + s[26:5:-1]
1076 elif len(s) == 90:
1077 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
8a9d86a2 1078 elif len(s) == 89:
1079 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
444b1165 1080 elif len(s) == 88:
3e223834 1081 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
be547e1d 1082 elif len(s) == 87:
3a725669 1083 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
be547e1d 1084 elif len(s) == 86:
f2c327fd 1085 return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
be547e1d 1086 elif len(s) == 85:
6ae8ee3f 1087 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
be547e1d 1088 elif len(s) == 84:
6f56389b 1089 return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
be547e1d 1090 elif len(s) == 83:
920de7a2 1091 return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
be547e1d 1092 elif len(s) == 82:
c21315f2 1093 return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
be547e1d 1094 elif len(s) == 81:
aedd6bb9 1095 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1096 elif len(s) == 80:
1097 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
1098 elif len(s) == 79:
1099 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1100
1101 else:
1102 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
c5e8d7af 1103
de7f3446 1104 def _get_available_subtitles(self, video_id):
de7f3446 1105 try:
1106 sub_list = self._download_webpage(
1107 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1108 video_id, note=False)
1109 except ExtractorError as err:
1110 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
1111 return {}
1112 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
1113
1114 sub_lang_list = {}
1115 for l in lang_list:
1116 lang = l[1]
1117 params = compat_urllib_parse.urlencode({
1118 'lang': lang,
1119 'v': video_id,
1120 'fmt': self._downloader.params.get('subtitlesformat'),
a34c2faa 1121 'name': l[0],
1122 })
1123 url = u'http://www.youtube.com/api/timedtext?' + params
1124 sub_lang_list[lang] = url
1125 if not sub_lang_list:
1126 self._downloader.report_warning(u'video doesn\'t have subtitles')
1127 return {}
1128 return sub_lang_list
1129
055e6f36 1130 def _get_available_automatic_caption(self, video_id, webpage):
1131 """We need the webpage for getting the captions url, pass it as an
1132 argument to speed up the process."""
1133 sub_format = self._downloader.params.get('subtitlesformat')
1134 self.to_screen(u'%s: Looking for automatic captions' % video_id)
1135 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
055e6f36 1136 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
1137 if mobj is None:
1138 self._downloader.report_warning(err_msg)
1139 return {}
1140 player_config = json.loads(mobj.group(1))
1141 try:
1142 args = player_config[u'args']
1143 caption_url = args[u'ttsurl']
1144 timestamp = args[u'timestamp']
1145 # We get the available subtitles
1146 list_params = compat_urllib_parse.urlencode({
1147 'type': 'list',
1148 'tlangs': 1,
1149 'asrs': 1,
de7f3446 1150 })
1151 list_url = caption_url + '&' + list_params
1152 list_page = self._download_webpage(list_url, video_id)
1153 caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
e3dc22ca 1154 original_lang_node = caption_list.find('track')
a733eb6c 1155 if not original_lang_node or original_lang_node.attrib.get('kind') != 'asr' :
1156 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
1157 return {}
1158 original_lang = original_lang_node.attrib['lang_code']
1159
1160 sub_lang_list = {}
1161 for lang_node in caption_list.findall('target'):
1162 sub_lang = lang_node.attrib['lang_code']
1163 params = compat_urllib_parse.urlencode({
1164 'lang': original_lang,
1165 'tlang': sub_lang,
1166 'fmt': sub_format,
1167 'ts': timestamp,
1168 'kind': 'asr',
1169 })
1170 sub_lang_list[sub_lang] = caption_url + '&' + params
1171 return sub_lang_list
 1172 # An extractor error can be raised by the download process if there are
1173 # no automatic captions but there are subtitles
1174 except (KeyError, ExtractorError):
1175 self._downloader.report_warning(err_msg)
1176 return {}
1177
1178 def _print_formats(self, formats):
1179 print('Available formats:')
1180 for x in formats:
1181 print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
1182 self._video_dimensions.get(x, '???'),
836a086c 1183 ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
1184
1185 def _extract_id(self, url):
1186 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1187 if mobj is None:
1188 raise ExtractorError(u'Invalid URL: %s' % url)
1189 video_id = mobj.group(2)
1190 return video_id
1191
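    # _extract_id applies the same _VALID_URL as suitable(), e.g.
    # (illustrative):
    #   _extract_id('http://www.youtube.com/watch?v=BaW_jenozKc') -> 'BaW_jenozKc'
    #   _extract_id('http://youtu.be/BaW_jenozKc')                -> 'BaW_jenozKc'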
1192 def _get_video_url_list(self, url_map):
1193 """
1194 Transform a dictionary in the format {itag:url} to a list of (itag, url)
1195 with the requested formats.
1196 """
1197 req_format = self._downloader.params.get('format', None)
1198 format_limit = self._downloader.params.get('format_limit', None)
1199 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1200 if format_limit is not None and format_limit in available_formats:
1201 format_list = available_formats[available_formats.index(format_limit):]
1202 else:
1203 format_list = available_formats
1204 existing_formats = [x for x in format_list if x in url_map]
1205 if len(existing_formats) == 0:
1206 raise ExtractorError(u'no known formats available for video')
1207 if self._downloader.params.get('listformats', None):
1208 self._print_formats(existing_formats)
1209 return
1210 if req_format is None or req_format == 'best':
1211 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1212 elif req_format == 'worst':
1213 video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
1214 elif req_format in ('-1', 'all'):
1215 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1216 else:
 1217 # Specific formats. We pick the first in a slash-delimited sequence.
1218 # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
1219 # available in the specified format. For example,
1220 # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1221 # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
1222 # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
1223 req_formats = req_format.split('/')
1224 video_url_list = None
1225 for rf in req_formats:
1226 if rf in url_map:
1227 video_url_list = [(rf, url_map[rf])]
1228 break
1229 if rf in self._video_formats_map:
1230 for srf in self._video_formats_map[rf]:
1231 if srf in url_map:
1232 video_url_list = [(srf, url_map[srf])]
1233 break
1234 else:
1235 continue
1236 break
1237 if video_url_list is None:
1238 raise ExtractorError(u'requested format not available')
1239 return video_url_list
1240
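    # Worked example of the selection above (illustrative): with
    # url_map = {'18': ..., '43': ...} and req_format = '22/mp4/18',
    # itag '22' is absent, 'mp4' then expands via _video_formats_map['mp4']
    # (['38', '37', '22', '18']) and matches '18', so the result is
    # [('18', url_map['18'])].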
1241 def _extract_from_m3u8(self, manifest_url, video_id):
1242 url_map = {}
1243 def _get_urls(_manifest):
1244 lines = _manifest.split('\n')
1245 urls = filter(lambda l: l and not l.startswith('#'),
1246 lines)
1247 return urls
1248 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1249 formats_urls = _get_urls(manifest)
1250 for format_url in formats_urls:
890f62e8 1251 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1252 url_map[itag] = format_url
1253 return url_map
1254
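    # The m3u8 manifest is parsed very loosely: '#EXT...' tag lines are
    # dropped and the itag is read straight out of each variant URL, e.g.
    # (illustrative line):
    #   https://manifest.googlevideo.com/api/.../itag/93/.../index.m3u8 -> '93'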
1255 def _extract_annotations(self, video_id):
1256 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1257 return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
1258
1259 def _real_extract(self, url):
1260 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1261 mobj = re.search(self._NEXT_URL_RE, url)
1262 if mobj:
1263 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
1264 video_id = self._extract_id(url)
1265
1266 # Get video webpage
1267 self.report_video_webpage_download(video_id)
1268 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
1269 request = compat_urllib_request.Request(url)
1270 try:
1271 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
1272 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1273 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
1274
1275 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
1276
1277 # Attempt to extract SWF player URL
e0df6211 1278 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1279 if mobj is not None:
1280 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1281 else:
1282 player_url = None
1283
1284 # Get video info
1285 self.report_video_info_webpage_download(video_id)
1286 if re.search(r'player-age-gate-content">', video_webpage) is not None:
1287 self.report_age_confirmation()
1288 age_gate = True
1289 # We simulate the access to the video from www.youtube.com/v/{video_id}
 1290 # this can be viewed without logging in to Youtube
1291 data = compat_urllib_parse.urlencode({'video_id': video_id,
1292 'el': 'embedded',
1293 'gl': 'US',
1294 'hl': 'en',
1295 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1296 'asv': 3,
1297 'sts':'1588',
1298 })
1299 video_info_url = 'https://www.youtube.com/get_video_info?' + data
1300 video_info_webpage = self._download_webpage(video_info_url, video_id,
1301 note=False,
1302 errnote='unable to download video info webpage')
1303 video_info = compat_parse_qs(video_info_webpage)
1304 else:
1305 age_gate = False
1306 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1307 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1308 % (video_id, el_type))
1309 video_info_webpage = self._download_webpage(video_info_url, video_id,
1310 note=False,
1311 errnote='unable to download video info webpage')
1312 video_info = compat_parse_qs(video_info_webpage)
1313 if 'token' in video_info:
1314 break
1315 if 'token' not in video_info:
1316 if 'reason' in video_info:
9a82b238 1317 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
1318 else:
1319 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
1320
1321 # Check for "rental" videos
1322 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1323 raise ExtractorError(u'"rental" videos not supported')
1324
1325 # Start extracting information
1326 self.report_information_extraction(video_id)
1327
1328 # uploader
1329 if 'author' not in video_info:
1330 raise ExtractorError(u'Unable to extract uploader name')
1331 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
1332
1333 # uploader_id
1334 video_uploader_id = None
1335 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1336 if mobj is not None:
1337 video_uploader_id = mobj.group(1)
1338 else:
1339 self._downloader.report_warning(u'unable to extract uploader nickname')
1340
1341 # title
1342 if 'title' in video_info:
1343 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
1344 else:
1345 self._downloader.report_warning(u'Unable to extract video title')
1346 video_title = u'_'
1347
1348 # thumbnail image
1349 # We try first to get a high quality image:
1350 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1351 video_webpage, re.DOTALL)
1352 if m_thumb is not None:
1353 video_thumbnail = m_thumb.group(1)
1354 elif 'thumbnail_url' not in video_info:
c5e8d7af 1355 self._downloader.report_warning(u'unable to extract video thumbnail')
f490e77e 1356 video_thumbnail = None
1357 else: # don't panic if we can't find it
1358 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
1359
1360 # upload date
1361 upload_date = None
1362 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1363 if mobj is not None:
1364 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1365 upload_date = unified_strdate(upload_date)
1366
1367 # description
1368 video_description = get_element_by_id("eow-description", video_webpage)
1369 if video_description:
1370 video_description = clean_html(video_description)
1371 else:
1372 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1373 if fd_mobj:
1374 video_description = unescapeHTML(fd_mobj.group(1))
1375 else:
1376 video_description = u''
1377
1378 # subtitles
d82134c3 1379 video_subtitles = self.extract_subtitles(video_id, video_webpage)
c5e8d7af 1380
c5e8d7af 1381 if self._downloader.params.get('listsubtitles', False):
d665f8d3 1382 self._list_available_subtitles(video_id, video_webpage)
1383 return
1384
1385 if 'length_seconds' not in video_info:
1386 self._downloader.report_warning(u'unable to extract video duration')
1387 video_duration = ''
1388 else:
1389 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
1390
1391 # annotations
1392 video_annotations = None
1393 if self._downloader.params.get('writeannotations', False):
1394 video_annotations = self._extract_annotations(video_id)
1395
c5e8d7af 1396 # Decide which formats to download
1397
1398 try:
1399 mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
1400 if not mobj:
1401 raise ValueError('Could not find vevo ID')
1402 info = json.loads(mobj.group(1))
1403 args = info['args']
1404 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
 1405 # these signatures are encrypted
44d46655 1406 if 'url_encoded_fmt_stream_map' not in args:
f10503db 1407 raise ValueError(u'No stream_map present') # caught below
1408 m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
1409 if m_s is not None:
1410 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
c5e8d7af 1411 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
cde846b3 1412 m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
b7a68384 1413 if m_s is not None:
1414 if 'url_encoded_fmt_stream_map' in video_info:
1415 video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
1416 else:
1417 video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
211fbc13 1418 elif 'adaptive_fmts' in video_info:
1419 if 'url_encoded_fmt_stream_map' in video_info:
1420 video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
1421 else:
1422 video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']
1423 except ValueError:
1424 pass
1425
1426 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1427 self.report_rtmp_download()
1428 video_url_list = [(None, video_info['conn'][0])]
1429 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1430 if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
1431 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1432 url_map = {}
1433 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
1434 url_data = compat_parse_qs(url_data_str)
1435 if 'itag' in url_data and 'url' in url_data:
1436 url = url_data['url'][0]
1437 if 'sig' in url_data:
1438 url += '&signature=' + url_data['sig'][0]
1439 elif 's' in url_data:
e0df6211 1440 encrypted_sig = url_data['s'][0]
769fda3c 1441 if self._downloader.params.get('verbose'):
c108eb73 1442 if age_gate:
1443 if player_url is None:
1444 player_version = 'unknown'
1445 else:
1446 player_version = self._search_regex(
1447 r'-(.+)\.swf$', player_url,
1448 u'flash player', fatal=False)
e0df6211 1449 player_desc = 'flash player %s' % player_version
c108eb73 1450 else:
1451 player_version = self._search_regex(
1452 r'html5player-(.+?)\.js', video_webpage,
c108eb73 1453 'html5 player', fatal=False)
1454 player_desc = u'html5 player %s' % player_version
1455
1456 parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
5a76c651 1457 self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
1458 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
1459
83799698 1460 if not age_gate:
1461 jsplayer_url_json = self._search_regex(
1462 r'"assets":.+?"js":\s*("[^"]+")',
1463 video_webpage, u'JS player URL')
83799698 1464 player_url = json.loads(jsplayer_url_json)
e0df6211 1465
1466 signature = self._decrypt_signature(
1467 encrypted_sig, video_id, player_url, age_gate)
1468 url += '&signature=' + signature
1469 if 'ratebypass' not in url:
1470 url += '&ratebypass=yes'
1471 url_map[url_data['itag'][0]] = url
1472 video_url_list = self._get_video_url_list(url_map)
1473 if not video_url_list:
c5e8d7af 1474 return
1475 elif video_info.get('hlsvp'):
1476 manifest_url = video_info['hlsvp'][0]
1477 url_map = self._extract_from_m3u8(manifest_url, video_id)
1478 video_url_list = self._get_video_url_list(url_map)
1479 if not video_url_list:
1480 return
1481
c5e8d7af 1482 else:
9abb3204 1483 raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
1484
1485 results = []
1486 for format_param, video_real_url in video_url_list:
1487 # Extension
1488 video_extension = self._video_extensions.get(format_param, 'flv')
1489
1490 video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
1491 self._video_dimensions.get(format_param, '???'),
836a086c 1492 ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')
1493
1494 results.append({
1495 'id': video_id,
1496 'url': video_real_url,
1497 'uploader': video_uploader,
1498 'uploader_id': video_uploader_id,
1499 'upload_date': upload_date,
1500 'title': video_title,
1501 'ext': video_extension,
1502 'format': video_format,
1503 'thumbnail': video_thumbnail,
1504 'description': video_description,
1505 'player_url': player_url,
1506 'subtitles': video_subtitles,
8dbe9899 1507 'duration': video_duration,
cfadd183 1508 'age_limit': 18 if age_gate else 0,
1fb07d10 1509 'annotations': video_annotations
1510 })
1511 return results
1512
1513class YoutubePlaylistIE(InfoExtractor):
0f818663 1514 IE_DESC = u'YouTube.com playlists'
1515 _VALID_URL = r"""(?:
1516 (?:https?://)?
1517 (?:\w+\.)?
1518 youtube\.com/
1519 (?:
1520 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1521 \? (?:.*?&)*? (?:p|a|list)=
1522 | p/
1523 )
c626a3d9 1524 ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
1525 .*
1526 |
c626a3d9 1527 ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
1528 )"""
1529 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
1530 _MAX_RESULTS = 50
1531 IE_NAME = u'youtube:playlist'
1532
1533 @classmethod
1534 def suitable(cls, url):
1535 """Receives a URL and returns True if suitable for this IE."""
1536 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1537
1538 def _real_extract(self, url):
1539 # Extract playlist id
1540 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1541 if mobj is None:
1542 raise ExtractorError(u'Invalid URL: %s' % url)
1543 playlist_id = mobj.group(1) or mobj.group(2)
1544
1545 # Check if it's a video-specific URL
7c61bd36 1546 query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
1547 if 'v' in query_dict:
1548 video_id = query_dict['v'][0]
1549 if self._downloader.params.get('noplaylist'):
1550 self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
1551 return self.url_result('https://www.youtube.com/watch?v=' + video_id, 'Youtube')
1552 else:
1553 self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))
1554
1555 # Download playlist videos from API
1556 videos = []
1557
755eb032 1558 for page_num in itertools.count(1):
1559 start_index = self._MAX_RESULTS * (page_num - 1) + 1
1560 if start_index >= 1000:
1561 self._downloader.report_warning(u'Max number of results reached')
1562 break
1563 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
1564 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
1565
1566 try:
1567 response = json.loads(page)
1568 except ValueError as err:
1569 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
1570
1571 if 'feed' not in response:
1572 raise ExtractorError(u'Got a malformed response from YouTube API')
1573 playlist_title = response['feed']['title']['$t']
1574 if 'entry' not in response['feed']:
1575 # Number of videos is a multiple of self._MAX_RESULTS
1576 break
1577
1578 for entry in response['feed']['entry']:
1579 index = entry['yt$position']['$t']
c215217e
JMF
1580 if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
1581 videos.append((
1582 index,
1583 'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']
1584 ))
c5e8d7af 1585
c5e8d7af
PH
1586 videos = [v[1] for v in sorted(videos)]
1587
20c3893f 1588 url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
c5e8d7af
PH
1589 return [self.playlist_result(url_results, playlist_id, playlist_title)]
1590
1591
1592class YoutubeChannelIE(InfoExtractor):
0f818663 1593 IE_DESC = u'YouTube.com channels'
c5e8d7af
PH
1594 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1595 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1596 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
252580c5 1597 _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
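# Paging sketch (editor addition): _MORE_PAGES_URL is requested with
# increasing page numbers; each JSON response carries the video links in
# 'content_html' and signals that further pages exist by including
# _MORE_PAGES_INDICATOR in its 'load_more_widget_html' field
# (see _real_extract below).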
c5e8d7af
PH
1598 IE_NAME = u'youtube:channel'
1599
1600 def extract_videos_from_page(self, page):
1601 ids_in_page = []
1602 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1603 if mobj.group(1) not in ids_in_page:
1604 ids_in_page.append(mobj.group(1))
1605 return ids_in_page
1606
1607 def _real_extract(self, url):
1608 # Extract channel id
1609 mobj = re.match(self._VALID_URL, url)
1610 if mobj is None:
1611 raise ExtractorError(u'Invalid URL: %s' % url)
1612
1613 # Download channel page
1614 channel_id = mobj.group(1)
1615 video_ids = []
1616 pagenum = 1
1617
1618 url = self._TEMPLATE_URL % (channel_id, pagenum)
1619 page = self._download_webpage(url, channel_id,
1620 u'Downloading page #%s' % pagenum)
1621
1622 # Extract video identifiers
1623 ids_in_page = self.extract_videos_from_page(page)
1624 video_ids.extend(ids_in_page)
1625
1626 # Download any subsequent channel pages using the json-based channel_ajax query
1627 if self._MORE_PAGES_INDICATOR in page:
755eb032 1628 for pagenum in itertools.count(1):
c5e8d7af
PH
1629 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1630 page = self._download_webpage(url, channel_id,
1631 u'Downloading page #%s' % pagenum)
1632
1633 page = json.loads(page)
1634
1635 ids_in_page = self.extract_videos_from_page(page['content_html'])
1636 video_ids.extend(ids_in_page)
1637
1638 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1639 break
1640
1641 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1642
1643 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
20c3893f 1644 url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
c5e8d7af
PH
1645 return [self.playlist_result(url_entries, channel_id)]
1646
1647
1648class YoutubeUserIE(InfoExtractor):
0f818663 1649 IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
57da92b7 1650 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
c5e8d7af
PH
1651 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1652 _GDATA_PAGE_SIZE = 50
fd9cf738 1653 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
c5e8d7af
PH
1654 IE_NAME = u'youtube:user'
1655
e3ea4790 1656 @classmethod
f4b05232 1657 def suitable(cls, url):
e3ea4790
JMF
1658 # Don't return True if the url can be extracted with other youtube
1659 # extractors; the regex is too permissive and would match their URLs as well.
1660 other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
1661 if any(ie.suitable(url) for ie in other_ies): return False
f4b05232
JMF
1662 else: return super(YoutubeUserIE, cls).suitable(url)
1663
c5e8d7af
PH
1664 def _real_extract(self, url):
1665 # Extract username
1666 mobj = re.match(self._VALID_URL, url)
1667 if mobj is None:
1668 raise ExtractorError(u'Invalid URL: %s' % url)
1669
1670 username = mobj.group(1)
1671
1672 # Download video ids using YouTube Data API. Result size per
1673 # query is limited (currently to 50 videos) so we need to query
1674 # page by page until a page returns no video ids, which means we have
1675 # got them all.
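# Illustrative (editor addition): start-index goes 1, 51, 101, ... with
# max-results=50 (_GDATA_PAGE_SIZE) per request, until a page returns fewer
# than _GDATA_PAGE_SIZE ids.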
1676
1677 video_ids = []
c5e8d7af 1678
755eb032 1679 for pagenum in itertools.count(0):
c5e8d7af
PH
1680 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1681
1682 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1683 page = self._download_webpage(gdata_url, username,
1684 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE - 1))
1685
fd9cf738
JMF
1686 try:
1687 response = json.loads(page)
1688 except ValueError as err:
1689 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
71c82637
JMF
1690 if 'entry' not in response['feed']:
1691 # Number of videos is a multiple of self._GDATA_PAGE_SIZE
1692 break
fd9cf738 1693
c5e8d7af
PH
1694 # Extract video identifiers
1695 ids_in_page = []
fd9cf738
JMF
1696 for entry in response['feed']['entry']:
1697 ids_in_page.append(entry['id']['$t'].split('/')[-1])
c5e8d7af
PH
1698 video_ids.extend(ids_in_page)
1699
1700 # A little optimization - if the current page is not
1701 # "full", i.e. it contains fewer than _GDATA_PAGE_SIZE video ids,
1702 # we can assume that this page is the last one - there
1703 # are no more ids on further pages, so there is no need to query
1704 # again.
1705
1706 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1707 break
1708
c5e8d7af 1709 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
20c3893f 1710 url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
c5e8d7af 1711 return [self.playlist_result(url_results, playlist_title = username)]
b05654f0
PH
1712
1713class YoutubeSearchIE(SearchInfoExtractor):
0f818663 1714 IE_DESC = u'YouTube.com searches'
b05654f0
PH
1715 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1716 _MAX_RESULTS = 1000
1717 IE_NAME = u'youtube:search'
1718 _SEARCH_KEY = 'ytsearch'
1719
1720 def report_download_page(self, query, pagenum):
1721 """Report attempt to download search page with given number."""
1722 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1723
1724 def _get_n_results(self, query, n):
1725 """Get a specified number of results for a query"""
1726
1727 video_ids = []
1728 pagenum = 0
1729 limit = n
1730
1731 while (50 * pagenum) < limit:
1732 self.report_download_page(query, pagenum+1)
1733 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1734 request = compat_urllib_request.Request(result_url)
1735 try:
1736 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1737 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1738 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1739 api_response = json.loads(data)['data']
1740
1741 if 'items' not in api_response:
1742 raise ExtractorError(u'[youtube] No video results')
1743
1744 new_ids = list(video['id'] for video in api_response['items'])
1745 video_ids += new_ids
1746
1747 limit = min(n, api_response['totalItems'])
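# Editor note: limit starts out as n and is clamped to totalItems here, so
# the loop above never requests pages beyond what YouTube reports as
# available for the query.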
1748 pagenum += 1
1749
1750 if len(video_ids) > n:
1751 video_ids = video_ids[:n]
1752 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1753 return self.playlist_result(videos, query)
75dff0ee
JMF
1754
1755
1756class YoutubeShowIE(InfoExtractor):
0f818663 1757 IE_DESC = u'YouTube.com (multi-season) shows'
75dff0ee
JMF
1758 _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
1759 IE_NAME = u'youtube:show'
1760
1761 def _real_extract(self, url):
1762 mobj = re.match(self._VALID_URL, url)
1763 show_name = mobj.group(1)
1764 webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
1765 # There's one playlist for each season of the show
1766 m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
1767 self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
1768 return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist') for season in m_seasons]
04cc9617
JMF
1769
1770
b2e8bc1b 1771class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
d7ae0639
JMF
1772 """
1773 Base class for extractors that fetch info from
1774 http://www.youtube.com/feed_ajax
1775 Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
1776 """
b2e8bc1b 1777 _LOGIN_REQUIRED = True
04cc9617 1778 _PAGING_STEP = 30
43ba5456
JMF
1779 # use action_load_personal_feed instead of action_load_system_feed
1780 _PERSONAL_FEED = False
04cc9617 1781
d7ae0639
JMF
1782 @property
1783 def _FEED_TEMPLATE(self):
43ba5456
JMF
1784 action = 'action_load_system_feed'
1785 if self._PERSONAL_FEED:
1786 action = 'action_load_personal_feed'
1787 return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)
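# e.g. for the subscriptions feed this expands to
# http://www.youtube.com/feed_ajax?action_load_system_feed=1&feed_name=subscriptions&paging=%s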
d7ae0639
JMF
1788
1789 @property
1790 def IE_NAME(self):
1791 return u'youtube:%s' % self._FEED_NAME
04cc9617 1792
81f0259b 1793 def _real_initialize(self):
b2e8bc1b 1794 self._login()
81f0259b 1795
04cc9617
JMF
1796 def _real_extract(self, url):
1797 feed_entries = []
1798 # The step argument of itertools.count() is only available in Python 2.7 or higher, so the paging offset is computed by hand
1799 for i in itertools.count(0):
1800 paging = i*self._PAGING_STEP
d7ae0639
JMF
1801 info = self._download_webpage(self._FEED_TEMPLATE % paging,
1802 u'%s feed' % self._FEED_NAME,
04cc9617
JMF
1803 u'Downloading page %s' % i)
1804 info = json.loads(info)
1805 feed_html = info['feed_html']
43ba5456 1806 m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
04cc9617
JMF
1807 ids = orderedSet(m.group(1) for m in m_ids)
1808 feed_entries.extend(self.url_result(id, 'Youtube') for id in ids)
1809 if info['paging'] is None:
1810 break
d7ae0639
JMF
1811 return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1812
1813class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
1814 IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
1815 _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
1816 _FEED_NAME = 'subscriptions'
1817 _PLAYLIST_TITLE = u'Youtube Subscriptions'
1818
1819class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
1820 IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
1821 _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
1822 _FEED_NAME = 'recommended'
1823 _PLAYLIST_TITLE = u'Youtube Recommended videos'
c626a3d9 1824
43ba5456
JMF
1825class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
1826 IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
1827 _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
1828 _FEED_NAME = 'watch_later'
1829 _PLAYLIST_TITLE = u'Youtube Watch Later'
1830 _PAGING_STEP = 100
1831 _PERSONAL_FEED = True
c626a3d9
JMF
1832
1833class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
1834 IE_NAME = u'youtube:favorites'
1835 IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
c7a7750d 1836 _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
c626a3d9
JMF
1837 _LOGIN_REQUIRED = True
1838
1839 def _real_extract(self, url):
1840 webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
1841 playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, u'favourites playlist id')
1842 return self.url_result(playlist_id, 'YoutubePlaylist')
15870e90
PH
1843
1844
1845class YoutubeTruncatedURLIE(InfoExtractor):
1846 IE_NAME = 'youtube:truncated_url'
1847 IE_DESC = False # Do not list
1848 _VALID_URL = r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$'
1849
1850 def _real_extract(self, url):
1851 raise ExtractorError(
1852 u'Did you forget to quote the URL? Remember that & is a meta '
1853 u'character in most shells, so you want to put the URL in quotes, '
1854 u'like youtube-dl '
1855 u'\'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\''
1856 u' (or simply youtube-dl BaW_jenozKc ).',
1857 expected=True)