# Scraped from the jfr.im gitweb mirror of yt-dlp.git:
#   youtube_dl/extractor/youtube.py
# Commit subject: "Do not warn if fallback is without alternatives
# (because we did not get the flash player URL)"
1 # coding: utf-8
2
3 import collections
4 import errno
5 import io
6 import itertools
7 import json
8 import os.path
9 import re
10 import socket
11 import string
12 import struct
13 import traceback
14 import xml.etree.ElementTree
15 import zlib
16
17 from .common import InfoExtractor, SearchInfoExtractor
18 from .subtitles import SubtitlesInfoExtractor
19 from ..utils import (
20 compat_chr,
21 compat_http_client,
22 compat_parse_qs,
23 compat_urllib_error,
24 compat_urllib_parse,
25 compat_urllib_request,
26 compat_str,
27
28 clean_html,
29 get_element_by_id,
30 ExtractorError,
31 unescapeHTML,
32 unified_strdate,
33 orderedSet,
34 write_json_file,
35 )
36
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def _set_language(self):
        """Request the English/US variant of the site.

        Returns True on success, False (after a warning) on network errors.
        """
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return False
        return True

    def _login(self):
        """Log into the Google account given via --username/--password/.netrc.

        Returns True if the login succeeded, False otherwise.  Raises
        ExtractorError when _LOGIN_REQUIRED is set but no credentials are
        available.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return False

        # Scrape the hidden GALX and dsh fields out of the login form; they
        # have to be echoed back in the POST below.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)
        # Bugfix: previously a missing field stayed None and crashed below in
        # v.encode('utf-8') with an AttributeError.  GALX is mandatory for
        # the POST; dsh is best-effort, so an empty value is sent instead.
        if galx is None:
            self._downloader.report_warning(u'unable to log in: could not extract login form (GALX field missing)')
            return False
        if dsh is None:
            dsh = u''

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'dsh': dsh,
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # Google re-serves the login form when credentials are rejected
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return False
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return False
        return True

    def _confirm_age(self):
        """Submit the age-verification form; raises ExtractorError on failure."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
        return True

    def _real_initialize(self):
        # Set language / log in / confirm age, stopping at the first failure.
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
143
144
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    # Verbose regex covering watch pages, embeds, youtu.be short links and a
    # bare 11-character video ID; group 2 always captures the video ID.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)?    # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/                                          # just youtu.be/xxxx
                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Extracts the real target from /watch?...&next_url=... redirect URLs
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
                          # Apple HTTP Live Streaming
                          '96', '95', '94', '93', '92', '132', '151',
                          # 3D
                          '85', '84', '102', '83', '101', '82', '100',
                          # Dash video
                          '138', '137', '248', '136', '247', '135', '246',
                          '245', '244', '134', '243', '133', '242', '160',
                          # Dash audio
                          '141', '172', '140', '171', '139',
                          ]
    # Same itags, but free (webm) formats ranked above equal-quality mp4/flv
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
                                      # Apple HTTP Live Streaming
                                      '96', '95', '94', '93', '92', '132', '151',
                                      # 3D
                                      '85', '102', '84', '101', '83', '100', '82',
                                      # Dash video
                                      '138', '248', '137', '247', '136', '246', '245',
                                      '244', '135', '243', '134', '242', '133', '160',
                                      # Dash audio
                                      '172', '141', '171', '140', '139',
                                      ]
    # Container extension -> itags using it, best first (for --format ext)
    _video_formats_map = {
        'flv': ['35', '34', '6', '5'],
        '3gp': ['36', '17', '13'],
        'mp4': ['38', '37', '22', '18'],
        'webm': ['46', '45', '44', '43'],
    }
    # itag -> file extension
    _video_extensions = {
        '13': '3gp',
        '17': '3gp',
        '18': 'mp4',
        '22': 'mp4',
        '36': '3gp',
        '37': 'mp4',
        '38': 'mp4',
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',

        # 3d videos
        '82': 'mp4',
        '83': 'mp4',
        '84': 'mp4',
        '85': 'mp4',
        '100': 'webm',
        '101': 'webm',
        '102': 'webm',

        # Apple HTTP Live Streaming
        '92': 'mp4',
        '93': 'mp4',
        '94': 'mp4',
        '95': 'mp4',
        '96': 'mp4',
        '132': 'mp4',
        '151': 'mp4',

        # Dash mp4
        '133': 'mp4',
        '134': 'mp4',
        '135': 'mp4',
        '136': 'mp4',
        '137': 'mp4',
        '138': 'mp4',
        '139': 'mp4',
        '140': 'mp4',
        '141': 'mp4',
        '160': 'mp4',

        # Dash webm
        '171': 'webm',
        '172': 'webm',
        '242': 'webm',
        '243': 'webm',
        '244': 'webm',
        '245': 'webm',
        '246': 'webm',
        '247': 'webm',
        '248': 'webm',
    }
    # itag -> display resolution ("HxW"), resolution class ("720p") or, for
    # DASH audio itags, the audio bitrate
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '36': '240x320',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
        '82': '360p',
        '83': '480p',
        '84': '720p',
        '85': '1080p',
        '92': '240p',
        '93': '360p',
        '94': '480p',
        '95': '720p',
        '96': '1080p',
        '100': '360p',
        '101': '480p',
        '102': '720p',
        '132': '240p',
        '151': '72p',
        '133': '240p',
        '134': '360p',
        '135': '480p',
        '136': '720p',
        '137': '1080p',
        '138': '>1080p',
        '139': '48k',
        '140': '128k',
        '141': '256k',
        '160': '192p',
        '171': '128k',
        '172': '256k',
        '242': '240p',
        '243': '360p',
        '244': '480p',
        '245': '480p',
        '246': '480p',
        '247': '720p',
        '248': '1080p',
    }
    # itag -> label appended to the format listing ("3D", "DASH Video", ...)
    _special_itags = {
        '82': '3D',
        '83': '3D',
        '84': '3D',
        '85': '3D',
        '100': '3D',
        '101': '3D',
        '102': '3D',
        '133': 'DASH Video',
        '134': 'DASH Video',
        '135': 'DASH Video',
        '136': 'DASH Video',
        '137': 'DASH Video',
        '138': 'DASH Video',
        '139': 'DASH Audio',
        '140': 'DASH Audio',
        '141': 'DASH Audio',
        '160': 'DASH Video',
        '171': 'DASH Audio',
        '172': 'DASH Audio',
        '242': 'DASH Video',
        '243': 'DASH Video',
        '244': 'DASH Video',
        '245': 'DASH Video',
        '246': 'DASH Video',
        '247': 'DASH Video',
        '248': 'DASH Video',
    }

    IE_NAME = u'youtube'
    # Test-suite fixtures consumed by test/test_download.py
    _TESTS = [
        {
            u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
            u"file": u"BaW_jenozKc.mp4",
            u"info_dict": {
                u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
                u"uploader": u"Philipp Hagemeister",
                u"uploader_id": u"phihag",
                u"upload_date": u"20121002",
                u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
            u"file": u"1ltcDfZMA3U.flv",
            u"note": u"Test VEVO video (#897)",
            u"info_dict": {
                u"upload_date": u"20070518",
                u"title": u"Maps - It Will Find You",
                u"description": u"Music video by Maps performing It Will Find You.",
                u"uploader": u"MuteUSA",
                u"uploader_id": u"MuteUSA"
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
            u"file": u"UxxajLWwzqY.mp4",
            u"note": u"Test generic use_cipher_signature video (#897)",
            u"info_dict": {
                u"upload_date": u"20120506",
                u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
                u"description": u"md5:3e2666e0a55044490499ea45fe9037b7",
                u"uploader": u"Icona Pop",
                u"uploader_id": u"IconaPop"
            }
        },
        {
            u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
            u"file": u"07FYdnEawAQ.mp4",
            u"note": u"Test VEVO video with age protection (#956)",
            u"info_dict": {
                u"upload_date": u"20130703",
                u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
                u"description": u"md5:64249768eec3bc4276236606ea996373",
                u"uploader": u"justintimberlakeVEVO",
                u"uploader_id": u"justintimberlakeVEVO"
            }
        },
        {
            u'url': u'https://www.youtube.com/watch?v=TGi3HqYrWHE',
            u'file': u'TGi3HqYrWHE.mp4',
            u'note': u'm3u8 video',
            u'info_dict': {
                u'title': u'Triathlon - Men - London 2012 Olympic Games',
                u'description': u'- Men - TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games',
                u'uploader': u'olympic',
                u'upload_date': u'20120807',
                u'uploader_id': u'olympic',
            },
            u'params': {
                u'skip_download': True,
            },
        },
    ]
397
398
399 @classmethod
400 def suitable(cls, url):
401 """Receives a URL and returns True if suitable for this IE."""
402 if YoutubePlaylistIE.suitable(url): return False
403 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
404
405 def __init__(self, *args, **kwargs):
406 super(YoutubeIE, self).__init__(*args, **kwargs)
407 self._player_cache = {}
408
409 def report_video_webpage_download(self, video_id):
410 """Report attempt to download video webpage."""
411 self.to_screen(u'%s: Downloading video webpage' % video_id)
412
413 def report_video_info_webpage_download(self, video_id):
414 """Report attempt to download video info webpage."""
415 self.to_screen(u'%s: Downloading video info webpage' % video_id)
416
417 def report_information_extraction(self, video_id):
418 """Report attempt to extract video information."""
419 self.to_screen(u'%s: Extracting video information' % video_id)
420
421 def report_unavailable_format(self, video_id, format):
422 """Report extracted video URL."""
423 self.to_screen(u'%s: Format %s not available' % (video_id, format))
424
425 def report_rtmp_download(self):
426 """Indicate the download will use the RTMP protocol."""
427 self.to_screen(u'RTMP download detected')
428
    def _extract_signature_function(self, video_id, player_url, slen):
        """Build the signature-decryption function for a given player.

        Downloads the JS or SWF player referenced by *player_url*, parses
        the decipher routine out of it, and returns a callable mapping an
        encrypted signature of length *slen* to the decrypted one.  Results
        are cached on disk (as the index permutation they compute) keyed by
        player type, player id and signature length.
        """
        id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
                        player_url)
        player_type = id_m.group('ext')  # 'js' or 'swf'
        player_id = id_m.group('id')

        # Read from filesystem cache
        func_id = '%s_%s_%d' % (player_type, player_id, slen)
        # func_id is used as a filename component; it must not contain path
        # separators
        assert os.path.basename(func_id) == func_id
        cache_dir = self._downloader.params.get('cachedir',
                                                u'~/.youtube-dl/cache')

        # --cache-dir NONE disables the cache entirely
        cache_enabled = cache_dir != u'NONE'
        if cache_enabled:
            cache_fn = os.path.join(os.path.expanduser(cache_dir),
                                    u'youtube-sigfuncs',
                                    func_id + '.json')
            try:
                with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                    # The cached spec is simply the list of source indices
                    # making up the decrypted signature.
                    cache_spec = json.load(cachef)
                    return lambda s: u''.join(s[i] for i in cache_spec)
            except IOError:
                pass  # No cache available

        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        if cache_enabled:
            try:
                # Run the function once on the identity string to recover
                # the index permutation it implements, then persist it.
                cache_res = res(map(compat_chr, range(slen)))
                cache_spec = [ord(c) for c in cache_res]
                try:
                    os.makedirs(os.path.dirname(cache_fn))
                except OSError as ose:
                    if ose.errno != errno.EEXIST:
                        raise
                write_json_file(cache_spec, cache_fn)
            except Exception:
                # Caching is best-effort; never fail the extraction over it
                tb = traceback.format_exc()
                self._downloader.report_warning(
                    u'Writing cache to %r failed: %s' % (cache_fn, tb))

        return res
485
486 def _print_sig_code(self, func, slen):
487 def gen_sig_code(idxs):
488 def _genslice(start, end, step):
489 starts = u'' if start == 0 else str(start)
490 ends = u':%d' % (end+step)
491 steps = u'' if step == 1 else (':%d' % step)
492 return u's[%s%s%s]' % (starts, ends, steps)
493
494 step = None
495 start = '(Never used)' # Quelch pyflakes warnings - start will be
496 # set as soon as step is set
497 for i, prev in zip(idxs[1:], idxs[:-1]):
498 if step is not None:
499 if i - prev == step:
500 continue
501 yield _genslice(start, prev, step)
502 step = None
503 continue
504 if i - prev in [-1, 1]:
505 step = i - prev
506 start = prev
507 continue
508 else:
509 yield u's[%d]' % prev
510 if step is None:
511 yield u's[%d]' % i
512 else:
513 yield _genslice(start, i, step)
514
515 cache_res = func(map(compat_chr, range(slen)))
516 cache_spec = [ord(c) for c in cache_res]
517 expr_code = u' + '.join(gen_sig_code(cache_spec))
518 code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
519 self.to_screen(u'Extracted signature function:\n' + code)
520
521 def _parse_sig_js(self, jscode):
522 funcname = self._search_regex(
523 r'signature=([a-zA-Z]+)', jscode,
524 u'Initial JS player signature function name')
525
526 functions = {}
527
528 def argidx(varname):
529 return string.lowercase.index(varname)
530
531 def interpret_statement(stmt, local_vars, allow_recursion=20):
532 if allow_recursion < 0:
533 raise ExtractorError(u'Recursion limit reached')
534
535 if stmt.startswith(u'var '):
536 stmt = stmt[len(u'var '):]
537 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
538 r'=(?P<expr>.*)$', stmt)
539 if ass_m:
540 if ass_m.groupdict().get('index'):
541 def assign(val):
542 lvar = local_vars[ass_m.group('out')]
543 idx = interpret_expression(ass_m.group('index'),
544 local_vars, allow_recursion)
545 assert isinstance(idx, int)
546 lvar[idx] = val
547 return val
548 expr = ass_m.group('expr')
549 else:
550 def assign(val):
551 local_vars[ass_m.group('out')] = val
552 return val
553 expr = ass_m.group('expr')
554 elif stmt.startswith(u'return '):
555 assign = lambda v: v
556 expr = stmt[len(u'return '):]
557 else:
558 raise ExtractorError(
559 u'Cannot determine left side of statement in %r' % stmt)
560
561 v = interpret_expression(expr, local_vars, allow_recursion)
562 return assign(v)
563
564 def interpret_expression(expr, local_vars, allow_recursion):
565 if expr.isdigit():
566 return int(expr)
567
568 if expr.isalpha():
569 return local_vars[expr]
570
571 m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
572 if m:
573 member = m.group('member')
574 val = local_vars[m.group('in')]
575 if member == 'split("")':
576 return list(val)
577 if member == 'join("")':
578 return u''.join(val)
579 if member == 'length':
580 return len(val)
581 if member == 'reverse()':
582 return val[::-1]
583 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
584 if slice_m:
585 idx = interpret_expression(
586 slice_m.group('idx'), local_vars, allow_recursion-1)
587 return val[idx:]
588
589 m = re.match(
590 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
591 if m:
592 val = local_vars[m.group('in')]
593 idx = interpret_expression(m.group('idx'), local_vars,
594 allow_recursion-1)
595 return val[idx]
596
597 m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
598 if m:
599 a = interpret_expression(m.group('a'),
600 local_vars, allow_recursion)
601 b = interpret_expression(m.group('b'),
602 local_vars, allow_recursion)
603 return a % b
604
605 m = re.match(
606 r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
607 if m:
608 fname = m.group('func')
609 if fname not in functions:
610 functions[fname] = extract_function(fname)
611 argvals = [int(v) if v.isdigit() else local_vars[v]
612 for v in m.group('args').split(',')]
613 return functions[fname](argvals)
614 raise ExtractorError(u'Unsupported JS expression %r' % expr)
615
616 def extract_function(funcname):
617 func_m = re.search(
618 r'function ' + re.escape(funcname) +
619 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
620 jscode)
621 argnames = func_m.group('args').split(',')
622
623 def resf(args):
624 local_vars = dict(zip(argnames, args))
625 for stmt in func_m.group('code').split(';'):
626 res = interpret_statement(stmt, local_vars)
627 return res
628 return resf
629
630 initial_function = extract_function(funcname)
631 return lambda s: initial_function([s])
632
    def _parse_sig_swf(self, file_contents):
        """Extract the signature-decipher function from an SWF player.

        Decompresses the SWF, locates the DoABC tag, parses just enough of
        the ABC (AVM2 ByteCode) format to find the methods of the
        SignatureDecipher class, and returns a callable mapping an
        encrypted signature string to the deciphered one, executed by a
        minimal AVM2 bytecode interpreter.
        """
        if file_contents[1:3] != b'WS':
            raise ExtractorError(
                u'Not an SWF file; header is %r' % file_contents[:3])
        if file_contents[:1] == b'C':
            # 'CWS': body after the 8-byte header is zlib-compressed
            content = zlib.decompress(file_contents[8:])
        else:
            raise NotImplementedError(u'Unsupported compression format %r' %
                                      file_contents[:1])

        def extract_tags(content):
            # Yield (tag_code, tag_body) pairs of the SWF tag stream.
            pos = 0
            while pos < len(content):
                header16 = struct.unpack('<H', content[pos:pos+2])[0]
                pos += 2
                tag_code = header16 >> 6
                tag_len = header16 & 0x3f
                if tag_len == 0x3f:
                    # Long tag: the real length follows in 4 extra bytes
                    tag_len = struct.unpack('<I', content[pos:pos+4])[0]
                    pos += 4
                assert pos+tag_len <= len(content)
                yield (tag_code, content[pos:pos+tag_len])
                pos += tag_len

        # Tag code 82 is DoABC (the embedded ActionScript bytecode)
        code_tag = next(tag
                        for tag_code, tag in extract_tags(content)
                        if tag_code == 82)
        # Skip the 4 flag bytes and the NUL-terminated tag name
        p = code_tag.index(b'\0', 4) + 1
        code_reader = io.BytesIO(code_tag[p:])

        # Parse ABC (AVM2 ByteCode)
        def read_int(reader=None):
            # Variable-length encoding: 7 bits per byte, high bit set means
            # another byte follows (at most 5 bytes).
            if reader is None:
                reader = code_reader
            res = 0
            shift = 0
            for _ in range(5):
                buf = reader.read(1)
                assert len(buf) == 1
                b = struct.unpack('<B', buf)[0]
                res = res | ((b & 0x7f) << shift)
                if b & 0x80 == 0:
                    break
                shift += 7
            return res

        def u30(reader=None):
            # 30-bit unsigned: the top nibble must be clear
            res = read_int(reader)
            assert res & 0xf0000000 == 0
            return res
        u32 = read_int

        def s32(reader=None):
            # Signed 32-bit: reinterpret the two's-complement value
            v = read_int(reader)
            if v & 0x80000000 != 0:
                v = - ((v ^ 0xffffffff) + 1)
            return v

        def read_string(reader=None):
            # Length-prefixed UTF-8 string
            if reader is None:
                reader = code_reader
            slen = u30(reader)
            resb = reader.read(slen)
            assert len(resb) == slen
            return resb.decode('utf-8')

        def read_bytes(count, reader=None):
            if reader is None:
                reader = code_reader
            resb = reader.read(count)
            assert len(resb) == count
            return resb

        def read_byte(reader=None):
            resb = read_bytes(1, reader=reader)
            res = struct.unpack('<B', resb)[0]
            return res

        # minor_version + major_version
        read_bytes(2 + 2)

        # Constant pool; ABC counts are one more than the number of entries,
        # entry 0 being implicit, hence the range(1, count) loops.
        int_count = u30()
        for _c in range(1, int_count):
            s32()
        uint_count = u30()
        for _c in range(1, uint_count):
            u32()
        double_count = u30()
        # NOTE(review): assumes double_count >= 1; a count of 0 would make
        # this read(-8) — TODO confirm real players always have doubles pool
        read_bytes((double_count-1) * 8)
        string_count = u30()
        constant_strings = [u'']
        for _c in range(1, string_count):
            s = read_string()
            constant_strings.append(s)
        namespace_count = u30()
        for _c in range(1, namespace_count):
            read_bytes(1)  # kind
            u30()  # name
        ns_set_count = u30()
        for _c in range(1, ns_set_count):
            count = u30()
            for _c2 in range(count):
                u30()
        multiname_count = u30()
        # kind byte -> number of u30 fields that follow it
        MULTINAME_SIZES = {
            0x07: 2,  # QName
            0x0d: 2,  # QNameA
            0x0f: 1,  # RTQName
            0x10: 1,  # RTQNameA
            0x11: 0,  # RTQNameL
            0x12: 0,  # RTQNameLA
            0x09: 2,  # Multiname
            0x0e: 2,  # MultinameA
            0x1b: 1,  # MultinameL
            0x1c: 1,  # MultinameLA
        }
        multinames = [u'']
        for _c in range(1, multiname_count):
            kind = u30()
            assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
            if kind == 0x07:
                # Only plain QNames resolve to a usable string name
                u30()  # namespace_idx
                name_idx = u30()
                multinames.append(constant_strings[name_idx])
            else:
                multinames.append('[MULTINAME kind: %d]' % kind)
                for _c2 in range(MULTINAME_SIZES[kind]):
                    u30()

        # Methods
        method_count = u30()
        MethodInfo = collections.namedtuple(
            'MethodInfo',
            ['NEED_ARGUMENTS', 'NEED_REST'])
        method_infos = []
        for method_id in range(method_count):
            param_count = u30()
            u30()  # return type
            for _ in range(param_count):
                u30()  # param type
            u30()  # name index (always 0 for youtube)
            flags = read_byte()
            if flags & 0x08 != 0:
                # Options present
                option_count = u30()
                for c in range(option_count):
                    u30()  # val
                    read_bytes(1)  # kind
            if flags & 0x80 != 0:
                # Param names present
                for _ in range(param_count):
                    u30()  # param name
            mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
            method_infos.append(mi)

        # Metadata
        metadata_count = u30()
        for _c in range(metadata_count):
            u30()  # name
            item_count = u30()
            for _c2 in range(item_count):
                u30()  # key
                u30()  # value

        def parse_traits_info():
            # Parse one trait; returns {method name -> method index} for
            # method-like traits, empty dict otherwise.
            trait_name_idx = u30()
            kind_full = read_byte()
            kind = kind_full & 0x0f
            attrs = kind_full >> 4
            methods = {}
            if kind in [0x00, 0x06]:  # Slot or Const
                u30()  # Slot id
                u30()  # type_name_idx
                vindex = u30()
                if vindex != 0:
                    read_byte()  # vkind
            elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
                u30()  # disp_id
                method_idx = u30()
                methods[multinames[trait_name_idx]] = method_idx
            elif kind == 0x04:  # Class
                u30()  # slot_id
                u30()  # classi
            elif kind == 0x05:  # Function
                u30()  # slot_id
                function_idx = u30()
                methods[function_idx] = multinames[trait_name_idx]
            else:
                raise ExtractorError(u'Unsupported trait kind %d' % kind)

            if attrs & 0x4 != 0:  # Metadata present
                metadata_count = u30()
                for _c3 in range(metadata_count):
                    u30()  # metadata index

            return methods

        # Classes
        TARGET_CLASSNAME = u'SignatureDecipher'
        searched_idx = multinames.index(TARGET_CLASSNAME)
        searched_class_id = None
        class_count = u30()
        for class_id in range(class_count):
            name_idx = u30()
            if name_idx == searched_idx:
                # We found the class we're looking for!
                searched_class_id = class_id
            u30()  # super_name idx
            flags = read_byte()
            if flags & 0x08 != 0:  # Protected namespace is present
                u30()  # protected_ns_idx
            intrf_count = u30()
            for _c2 in range(intrf_count):
                u30()
            u30()  # iinit
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        if searched_class_id is None:
            raise ExtractorError(u'Target class %r not found' %
                                 TARGET_CLASSNAME)

        # Class (static) info: collect the method name<->index mappings of
        # the target class's traits.
        method_names = {}
        method_idxs = {}
        for class_id in range(class_count):
            u30()  # cinit
            trait_count = u30()
            for _c2 in range(trait_count):
                trait_methods = parse_traits_info()
                if class_id == searched_class_id:
                    method_names.update(trait_methods.items())
                    method_idxs.update(dict(
                        (idx, name)
                        for name, idx in trait_methods.items()))

        # Scripts
        script_count = u30()
        for _c in range(script_count):
            u30()  # init
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        # Method bodies: keep the raw bytecode of the methods we care about
        method_body_count = u30()
        Method = collections.namedtuple('Method', ['code', 'local_count'])
        methods = {}
        for _c in range(method_body_count):
            method_idx = u30()
            u30()  # max_stack
            local_count = u30()
            u30()  # init_scope_depth
            u30()  # max_scope_depth
            code_length = u30()
            code = read_bytes(code_length)
            if method_idx in method_idxs:
                m = Method(code, local_count)
                methods[method_idxs[method_idx]] = m
            exception_count = u30()
            for _c2 in range(exception_count):
                u30()  # from
                u30()  # to
                u30()  # target
                u30()  # exc_type
                u30()  # var_name
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        # Sanity: the whole DoABC body must have been consumed and every
        # wanted method must have a body.
        assert p + code_reader.tell() == len(code_tag)
        assert len(methods) == len(method_idxs)

        # name -> interpreted Python function, filled lazily
        method_pyfunctions = {}

        def extract_function(func_name):
            # Build (and memoize) a Python callable interpreting the AVM2
            # bytecode of the named method.
            if func_name in method_pyfunctions:
                return method_pyfunctions[func_name]
            if func_name not in methods:
                raise ExtractorError(u'Cannot find function %r' % func_name)
            m = methods[func_name]

            def resfunc(args):
                # Local registers: 0 = this, then the arguments
                registers = ['(this)'] + list(args) + [None] * m.local_count
                stack = []
                coder = io.BytesIO(m.code)
                while True:
                    opcode = struct.unpack('!B', coder.read(1))[0]
                    if opcode == 36:  # pushbyte
                        v = struct.unpack('!B', coder.read(1))[0]
                        stack.append(v)
                    elif opcode == 44:  # pushstring
                        idx = u30(coder)
                        stack.append(constant_strings[idx])
                    elif opcode == 48:  # pushscope
                        # We don't implement the scope register, so we'll just
                        # ignore the popped value
                        stack.pop()
                    elif opcode == 70:  # callproperty
                        index = u30(coder)
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        obj = stack.pop()
                        if mname == u'split':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, compat_str)
                            if args[0] == u'':
                                res = list(obj)
                            else:
                                res = obj.split(args[0])
                            stack.append(res)
                        elif mname == u'slice':
                            assert len(args) == 1
                            assert isinstance(args[0], int)
                            assert isinstance(obj, list)
                            res = obj[args[0]:]
                            stack.append(res)
                        elif mname == u'join':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, list)
                            res = args[0].join(obj)
                            stack.append(res)
                        elif mname in method_pyfunctions:
                            stack.append(method_pyfunctions[mname](args))
                        else:
                            raise NotImplementedError(
                                u'Unsupported property %r on %r'
                                % (mname, obj))
                    elif opcode == 72:  # returnvalue
                        res = stack.pop()
                        return res
                    elif opcode == 79:  # callpropvoid
                        index = u30(coder)
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        obj = stack.pop()
                        if mname == u'reverse':
                            assert isinstance(obj, list)
                            obj.reverse()
                        else:
                            raise NotImplementedError(
                                u'Unsupported (void) property %r on %r'
                                % (mname, obj))
                    elif opcode == 93:  # findpropstrict
                        index = u30(coder)
                        mname = multinames[index]
                        res = extract_function(mname)
                        stack.append(res)
                    elif opcode == 97:  # setproperty
                        index = u30(coder)
                        value = stack.pop()
                        idx = stack.pop()
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        assert isinstance(idx, int)
                        obj[idx] = value
                    elif opcode == 98:  # getlocal
                        index = u30(coder)
                        stack.append(registers[index])
                    elif opcode == 99:  # setlocal
                        index = u30(coder)
                        value = stack.pop()
                        registers[index] = value
                    elif opcode == 102:  # getproperty
                        index = u30(coder)
                        pname = multinames[index]
                        if pname == u'length':
                            obj = stack.pop()
                            assert isinstance(obj, list)
                            stack.append(len(obj))
                        else:  # Assume attribute access
                            idx = stack.pop()
                            assert isinstance(idx, int)
                            obj = stack.pop()
                            assert isinstance(obj, list)
                            stack.append(obj[idx])
                    elif opcode == 128:  # coerce
                        u30(coder)
                    elif opcode == 133:  # coerce_s
                        assert isinstance(stack[-1], (type(None), compat_str))
                    elif opcode == 164:  # modulo
                        value2 = stack.pop()
                        value1 = stack.pop()
                        res = value1 % value2
                        stack.append(res)
                    elif opcode == 208:  # getlocal_0
                        stack.append(registers[0])
                    elif opcode == 209:  # getlocal_1
                        stack.append(registers[1])
                    elif opcode == 210:  # getlocal_2
                        stack.append(registers[2])
                    elif opcode == 211:  # getlocal_3
                        stack.append(registers[3])
                    elif opcode == 214:  # setlocal_2
                        registers[2] = stack.pop()
                    elif opcode == 215:  # setlocal_3
                        registers[3] = stack.pop()
                    else:
                        raise NotImplementedError(
                            u'Unsupported opcode %d' % opcode)

            method_pyfunctions[func_name] = resfunc
            return resfunc

        initial_function = extract_function(u'decipher')
        return lambda s: initial_function([s])
1046
1047 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
1048 """Turn the encrypted s field into a working signature"""
1049
1050 if player_url is not None:
1051 try:
1052 if player_url not in self._player_cache:
1053 func = self._extract_signature_function(
1054 video_id, player_url, len(s)
1055 )
1056 self._player_cache[player_url] = func
1057 func = self._player_cache[player_url]
1058 if self._downloader.params.get('youtube_print_sig_code'):
1059 self._print_sig_code(func, len(s))
1060 return func(s)
1061 except Exception:
1062 tb = traceback.format_exc()
1063 self._downloader.report_warning(
1064 u'Automatic signature extraction failed: ' + tb)
1065
1066 self._downloader.report_warning(
1067 u'Warning: Falling back to static signature algorithm')
1068 return self._static_decrypt_signature(
1069 s, video_id, player_url, age_gate)
1070
1071 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
1072 if age_gate:
1073 # The videos with age protection use another player, so the
1074 # algorithms can be different.
1075 if len(s) == 86:
1076 return s[2:63] + s[82] + s[64:82] + s[63]
1077
1078 if len(s) == 93:
1079 return s[86:29:-1] + s[88] + s[28:5:-1]
1080 elif len(s) == 92:
1081 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
1082 elif len(s) == 91:
1083 return s[84:27:-1] + s[86] + s[26:5:-1]
1084 elif len(s) == 90:
1085 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
1086 elif len(s) == 89:
1087 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
1088 elif len(s) == 88:
1089 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
1090 elif len(s) == 87:
1091 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
1092 elif len(s) == 86:
1093 return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53]
1094 elif len(s) == 85:
1095 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
1096 elif len(s) == 84:
1097 return s[81:36:-1] + s[0] + s[35:2:-1]
1098 elif len(s) == 83:
1099 return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]
1100 elif len(s) == 82:
1101 return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54]
1102 elif len(s) == 81:
1103 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1104 elif len(s) == 80:
1105 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
1106 elif len(s) == 79:
1107 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1108
1109 else:
1110 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
1111
def _get_available_subtitles(self, video_id):
    """Return a {lang_code: timedtext_url} dict of the video's subtitle tracks.

    Returns an empty dict (after a warning) when the list cannot be
    downloaded or no tracks are advertised.
    """
    try:
        sub_list = self._download_webpage(
            'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
            video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
        return {}

    # The requested format is the same for every track; read it once.
    sub_fmt = self._downloader.params.get('subtitlesformat')
    sub_lang_list = {}
    for (_, lang) in re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list):
        sub_lang_list[lang] = (
            u'http://www.youtube.com/api/timedtext?'
            + compat_urllib_parse.urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': sub_fmt,
            }))

    if not sub_lang_list:
        self._downloader.report_warning(u'video doesn\'t have subtitles')
        return {}
    return sub_lang_list
1136
def _get_available_automatic_caption(self, video_id, webpage):
    """We need the webpage for getting the captions url, pass it as an
    argument to speed up the process.

    Returns a {lang_code: caption_url} dict of auto-generated (ASR)
    caption translations, or an empty dict (after a warning) if none
    are available.
    """
    sub_format = self._downloader.params.get('subtitlesformat')
    self.to_screen(u'%s: Looking for automatic captions' % video_id)
    # The player config JSON embedded in the watch page carries the
    # caption service URL and timestamp.
    mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
    err_msg = u'Couldn\'t find automatic captions for %s' % video_id
    if mobj is None:
        self._downloader.report_warning(err_msg)
        return {}
    player_config = json.loads(mobj.group(1))
    try:
        args = player_config[u'args']
        caption_url = args[u'ttsurl']
        timestamp = args[u'timestamp']
        # We get the available subtitles
        list_params = compat_urllib_parse.urlencode({
            'type': 'list',
            'tlangs': 1,
            'asrs': 1,
        })
        list_url = caption_url + '&' + list_params
        list_page = self._download_webpage(list_url, video_id)
        caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
        # Only the first <track> is inspected; kind="asr" marks an
        # automatically-recognized (speech-to-text) track.
        original_lang_node = caption_list.find('track')
        if original_lang_node.attrib.get('kind') != 'asr' :
            self._downloader.report_warning(u'Video doesn\'t have automatic captions')
            return {}
        original_lang = original_lang_node.attrib['lang_code']

        # Each <target> is a language the ASR track can be translated to.
        sub_lang_list = {}
        for lang_node in caption_list.findall('target'):
            sub_lang = lang_node.attrib['lang_code']
            params = compat_urllib_parse.urlencode({
                'lang': original_lang,
                'tlang': sub_lang,
                'fmt': sub_format,
                'ts': timestamp,
                'kind': 'asr',
            })
            sub_lang_list[sub_lang] = caption_url + '&' + params
        return sub_lang_list
    # An ExtractorError can be raised by the download process if there are
    # no automatic captions but there are subtitles; KeyError covers a
    # player config without ttsurl/timestamp.
    except (KeyError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
1184
1185 def _print_formats(self, formats):
1186 print('Available formats:')
1187 for x in formats:
1188 print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
1189 self._video_dimensions.get(x, '???'),
1190 ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
1191
1192 def _extract_id(self, url):
1193 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1194 if mobj is None:
1195 raise ExtractorError(u'Invalid URL: %s' % url)
1196 video_id = mobj.group(2)
1197 return video_id
1198
def _get_video_url_list(self, url_map):
    """
    Transform a dictionary in the format {itag:url} to a list of (itag, url)
    with the requested formats.

    Returns None when --list-formats was requested (formats are printed
    instead); raises ExtractorError when nothing in url_map is known or
    the requested format is unavailable.
    """
    req_format = self._downloader.params.get('format', None)
    format_limit = self._downloader.params.get('format_limit', None)
    # Formats are assumed ordered best-first in these class-level lists.
    available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
    if format_limit is not None and format_limit in available_formats:
        # Drop everything better than the requested quality cap.
        format_list = available_formats[available_formats.index(format_limit):]
    else:
        format_list = available_formats
    existing_formats = [x for x in format_list if x in url_map]
    if len(existing_formats) == 0:
        raise ExtractorError(u'no known formats available for video')
    if self._downloader.params.get('listformats', None):
        self._print_formats(existing_formats)
        return
    if req_format is None or req_format == 'best':
        video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
    elif req_format == 'worst':
        video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
    elif req_format in ('-1', 'all'):
        video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
    else:
        # Specific formats. We pick the first in a slash-delimeted sequence.
        # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
        # available in the specified format. For example,
        # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
        # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
        # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
        req_formats = req_format.split('/')
        video_url_list = None
        for rf in req_formats:
            if rf in url_map:
                # Exact itag match wins immediately.
                video_url_list = [(rf, url_map[rf])]
                break
            if rf in self._video_formats_map:
                # rf is an extension name ('mp4', 'flv', ...); scan its
                # itags (assumed best-first) for one we actually have.
                for srf in self._video_formats_map[rf]:
                    if srf in url_map:
                        video_url_list = [(srf, url_map[srf])]
                        break
                else:
                    # No itag of this extension available; try next token.
                    continue
                # Inner loop broke => a format was found; stop searching.
                break
        if video_url_list is None:
            raise ExtractorError(u'requested format not available')
    return video_url_list
1247
def _extract_from_m3u8(self, manifest_url, video_id):
    """Download an m3u8 manifest and map each itag to its format URL."""
    manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
    url_map = {}
    # Format URLs are the non-empty manifest lines that are not '#' tags.
    for format_url in manifest.split('\n'):
        if not format_url or format_url.startswith('#'):
            continue
        itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
        url_map[itag] = format_url
    return url_map
1261
def _real_extract(self, url):
    """Extract metadata and downloadable format URLs for a single video.

    Returns a list of info dicts (one per selected format), or None when
    only listing formats/subtitles was requested.
    """
    # A trailing 'watch?feature=...' with no v= usually means the shell ate
    # the rest of the URL at an unquoted '&'.
    if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
        self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self._extract_id(url)

    # Get video webpage
    self.report_video_webpage_download(video_id)
    url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
    request = compat_urllib_request.Request(url)
    try:
        video_webpage_bytes = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))

    video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

    # Attempt to extract SWF player URL (backslash-escaped inside the page JS)
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # Get video info
    self.report_video_info_webpage_download(video_id)
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        self.report_age_confirmation()
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        # NOTE(review): 'sts' 1588 looks like a hard-coded player timestamp
        # expected by get_video_info — confirm it still matches the player.
        data = compat_urllib_parse.urlencode({'video_id': video_id,
                                              'el': 'embedded',
                                              'gl': 'US',
                                              'hl': 'en',
                                              'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                                              'asv': 3,
                                              'sts':'1588',
                                              })
        video_info_url = 'https://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(video_info_url, video_id,
                                                    note=False,
                                                    errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
    else:
        age_gate = False
        # Try several 'el' variants until one of them yields a token.
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                              % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                                        note=False,
                                                        errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
    if 'token' not in video_info:
        if 'reason' in video_info:
            raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
        else:
            raise ExtractorError(u'"token" parameter not in video info for unknown reason')

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError(u'"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError(u'Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id (missing id is non-fatal)
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning(u'unable to extract uploader nickname')

    # title
    if 'title' not in video_info:
        raise ExtractorError(u'Unable to extract video title')
    video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning(u'unable to extract video thumbnail')
        video_thumbnail = ''
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date (normalize '/', ',', '-' separators before parsing)
    upload_date = None
    mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
    if mobj is not None:
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

    # description: prefer the page element, fall back to the meta tag
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        video_description = clean_html(video_description)
    else:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = u''

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)

    if self._downloader.params.get('listsubtitles', False):
        self._list_available_subtitles(video_id, video_webpage)
        return

    if 'length_seconds' not in video_info:
        self._downloader.report_warning(u'unable to extract video duration')
        video_duration = ''
    else:
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

    # Decide which formats to download

    try:
        mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
        if not mobj:
            raise ValueError('Could not find vevo ID')
        info = json.loads(mobj.group(1))
        args = info['args']
        # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
        # this signatures are encrypted
        m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
        if m_s is not None:
            self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
            video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
        # Merge adaptive formats (DASH) into the same comma-separated map.
        m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
        if m_s is not None:
            if 'url_encoded_fmt_stream_map' in video_info:
                video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
            else:
                video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
        elif 'adaptive_fmts' in video_info:
            if 'url_encoded_fmt_stream_map' in video_info:
                video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
            else:
                video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']
    except ValueError:
        # No player config found; proceed with whatever get_video_info gave us.
        pass

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        video_url_list = [(None, video_info['conn'][0])]
    elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
        if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' in url_data and 'url' in url_data:
                url = url_data['url'][0]
                if 'sig' in url_data:
                    # Plain (unencrypted) signature.
                    url += '&signature=' + url_data['sig'][0]
                elif 's' in url_data:
                    # Encrypted signature; needs the player-specific decryption.
                    encrypted_sig = url_data['s'][0]
                    if self._downloader.params.get('verbose'):
                        if age_gate:
                            if player_url is None:
                                player_version = 'unknown'
                            else:
                                player_version = self._search_regex(
                                    r'-(.+)\.swf$', player_url,
                                    u'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-(.+?)\.js', video_webpage,
                                'html5 player', fatal=False)
                            player_desc = u'html5 player %s' % player_version

                        parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                        self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                                       (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                    if not age_gate:
                        # Non-age-gated videos use the HTML5 JS player; its URL
                        # replaces any SWF player URL found earlier.
                        jsplayer_url_json = self._search_regex(
                            r'"assets":.+?"js":\s*("[^"]+")',
                            video_webpage, u'JS player URL')
                        player_url = json.loads(jsplayer_url_json)

                    signature = self._decrypt_signature(
                        encrypted_sig, video_id, player_url, age_gate)
                    url += '&signature=' + signature
                if 'ratebypass' not in url:
                    url += '&ratebypass=yes'
                url_map[url_data['itag'][0]] = url
        video_url_list = self._get_video_url_list(url_map)
        if not video_url_list:
            # --list-formats was requested; nothing to download.
            return
    elif video_info.get('hlsvp'):
        # Live/HLS streams expose an m3u8 manifest instead.
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        video_url_list = self._get_video_url_list(url_map)
        if not video_url_list:
            return

    else:
        raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

    results = []
    for format_param, video_real_url in video_url_list:
        # Extension
        video_extension = self._video_extensions.get(format_param, 'flv')

        video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
                                             self._video_dimensions.get(format_param, '???'),
                                             ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')

        results.append({
            'id':       video_id,
            'url':      video_real_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'format':   video_format,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
            'player_url':   player_url,
            'subtitles':    video_subtitles,
            'duration':     video_duration
        })
    return results
1507
class YoutubePlaylistIE(InfoExtractor):
    """Extract all videos of a playlist through the GData playlists API."""
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?:
                    (?:https?://)?
                    (?:\w+\.)?
                    youtube\.com/
                    (?:
                       (?:course|view_play_list|my_playlists|artist|playlist|watch)
                       \? (?:.*?&)*? (?:p|a|list)=
                    |  p/
                    )
                    ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
                    .*
                 |
                    ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
                 )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    # Page size of the GData API (also the paging step below).
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL uses verbose-mode whitespace, so re.VERBOSE is required.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        videos = []

        for page_num in itertools.count(1):
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            # The API refuses start indices past 1000.
            if start_index >= 1000:
                self._downloader.report_warning(u'Max number of results reached')
                break
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            # Collect (position, url) so entries can be sorted by playlist
            # position regardless of API page order.
            for entry in response['feed']['entry']:
                index = entry['yt$position']['$t']
                if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
                    videos.append((
                        index,
                        'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']
                    ))

        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1575
1576
class YoutubeChannelIE(InfoExtractor):
    """Extract all videos of a YouTube channel."""
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from *page*, deduplicated, in order."""
        found_ids = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            candidate = match.group(1)
            if candidate not in found_ids:
                found_ids.append(candidate)
        return found_ids

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        channel_id = mobj.group(1)

        # The first page is served as plain HTML.
        video_ids = []
        pagenum = 1
        page = self._download_webpage(
            self._TEMPLATE_URL % (channel_id, pagenum), channel_id,
            u'Downloading page #%s' % pagenum)
        video_ids.extend(self.extract_videos_from_page(page))

        # Subsequent pages come from the JSON-based channel_ajax endpoint.
        if self._MORE_PAGES_INDICATOR in page:
            for pagenum in itertools.count(1):
                page = self._download_webpage(
                    self._MORE_PAGES_URL % (pagenum, channel_id), channel_id,
                    u'Downloading page #%s' % pagenum)

                page = json.loads(page)

                video_ids.extend(self.extract_videos_from_page(page['content_html']))

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        watch_urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(watch_url, 'Youtube') for watch_url in watch_urls]
        return [self.playlist_result(url_entries, channel_id)]
1631
1632
class YoutubeUserIE(InfoExtractor):
    """Extract a user's uploaded videos through the GData API."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        # Don't claim the URL if any other youtube extractor matches it:
        # this regex is too permissive and would shadow them otherwise.
        for name, klass in globals().items():
            if name.endswith('IE') and klass is not cls and klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        username = mobj.group(1)

        # The GData API caps each response at _GDATA_PAGE_SIZE results, so
        # request page after page until one comes back short (or empty).
        video_ids = []
        for page_idx in itertools.count(0):
            start_index = page_idx * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            # Each entry id ends in '.../<video_id>'.
            page_ids = [entry['id']['$t'].split('/')[-1]
                        for entry in response['feed']['entry']]
            video_ids.extend(page_ids)

            # A page that is not "full" must be the last one - there are
            # no more ids on further pages, so no need to query again.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

        watch_urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(watch_url, 'Youtube') for watch_url in watch_urls]
        return [self.playlist_result(url_results, playlist_title=username)]
1697
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor backed by the GData videos API ("ytsearchN:query")."""
    IE_DESC = u'YouTube.com searches'
    # start-index is 1-based; each API page holds at most 50 results.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50 * pagenum) + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            # 'items' is absent when the query matched no videos at all.
            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            video_ids += [video['id'] for video in api_response['items']]

            # The API reports the true total; never page past it.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                  for video_id in video_ids]
        return self.playlist_result(videos, query)
1739
1740
class YoutubeShowIE(InfoExtractor):
    """Resolve a show page into one playlist result per season."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        season_matches = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_matches)))
        results = []
        for season in season_matches:
            results.append(self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist'))
        return results
1754
1755
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    # Feeds are only visible when logged in.
    _LOGIN_REQUIRED = True
    # Number of entries requested per feed_ajax page.
    _PAGING_STEP = 30
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        # URL template with one remaining %s placeholder for the paging offset.
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        feed_entries = []
        # The step argument is available only in 2.7 or higher
        for i in itertools.count(0):
            paging = i*self._PAGING_STEP
            info = self._download_webpage(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % i)
            info = json.loads(info)
            # The response embeds rendered HTML; scrape the watch links
            # out of it and deduplicate while preserving order.
            feed_html = info['feed_html']
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            ids = orderedSet(m.group(1) for m in m_ids)
            feed_entries.extend(self.url_result(id, 'Youtube') for id in ids)
            # A null 'paging' value marks the final page.
            if info['paging'] is None:
                break
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1797
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    # Feed of new uploads from the logged-in user's subscribed channels.
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
1803
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # Feed of videos YouTube recommends to the logged-in user.
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
1809
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    # The user's personal "Watch Later" list.
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    # Larger page size than the base class default.
    _PAGING_STEP = 100
    # Watch Later is per-user, so use the personal-feed ajax action.
    _PERSONAL_FEED = True
1817
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor for the logged-in user's favourites list ("ytfav")."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    # The my_favorites page is only visible when logged in.
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds a regular playlist id; find it and
        # delegate the actual extraction to the playlist extractor.
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')