]> jfr.im git - yt-dlp.git/blob - youtube_dl/extractor/youtube.py
[youtube] Improve source code quality
[yt-dlp.git] / youtube_dl / extractor / youtube.py
1 # coding: utf-8
2
3 import collections
4 import errno
5 import io
6 import itertools
7 import json
8 import os.path
9 import re
10 import socket
11 import string
12 import struct
13 import traceback
14 import xml.etree.ElementTree
15 import zlib
16
17 from .common import InfoExtractor, SearchInfoExtractor
18 from .subtitles import SubtitlesInfoExtractor
19 from ..utils import (
20 compat_chr,
21 compat_http_client,
22 compat_parse_qs,
23 compat_urllib_error,
24 compat_urllib_parse,
25 compat_urllib_request,
26 compat_str,
27
28 clean_html,
29 get_element_by_id,
30 ExtractorError,
31 unescapeHTML,
32 unified_strdate,
33 orderedSet,
34 write_json_file,
35 )
36
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def _set_language(self):
        """Request the English/US variant of the site; return True on success."""
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return False
        return True

    def _login(self):
        """Log in with the configured credentials; return True on success.

        Any recoverable failure is reported as a warning and returns False.
        Raises ExtractorError only when _LOGIN_REQUIRED is set but no
        credentials are available.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return False

        # Hidden anti-forgery tokens embedded in the login form.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)
        # BUG FIX: previously a failed token match left galx/dsh as None and
        # the dict encoding below crashed with AttributeError on
        # None.encode('utf-8').  Degrade gracefully instead: abort without
        # GALX, and send an empty dsh value.
        if galx is None:
            self._downloader.report_warning(u'unable to log in: could not find GALX token on login page')
            return False
        if dsh is None:
            dsh = u''

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'dsh': dsh,
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the credentials were wrong.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return False
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return False
        return True

    def _confirm_age(self):
        """Confirm the age gate; return True or raise ExtractorError."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
        return True

    def _real_initialize(self):
        # Set language and log in before the first extraction; age
        # confirmation only makes sense once those cookies are in place.
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
143
144
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    # Matches watch pages, embeds, youtu.be short links and naked video IDs.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)?    # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/                                          # just youtu.be/xxxx
                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
                          # Apple HTTP Live Streaming
                          '96', '95', '94', '93', '92', '132', '151',
                          # 3D
                          '85', '84', '102', '83', '101', '82', '100',
                          # Dash video
                          '138', '137', '248', '136', '247', '135', '246',
                          '245', '244', '134', '243', '133', '242', '160',
                          # Dash audio
                          '141', '172', '140', '171', '139',
                          ]
    # Same itags, but free (WebM) containers preferred at equal quality.
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
                                      # Apple HTTP Live Streaming
                                      '96', '95', '94', '93', '92', '132', '151',
                                      # 3D
                                      '85', '102', '84', '101', '83', '100', '82',
                                      # Dash video
                                      '138', '248', '137', '247', '136', '246', '245',
                                      '244', '135', '243', '134', '242', '133', '160',
                                      # Dash audio
                                      '172', '141', '171', '140', '139',
                                      ]
    # Container extension -> itags using it, best quality first.
    _video_formats_map = {
        'flv': ['35', '34', '6', '5'],
        '3gp': ['36', '17', '13'],
        'mp4': ['38', '37', '22', '18'],
        'webm': ['46', '45', '44', '43'],
    }
    # itag -> file extension.
    _video_extensions = {
        '13': '3gp',
        '17': '3gp',
        '18': 'mp4',
        '22': 'mp4',
        '36': '3gp',
        '37': 'mp4',
        '38': 'mp4',
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',

        # 3d videos
        '82': 'mp4',
        '83': 'mp4',
        '84': 'mp4',
        '85': 'mp4',
        '100': 'webm',
        '101': 'webm',
        '102': 'webm',

        # Apple HTTP Live Streaming
        '92': 'mp4',
        '93': 'mp4',
        '94': 'mp4',
        '95': 'mp4',
        '96': 'mp4',
        '132': 'mp4',
        '151': 'mp4',

        # Dash mp4
        '133': 'mp4',
        '134': 'mp4',
        '135': 'mp4',
        '136': 'mp4',
        '137': 'mp4',
        '138': 'mp4',
        '139': 'mp4',
        '140': 'mp4',
        '141': 'mp4',
        '160': 'mp4',

        # Dash webm
        '171': 'webm',
        '172': 'webm',
        '242': 'webm',
        '243': 'webm',
        '244': 'webm',
        '245': 'webm',
        '246': 'webm',
        '247': 'webm',
        '248': 'webm',
    }
    # itag -> display label: "HxW" pixel dimensions, "NNNp" resolution,
    # or an audio bitrate for audio-only DASH itags.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '36': '240x320',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
        '82': '360p',
        '83': '480p',
        '84': '720p',
        '85': '1080p',
        '92': '240p',
        '93': '360p',
        '94': '480p',
        '95': '720p',
        '96': '1080p',
        '100': '360p',
        '101': '480p',
        '102': '720p',
        '132': '240p',
        '151': '72p',
        '133': '240p',
        '134': '360p',
        '135': '480p',
        '136': '720p',
        '137': '1080p',
        '138': '>1080p',
        '139': '48k',
        '140': '128k',
        '141': '256k',
        '160': '192p',
        '171': '128k',
        '172': '256k',
        '242': '240p',
        '243': '360p',
        '244': '480p',
        '245': '480p',
        '246': '480p',
        '247': '720p',
        '248': '1080p',
    }
    # itag -> qualifier shown next to the format in listings (3D / DASH).
    _special_itags = {
        '82': '3D',
        '83': '3D',
        '84': '3D',
        '85': '3D',
        '100': '3D',
        '101': '3D',
        '102': '3D',
        '133': 'DASH Video',
        '134': 'DASH Video',
        '135': 'DASH Video',
        '136': 'DASH Video',
        '137': 'DASH Video',
        '138': 'DASH Video',
        '139': 'DASH Audio',
        '140': 'DASH Audio',
        '141': 'DASH Audio',
        '160': 'DASH Video',
        '171': 'DASH Audio',
        '172': 'DASH Audio',
        '242': 'DASH Video',
        '243': 'DASH Video',
        '244': 'DASH Video',
        '245': 'DASH Video',
        '246': 'DASH Video',
        '247': 'DASH Video',
        '248': 'DASH Video',
    }

    IE_NAME = u'youtube'
    _TESTS = [
        {
            u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
            u"file": u"BaW_jenozKc.mp4",
            u"info_dict": {
                u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
                u"uploader": u"Philipp Hagemeister",
                u"uploader_id": u"phihag",
                u"upload_date": u"20121002",
                u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
            u"file": u"1ltcDfZMA3U.flv",
            u"note": u"Test VEVO video (#897)",
            u"info_dict": {
                u"upload_date": u"20070518",
                u"title": u"Maps - It Will Find You",
                u"description": u"Music video by Maps performing It Will Find You.",
                u"uploader": u"MuteUSA",
                u"uploader_id": u"MuteUSA"
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
            u"file": u"UxxajLWwzqY.mp4",
            u"note": u"Test generic use_cipher_signature video (#897)",
            u"info_dict": {
                u"upload_date": u"20120506",
                u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
                u"description": u"md5:3e2666e0a55044490499ea45fe9037b7",
                u"uploader": u"Icona Pop",
                u"uploader_id": u"IconaPop"
            }
        },
        {
            u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
            u"file": u"07FYdnEawAQ.mp4",
            u"note": u"Test VEVO video with age protection (#956)",
            u"info_dict": {
                u"upload_date": u"20130703",
                u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
                u"description": u"md5:64249768eec3bc4276236606ea996373",
                u"uploader": u"justintimberlakeVEVO",
                u"uploader_id": u"justintimberlakeVEVO"
            }
        },
        {
            u'url': u'https://www.youtube.com/watch?v=TGi3HqYrWHE',
            u'file': u'TGi3HqYrWHE.mp4',
            u'note': u'm3u8 video',
            u'info_dict': {
                u'title': u'Triathlon - Men - London 2012 Olympic Games',
                u'description': u'- Men - TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games',
                u'uploader': u'olympic',
                u'upload_date': u'20120807',
                u'uploader_id': u'olympic',
            },
            u'params': {
                u'skip_download': True,
            },
        },
    ]
397
398
399 @classmethod
400 def suitable(cls, url):
401 """Receives a URL and returns True if suitable for this IE."""
402 if YoutubePlaylistIE.suitable(url): return False
403 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
404
    def __init__(self, *args, **kwargs):
        super(YoutubeIE, self).__init__(*args, **kwargs)
        # Per-instance cache: player URL -> signature-decryption function.
        self._player_cache = {}
408
    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)
412
    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)
416
    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)
420
    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))
424
    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')
428
429 def _extract_signature_function(self, video_id, player_url, slen):
430 id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
431 player_url)
432 player_type = id_m.group('ext')
433 player_id = id_m.group('id')
434
435 # Read from filesystem cache
436 func_id = '%s_%s_%d' % (player_type, player_id, slen)
437 assert os.path.basename(func_id) == func_id
438 cache_dir = self._downloader.params.get('cachedir',
439 u'~/.youtube-dl/cache')
440
441 if cache_dir != u'NONE':
442 cache_fn = os.path.join(os.path.expanduser(cache_dir),
443 u'youtube-sigfuncs',
444 func_id + '.json')
445 try:
446 with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
447 cache_spec = json.load(cachef)
448 return lambda s: u''.join(s[i] for i in cache_spec)
449 except IOError:
450 pass # No cache available
451
452 if player_type == 'js':
453 code = self._download_webpage(
454 player_url, video_id,
455 note=u'Downloading %s player %s' % (player_type, player_id),
456 errnote=u'Download of %s failed' % player_url)
457 res = self._parse_sig_js(code)
458 elif player_type == 'swf':
459 urlh = self._request_webpage(
460 player_url, video_id,
461 note=u'Downloading %s player %s' % (player_type, player_id),
462 errnote=u'Download of %s failed' % player_url)
463 code = urlh.read()
464 res = self._parse_sig_swf(code)
465 else:
466 assert False, 'Invalid player type %r' % player_type
467
468 if cache_dir is not False:
469 try:
470 cache_res = res(map(compat_chr, range(slen)))
471 cache_spec = [ord(c) for c in cache_res]
472 try:
473 os.makedirs(os.path.dirname(cache_fn))
474 except OSError as ose:
475 if ose.errno != errno.EEXIST:
476 raise
477 write_json_file(cache_spec, cache_fn)
478 except Exception:
479 tb = traceback.format_exc()
480 self._downloader.report_warning(
481 u'Writing cache to %r failed: %s' % (cache_fn, tb))
482
483 return res
484
    def _print_sig_code(self, func, slen):
        """Print Python code equivalent to the extracted signature function.

        Applies *func* to the identity permutation of length *slen* and
        renders the resulting index permutation as a chain of Python slice
        expressions, so the static algorithm table can be updated by hand.
        """
        def gen_sig_code(idxs):
            # Collapse runs of consecutive (step +/-1) indices into slices.
            def _genslice(start, end, step):
                starts = u'' if start == 0 else str(start)
                ends = u':%d' % (end+step)
                steps = u'' if step == 1 else (':%d' % step)
                return u's[%s%s%s]' % (starts, ends, steps)

            # NOTE(review): when a descending run ends at index 0, end+step
            # evaluates to -1 and _genslice emits s[a:-1:-1] (an empty
            # slice) — verify the printed code for such permutations.
            step = None
            start = '(Never used)'  # Quelch pyflakes warnings - start will be
                                    # set as soon as step is set
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield u's[%d]' % prev
            if step is None:
                yield u's[%d]' % i
            else:
                yield _genslice(start, i, step)

        # NOTE: Python 2 semantics assumed — map() returns a list here.
        cache_res = func(map(compat_chr, range(slen)))
        cache_spec = [ord(c) for c in cache_res]
        expr_code = u' + '.join(gen_sig_code(cache_spec))
        code = u'if len(s) == %d:\n    return %s\n' % (slen, expr_code)
        self.to_screen(u'Extracted signature:\n' + code)
519
    def _parse_sig_js(self, jscode):
        """Extract the signature-decryption function from the HTML5 player JS.

        Locates the function assigned via ``signature=<name>`` and interprets
        its small JavaScript subset (assignments, member access, indexing,
        the %% operator, and calls to other player functions) with the nested
        helpers below.  Returns a callable mapping the encrypted signature
        string to the decrypted one.
        """
        funcname = self._search_regex(
            r'signature=([a-zA-Z]+)', jscode,
            u'Initial JS player signature function name')

        # Cache of already-interpreted JS functions, keyed by name.
        functions = {}

        def argidx(varname):
            # Map a single-letter variable name to a positional index.
            # NOTE(review): not referenced anywhere in this method —
            # presumably vestigial.  string.lowercase is Python 2 only
            # (Python 3 renamed it ascii_lowercase).
            return string.lowercase.index(varname)

        def interpret_statement(stmt, local_vars, allow_recursion=20):
            # Execute one statement (assignment or return); returns its value.
            if allow_recursion < 0:
                raise ExtractorError(u'Recursion limit reached')

            if stmt.startswith(u'var '):
                stmt = stmt[len(u'var '):]
            ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
                             r'=(?P<expr>.*)$', stmt)
            if ass_m:
                if ass_m.groupdict().get('index'):
                    # Indexed assignment: out[index] = expr
                    def assign(val):
                        lvar = local_vars[ass_m.group('out')]
                        idx = interpret_expression(ass_m.group('index'),
                                                   local_vars, allow_recursion)
                        assert isinstance(idx, int)
                        lvar[idx] = val
                        return val
                    expr = ass_m.group('expr')
                else:
                    # Plain assignment: out = expr
                    def assign(val):
                        local_vars[ass_m.group('out')] = val
                        return val
                    expr = ass_m.group('expr')
            elif stmt.startswith(u'return '):
                assign = lambda v: v
                expr = stmt[len(u'return '):]
            else:
                raise ExtractorError(
                    u'Cannot determine left side of statement in %r' % stmt)

            v = interpret_expression(expr, local_vars, allow_recursion)
            return assign(v)

        def interpret_expression(expr, local_vars, allow_recursion):
            # Evaluate an expression: int literal, variable, member access,
            # indexing, the % operator, or a call to another function.
            if expr.isdigit():
                return int(expr)

            if expr.isalpha():
                return local_vars[expr]

            m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
            if m:
                member = m.group('member')
                val = local_vars[m.group('in')]
                if member == 'split("")':
                    return list(val)
                if member == 'join("")':
                    return u''.join(val)
                if member == 'length':
                    return len(val)
                if member == 'reverse()':
                    return val[::-1]
                slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
                if slice_m:
                    idx = interpret_expression(
                        slice_m.group('idx'), local_vars, allow_recursion-1)
                    return val[idx:]

            m = re.match(
                r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
            if m:
                val = local_vars[m.group('in')]
                idx = interpret_expression(m.group('idx'), local_vars,
                                           allow_recursion-1)
                return val[idx]

            m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
            if m:
                a = interpret_expression(m.group('a'),
                                         local_vars, allow_recursion)
                b = interpret_expression(m.group('b'),
                                         local_vars, allow_recursion)
                return a % b

            m = re.match(
                r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
            if m:
                fname = m.group('func')
                if fname not in functions:
                    functions[fname] = extract_function(fname)
                argvals = [int(v) if v.isdigit() else local_vars[v]
                           for v in m.group('args').split(',')]
                return functions[fname](argvals)
            raise ExtractorError(u'Unsupported JS expression %r' % expr)

        def extract_function(funcname):
            # Locate ``function <name>(args){code}`` in the player source and
            # wrap it as a Python callable taking a list of argument values.
            func_m = re.search(
                r'function ' + re.escape(funcname) +
                r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
                jscode)
            argnames = func_m.group('args').split(',')

            def resf(args):
                local_vars = dict(zip(argnames, args))
                for stmt in func_m.group('code').split(';'):
                    res = interpret_statement(stmt, local_vars)
                # The value of the last statement (the return) is the result.
                return res
            return resf

        initial_function = extract_function(funcname)
        # The signature function takes the signature string as its only arg.
        return lambda s: initial_function([s])
631
    def _parse_sig_swf(self, file_contents):
        """Extract the signature-decryption function from an SWF player.

        Parses just enough of the SWF container and the embedded ABC
        (ActionScript ByteCode) block to find the ``SignatureDecipher``
        class, then interprets its methods with a minimal AVM2 stack
        machine.  Returns a callable mapping the encrypted signature string
        to the decrypted one.
        """
        if file_contents[1:3] != b'WS':
            raise ExtractorError(
                u'Not an SWF file; header is %r' % file_contents[:3])
        if file_contents[:1] == b'C':
            # 'CWS' signature: body after the 8-byte header is zlib-deflated.
            content = zlib.decompress(file_contents[8:])
        else:
            raise NotImplementedError(u'Unsupported compression format %r' %
                                      file_contents[:1])

        def extract_tags(content):
            # Yield (tag_code, tag_body) pairs from the SWF tag stream.
            pos = 0
            while pos < len(content):
                header16 = struct.unpack('<H', content[pos:pos+2])[0]
                pos += 2
                tag_code = header16 >> 6
                tag_len = header16 & 0x3f
                if tag_len == 0x3f:
                    # Long tag: a 32-bit length follows the short header.
                    tag_len = struct.unpack('<I', content[pos:pos+4])[0]
                    pos += 4
                assert pos+tag_len <= len(content)
                yield (tag_code, content[pos:pos+tag_len])
                pos += tag_len

        # Tag code 82 carries the ActionScript bytecode (DoABC).
        code_tag = next(tag
                        for tag_code, tag in extract_tags(content)
                        if tag_code == 82)
        # Skip flags + NUL-terminated name; the abcFile data follows.
        p = code_tag.index(b'\0', 4) + 1
        code_reader = io.BytesIO(code_tag[p:])

        # Parse ABC (AVM2 ByteCode)
        def read_int(reader=None):
            # Variable-length unsigned integer, 7 bits per byte, max 5 bytes.
            if reader is None:
                reader = code_reader
            res = 0
            shift = 0
            for _ in range(5):
                buf = reader.read(1)
                assert len(buf) == 1
                b = struct.unpack('<B', buf)[0]
                res = res | ((b & 0x7f) << shift)
                if b & 0x80 == 0:
                    break
                shift += 7
            return res

        def u30(reader=None):
            # 30-bit unsigned int: same wire format, top bits must be clear.
            res = read_int(reader)
            assert res & 0xf0000000 == 0
            return res
        u32 = read_int

        def s32(reader=None):
            # Signed 32-bit variant: reinterpret the two's-complement value.
            v = read_int(reader)
            if v & 0x80000000 != 0:
                v = - ((v ^ 0xffffffff) + 1)
            return v

        def read_string(reader=None):
            # Length-prefixed UTF-8 string.
            if reader is None:
                reader = code_reader
            slen = u30(reader)
            resb = reader.read(slen)
            assert len(resb) == slen
            return resb.decode('utf-8')

        def read_bytes(count, reader=None):
            if reader is None:
                reader = code_reader
            resb = reader.read(count)
            assert len(resb) == count
            return resb

        def read_byte(reader=None):
            resb = read_bytes(1, reader=reader)
            res = struct.unpack('<B', resb)[0]
            return res

        # minor_version + major_version
        read_bytes(2 + 2)

        # Constant pool (counts are 1-based; entry 0 is implicit).
        int_count = u30()
        for _c in range(1, int_count):
            s32()
        uint_count = u30()
        for _c in range(1, uint_count):
            u32()
        double_count = u30()
        read_bytes((double_count-1) * 8)
        string_count = u30()
        constant_strings = [u'']
        for _c in range(1, string_count):
            s = read_string()
            constant_strings.append(s)
        namespace_count = u30()
        for _c in range(1, namespace_count):
            read_bytes(1)  # kind
            u30()  # name
        ns_set_count = u30()
        for _c in range(1, ns_set_count):
            count = u30()
            for _c2 in range(count):
                u30()
        multiname_count = u30()
        # Number of u30 fields following each multiname kind byte.
        MULTINAME_SIZES = {
            0x07: 2,  # QName
            0x0d: 2,  # QNameA
            0x0f: 1,  # RTQName
            0x10: 1,  # RTQNameA
            0x11: 0,  # RTQNameL
            0x12: 0,  # RTQNameLA
            0x09: 2,  # Multiname
            0x0e: 2,  # MultinameA
            0x1b: 1,  # MultinameL
            0x1c: 1,  # MultinameLA
        }
        multinames = [u'']
        for _c in range(1, multiname_count):
            kind = u30()
            assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
            if kind == 0x07:
                # Only QNames are resolved to their string; the rest are
                # skipped but kept as placeholders to preserve indices.
                u30()  # namespace_idx
                name_idx = u30()
                multinames.append(constant_strings[name_idx])
            else:
                multinames.append('[MULTINAME kind: %d]' % kind)
                for _c2 in range(MULTINAME_SIZES[kind]):
                    u30()

        # Methods
        method_count = u30()
        MethodInfo = collections.namedtuple(
            'MethodInfo',
            ['NEED_ARGUMENTS', 'NEED_REST'])
        method_infos = []
        for method_id in range(method_count):
            param_count = u30()
            u30()  # return type
            for _ in range(param_count):
                u30()  # param type
            u30()  # name index (always 0 for youtube)
            flags = read_byte()
            if flags & 0x08 != 0:
                # Options present
                option_count = u30()
                for c in range(option_count):
                    u30()  # val
                    read_bytes(1)  # kind
            if flags & 0x80 != 0:
                # Param names present
                for _ in range(param_count):
                    u30()  # param name
            mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
            method_infos.append(mi)

        # Metadata
        metadata_count = u30()
        for _c in range(metadata_count):
            u30()  # name
            item_count = u30()
            for _c2 in range(item_count):
                u30()  # key
                u30()  # value

        def parse_traits_info():
            # Parse one trait; return {trait_name: method_index} for
            # method-like traits (empty dict for everything else).
            trait_name_idx = u30()
            kind_full = read_byte()
            kind = kind_full & 0x0f
            attrs = kind_full >> 4
            methods = {}
            if kind in [0x00, 0x06]:  # Slot or Const
                u30()  # Slot id
                u30()  # type_name_idx
                vindex = u30()
                if vindex != 0:
                    read_byte()  # vkind
            elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
                u30()  # disp_id
                method_idx = u30()
                methods[multinames[trait_name_idx]] = method_idx
            elif kind == 0x04:  # Class
                u30()  # slot_id
                u30()  # classi
            elif kind == 0x05:  # Function
                u30()  # slot_id
                function_idx = u30()
                methods[function_idx] = multinames[trait_name_idx]
            else:
                raise ExtractorError(u'Unsupported trait kind %d' % kind)

            if attrs & 0x4 != 0:  # Metadata present
                metadata_count = u30()
                for _c3 in range(metadata_count):
                    u30()  # metadata index

            return methods

        # Classes
        TARGET_CLASSNAME = u'SignatureDecipher'
        searched_idx = multinames.index(TARGET_CLASSNAME)
        searched_class_id = None
        class_count = u30()
        for class_id in range(class_count):
            name_idx = u30()
            if name_idx == searched_idx:
                # We found the class we're looking for!
                searched_class_id = class_id
            u30()  # super_name idx
            flags = read_byte()
            if flags & 0x08 != 0:  # Protected namespace is present
                u30()  # protected_ns_idx
            intrf_count = u30()
            for _c2 in range(intrf_count):
                u30()
            u30()  # iinit
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        if searched_class_id is None:
            raise ExtractorError(u'Target class %r not found' %
                                 TARGET_CLASSNAME)

        # class_info entries mirror instance_info; collect the method
        # name <-> index mappings of the target class only.
        method_names = {}
        method_idxs = {}
        for class_id in range(class_count):
            u30()  # cinit
            trait_count = u30()
            for _c2 in range(trait_count):
                trait_methods = parse_traits_info()
                if class_id == searched_class_id:
                    method_names.update(trait_methods.items())
                    method_idxs.update(dict(
                        (idx, name)
                        for name, idx in trait_methods.items()))

        # Scripts
        script_count = u30()
        for _c in range(script_count):
            u30()  # init
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        # Method bodies
        method_body_count = u30()
        Method = collections.namedtuple('Method', ['code', 'local_count'])
        methods = {}
        for _c in range(method_body_count):
            method_idx = u30()
            u30()  # max_stack
            local_count = u30()
            u30()  # init_scope_depth
            u30()  # max_scope_depth
            code_length = u30()
            code = read_bytes(code_length)
            if method_idx in method_idxs:
                # Keep only the bodies of the target class's methods.
                m = Method(code, local_count)
                methods[method_idxs[method_idx]] = m
            exception_count = u30()
            for _c2 in range(exception_count):
                u30()  # from
                u30()  # to
                u30()  # target
                u30()  # exc_type
                u30()  # var_name
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        # Sanity: we consumed the whole tag and found every wanted body.
        assert p + code_reader.tell() == len(code_tag)
        assert len(methods) == len(method_idxs)

        method_pyfunctions = {}

        def extract_function(func_name):
            # Compile the named ABC method into a Python callable (cached).
            if func_name in method_pyfunctions:
                return method_pyfunctions[func_name]
            if func_name not in methods:
                raise ExtractorError(u'Cannot find function %r' % func_name)
            m = methods[func_name]

            def resfunc(args):
                # Minimal AVM2 interpreter: only the opcodes the decipher
                # methods actually use are implemented.
                registers = ['(this)'] + list(args) + [None] * m.local_count
                stack = []
                coder = io.BytesIO(m.code)
                while True:
                    opcode = struct.unpack('!B', coder.read(1))[0]
                    if opcode == 36:  # pushbyte
                        v = struct.unpack('!B', coder.read(1))[0]
                        stack.append(v)
                    elif opcode == 44:  # pushstring
                        idx = u30(coder)
                        stack.append(constant_strings[idx])
                    elif opcode == 48:  # pushscope
                        # We don't implement the scope register, so we'll just
                        # ignore the popped value
                        stack.pop()
                    elif opcode == 70:  # callproperty
                        index = u30(coder)
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        obj = stack.pop()
                        if mname == u'split':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, compat_str)
                            if args[0] == u'':
                                res = list(obj)
                            else:
                                res = obj.split(args[0])
                            stack.append(res)
                        elif mname == u'slice':
                            assert len(args) == 1
                            assert isinstance(args[0], int)
                            assert isinstance(obj, list)
                            res = obj[args[0]:]
                            stack.append(res)
                        elif mname == u'join':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, list)
                            res = args[0].join(obj)
                            stack.append(res)
                        elif mname in method_pyfunctions:
                            stack.append(method_pyfunctions[mname](args))
                        else:
                            raise NotImplementedError(
                                u'Unsupported property %r on %r'
                                % (mname, obj))
                    elif opcode == 72:  # returnvalue
                        res = stack.pop()
                        return res
                    elif opcode == 79:  # callpropvoid
                        index = u30(coder)
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        obj = stack.pop()
                        if mname == u'reverse':
                            assert isinstance(obj, list)
                            obj.reverse()
                        else:
                            raise NotImplementedError(
                                u'Unsupported (void) property %r on %r'
                                % (mname, obj))
                    elif opcode == 93:  # findpropstrict
                        index = u30(coder)
                        mname = multinames[index]
                        res = extract_function(mname)
                        stack.append(res)
                    elif opcode == 97:  # setproperty
                        index = u30(coder)
                        value = stack.pop()
                        idx = stack.pop()
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        assert isinstance(idx, int)
                        obj[idx] = value
                    elif opcode == 98:  # getlocal
                        index = u30(coder)
                        stack.append(registers[index])
                    elif opcode == 99:  # setlocal
                        index = u30(coder)
                        value = stack.pop()
                        registers[index] = value
                    elif opcode == 102:  # getproperty
                        index = u30(coder)
                        pname = multinames[index]
                        if pname == u'length':
                            obj = stack.pop()
                            assert isinstance(obj, list)
                            stack.append(len(obj))
                        else:  # Assume attribute access
                            idx = stack.pop()
                            assert isinstance(idx, int)
                            obj = stack.pop()
                            assert isinstance(obj, list)
                            stack.append(obj[idx])
                    elif opcode == 128:  # coerce
                        u30(coder)
                    elif opcode == 133:  # coerce_s
                        assert isinstance(stack[-1], (type(None), compat_str))
                    elif opcode == 164:  # modulo
                        value2 = stack.pop()
                        value1 = stack.pop()
                        res = value1 % value2
                        stack.append(res)
                    elif opcode == 208:  # getlocal_0
                        stack.append(registers[0])
                    elif opcode == 209:  # getlocal_1
                        stack.append(registers[1])
                    elif opcode == 210:  # getlocal_2
                        stack.append(registers[2])
                    elif opcode == 211:  # getlocal_3
                        stack.append(registers[3])
                    elif opcode == 214:  # setlocal_2
                        registers[2] = stack.pop()
                    elif opcode == 215:  # setlocal_3
                        registers[3] = stack.pop()
                    else:
                        raise NotImplementedError(
                            u'Unsupported opcode %d' % opcode)

            method_pyfunctions[func_name] = resfunc
            return resfunc

        initial_function = extract_function(u'decipher')
        # The decipher method takes the signature string as its only arg.
        return lambda s: initial_function([s])
1045
1046 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
1047 """Turn the encrypted s field into a working signature"""
1048
1049 if player_url is not None:
1050 try:
1051 if player_url not in self._player_cache:
1052 func = self._extract_signature_function(
1053 video_id, player_url, len(s)
1054 )
1055 self._player_cache[player_url] = func
1056 func = self._player_cache[player_url]
1057 if self._downloader.params.get('youtube_print_sig_code'):
1058 self._print_sig_code(func, len(s))
1059 return func(s)
1060 except Exception:
1061 tb = traceback.format_exc()
1062 self._downloader.report_warning(
1063 u'Automatic signature extraction failed: ' + tb)
1064
1065 self._downloader.report_warning(
1066 u'Warning: Falling back to static signature algorithm')
1067 return self._static_decrypt_signature(
1068 s, video_id, player_url, age_gate)
1069
1070 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
1071 if age_gate:
1072 # The videos with age protection use another player, so the
1073 # algorithms can be different.
1074 if len(s) == 86:
1075 return s[2:63] + s[82] + s[64:82] + s[63]
1076
1077 if len(s) == 93:
1078 return s[86:29:-1] + s[88] + s[28:5:-1]
1079 elif len(s) == 92:
1080 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
1081 elif len(s) == 91:
1082 return s[84:27:-1] + s[86] + s[26:5:-1]
1083 elif len(s) == 90:
1084 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
1085 elif len(s) == 89:
1086 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
1087 elif len(s) == 88:
1088 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
1089 elif len(s) == 87:
1090 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
1091 elif len(s) == 86:
1092 return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53]
1093 elif len(s) == 85:
1094 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
1095 elif len(s) == 84:
1096 return s[81:36:-1] + s[0] + s[35:2:-1]
1097 elif len(s) == 83:
1098 return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]
1099 elif len(s) == 82:
1100 return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54]
1101 elif len(s) == 81:
1102 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1103 elif len(s) == 80:
1104 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
1105 elif len(s) == 79:
1106 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1107
1108 else:
1109 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
1110
1111 def _decrypt_signature_age_gate(self, s):
1112 # The videos with age protection use another player, so the algorithms
1113 # can be different.
1114 if len(s) == 86:
1115 return s[2:63] + s[82] + s[64:82] + s[63]
1116 else:
1117 # Fallback to the other algortihms
1118 return self._decrypt_signature(s)
1119
1120 def _get_available_subtitles(self, video_id):
1121 try:
1122 sub_list = self._download_webpage(
1123 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1124 video_id, note=False)
1125 except ExtractorError as err:
1126 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
1127 return {}
1128 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
1129
1130 sub_lang_list = {}
1131 for l in lang_list:
1132 lang = l[1]
1133 params = compat_urllib_parse.urlencode({
1134 'lang': lang,
1135 'v': video_id,
1136 'fmt': self._downloader.params.get('subtitlesformat'),
1137 })
1138 url = u'http://www.youtube.com/api/timedtext?' + params
1139 sub_lang_list[lang] = url
1140 if not sub_lang_list:
1141 self._downloader.report_warning(u'video doesn\'t have subtitles')
1142 return {}
1143 return sub_lang_list
1144
    def _get_available_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process.

        Returns a dict mapping target language code -> caption URL, or an
        empty dict (after a warning) when no automatic captions exist.
        """
        sub_format = self._downloader.params.get('subtitlesformat')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        # The caption URL lives in the embedded player configuration JSON.
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for %s' % video_id
        if mobj is None:
            self._downloader.report_warning(err_msg)
            return {}
        player_config = json.loads(mobj.group(1))
        try:
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            list_page = self._download_webpage(list_url, video_id)
            caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
            # The first <track> is the original language; only kind="asr"
            # (automatic speech recognition) tracks can be auto-translated.
            original_lang_node = caption_list.find('track')
            if original_lang_node.attrib.get('kind') != 'asr' :
                self._downloader.report_warning(u'Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']

            # Build one translated-caption URL per available <target> language.
            sub_lang_list = {}
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                    'tlang': sub_lang,
                    'fmt': sub_format,
                    'ts': timestamp,
                    'kind': 'asr',
                })
                sub_lang_list[sub_lang] = caption_url + '&' + params
            return sub_lang_list
        # An extractor error can be raised by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
1192
1193 def _print_formats(self, formats):
1194 print('Available formats:')
1195 for x in formats:
1196 print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
1197 self._video_dimensions.get(x, '???'),
1198 ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
1199
1200 def _extract_id(self, url):
1201 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1202 if mobj is None:
1203 raise ExtractorError(u'Invalid URL: %s' % url)
1204 video_id = mobj.group(2)
1205 return video_id
1206
    def _get_video_url_list(self, url_map):
        """
        Transform a dictionary in the format {itag:url} to a list of (itag, url)
        with the requested formats.

        Returns None (after printing) when the user only asked to list the
        formats; raises ExtractorError when nothing usable is available.
        """
        req_format = self._downloader.params.get('format', None)
        format_limit = self._downloader.params.get('format_limit', None)
        # Quality-ordered preference table; prefer_free_formats selects the
        # variant that ranks free containers first.
        available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
        if format_limit is not None and format_limit in available_formats:
            # Drop everything ranked better than the requested ceiling.
            format_list = available_formats[available_formats.index(format_limit):]
        else:
            format_list = available_formats
        # Keep only formats this video actually offers, best first.
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            raise ExtractorError(u'no known formats available for video')
        if self._downloader.params.get('listformats', None):
            self._print_formats(existing_formats)
            return
        if req_format is None or req_format == 'best':
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
        elif req_format in ('-1', 'all'):
            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
        else:
            # Specific formats. We pick the first in a slash-delimited sequence.
            # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
            # available in the specified format. For example,
            # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
            # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
            # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
            req_formats = req_format.split('/')
            video_url_list = None
            for rf in req_formats:
                if rf in url_map:
                    video_url_list = [(rf, url_map[rf])]
                    break
                if rf in self._video_formats_map:
                    # rf names a container ('mp4', 'flv', ...): try its itags
                    # in quality order.
                    for srf in self._video_formats_map[rf]:
                        if srf in url_map:
                            video_url_list = [(srf, url_map[srf])]
                            break
                    else:
                        # for/else: no itag of this container matched; move on
                        # to the next requested format.
                        continue
                    # Inner loop broke (match found): stop the outer search.
                    break
            if video_url_list is None:
                raise ExtractorError(u'requested format not available')
        return video_url_list
1255
1256 def _extract_from_m3u8(self, manifest_url, video_id):
1257 url_map = {}
1258 def _get_urls(_manifest):
1259 lines = _manifest.split('\n')
1260 urls = filter(lambda l: l and not l.startswith('#'),
1261 lines)
1262 return urls
1263 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1264 formats_urls = _get_urls(manifest)
1265 for format_url in formats_urls:
1266 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1267 url_map[itag] = format_url
1268 return url_map
1269
    def _real_extract(self, url):
        """Extract one result dict per selected format for a watch URL.

        Downloads the watch page and the get_video_info endpoint (with an
        alternate flow for age-gated videos), decrypts signatures when
        needed, and assembles metadata + download URLs.
        """
        if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
            self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')

        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL (the backslash-escaped JSON form
        # is unescaped with the re.sub below)
        mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        self.report_video_info_webpage_download(video_id)
        if re.search(r'player-age-gate-content">', video_webpage) is not None:
            self.report_age_confirmation()
            age_gate = True
            # We simulate the access to the video from www.youtube.com/v/{video_id}
            # this can be viewed without login into Youtube
            data = compat_urllib_parse.urlencode({'video_id': video_id,
                                                  'el': 'embedded',
                                                  'gl': 'US',
                                                  'hl': 'en',
                                                  'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                                                  'asv': 3,
                                                  'sts':'1588',
                                                  })
            video_info_url = 'https://www.youtube.com/get_video_info?' + data
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
        else:
            age_gate = False
            # Try several 'el' variants until one yields a token.
            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                        % (video_id, el_type))
                video_info_webpage = self._download_webpage(video_info_url, video_id,
                                        note=False,
                                        errnote='unable to download video info webpage')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
            else:
                raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            raise ExtractorError(u'Unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        # We try first to get a high quality image:
        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                            video_webpage, re.DOTALL)
        if m_thumb is not None:
            video_thumbnail = m_thumb.group(1)
        elif 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalize separators to spaces before parsing the date.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, video_webpage)

        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, video_webpage)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # Decide which formats to download

        try:
            mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
            if not mobj:
                raise ValueError('Could not find vevo ID')
            info = json.loads(mobj.group(1))
            args = info['args']
            # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
            # these signatures are encrypted
            m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
            if m_s is not None:
                self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
                video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
            m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
            if m_s is not None:
                # Merge the adaptive formats into the main stream map.
                if 'url_encoded_fmt_stream_map' in video_info:
                    video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
                else:
                    video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
            elif 'adaptive_fmts' in video_info:
                if 'url_encoded_fmt_stream_map' in video_info:
                    video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
                else:
                    video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']
        except ValueError:
            # Missing/unparseable player config: carry on with whatever
            # get_video_info already provided.
            pass

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
                raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
            url_map = {}
            for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    url = url_data['url'][0]
                    if 'sig' in url_data:
                        # Plain (unencrypted) signature.
                        url += '&signature=' + url_data['sig'][0]
                    elif 's' in url_data:
                        encrypted_sig = url_data['s'][0]
                        if self._downloader.params.get('verbose'):
                            if age_gate:
                                # NOTE(review): when player_url is None this
                                # passes None as the string to _search_regex -
                                # looks fragile; confirm _search_regex handles
                                # it.
                                player_version = self._search_regex(
                                    r'-(.+)\.swf$',
                                    player_url if player_url else None,
                                    'flash player', fatal=False)
                                player_desc = 'flash player %s' % player_version
                            else:
                                player_version = self._search_regex(
                                    r'html5player-(.+?)\.js', video_webpage,
                                    'html5 player', fatal=False)
                                player_desc = u'html5 player %s' % player_version

                            parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                            self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                                (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                        if not age_gate:
                            # Prefer the HTML5 player URL for signature
                            # decryption when available.
                            jsplayer_url_json = self._search_regex(
                                r'"assets":.+?"js":\s*("[^"]+")',
                                video_webpage, u'JS player URL')
                            player_url = json.loads(jsplayer_url_json)

                        signature = self._decrypt_signature(
                            encrypted_sig, video_id, player_url, age_gate)
                        url += '&signature=' + signature
                    if 'ratebypass' not in url:
                        url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url
            video_url_list = self._get_video_url_list(url_map)
            if not video_url_list:
                return
        elif video_info.get('hlsvp'):
            # Live/HLS streams: the formats come from an m3u8 manifest.
            manifest_url = video_info['hlsvp'][0]
            url_map = self._extract_from_m3u8(manifest_url, video_id)
            video_url_list = self._get_video_url_list(url_map)
            if not video_url_list:
                return

        else:
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
                                                 self._video_dimensions.get(format_param, '???'),
                                                 ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')

            results.append({
                'id': video_id,
                'url': video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
            })
        return results
1513
class YoutubePlaylistIE(InfoExtractor):
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Page through the GData playlist feed and return the playlist."""
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The playlist id may land in either capture group of _VALID_URL.
        playlist_id = match.group(1) or match.group(2)

        # Collect (position, watch URL) pairs across all API pages.
        indexed_videos = []
        for page_num in itertools.count(1):
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            if start_index >= 1000:
                self._downloader.report_warning(u'Max number of results reached')
                break
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            for entry in response['feed']['entry']:
                position = entry['yt$position']['$t']
                if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
                    indexed_videos.append((
                        position,
                        'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']
                    ))

        # Order by playlist position, then drop the index.
        ordered_urls = [pair[1] for pair in sorted(indexed_videos)]

        url_results = [self.url_result(vurl, 'Youtube') for vurl in ordered_urls]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1581
1582
class YoutubeChannelIE(InfoExtractor):
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from page, deduplicated, in order of
        first appearance."""
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = mobj.group(1)
            if video_id not in ids_in_page:
                ids_in_page.append(video_id)
        return ids_in_page

    def _real_extract(self, url):
        """Collect every video of a channel into one playlist result."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        channel_id = mobj.group(1)

        # The first page is plain HTML.
        pagenum = 1
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)
        video_ids = self.extract_videos_from_page(page)

        # Download any subsequent channel pages using the json-based
        # channel_ajax query.
        if self._MORE_PAGES_INDICATOR in page:
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = json.loads(self._download_webpage(
                    url, channel_id, u'Downloading page #%s' % pagenum))

                video_ids.extend(self.extract_videos_from_page(page['content_html']))

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
        return [self.playlist_result(url_entries, channel_id)]
1637
1638
class YoutubeUserIE(InfoExtractor):
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted by another youtube
        # extractor: this regex is too permissive and would match those too.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a playlist of all uploads of a user."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # The GData API caps each response (currently at 50 videos), so keep
        # requesting pages until one comes back short or empty.
        video_ids = []
        for page_idx in itertools.count(0):
            start_index = page_idx * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Total count is an exact multiple of the page size.
                break

            # Extract video identifiers from this page.
            page_ids = [entry['id']['$t'].split('/')[-1] for entry in response['feed']['entry']]
            video_ids.extend(page_ids)

            # A page shorter than the page size must be the last one, so we
            # can stop without issuing another query.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
1703
class YoutubeSearchIE(SearchInfoExtractor):
    IE_DESC = u'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        video_ids = []
        pagenum = 0
        limit = n

        # Each API page holds up to 50 results; stop once we have enough or
        # the service reports fewer total items.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50 * pagenum) + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            video_ids.extend(video['id'] for video in api_response['items'])

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # Trim any overshoot from the final page.
        del video_ids[n:]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % vid, 'Youtube') for vid in video_ids]
        return self.playlist_result(videos, query)
1745
1746
class YoutubeShowIE(InfoExtractor):
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        """Return one playlist result per season of the show."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        season_matches = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_matches)))
        return [
            self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
            for season in season_matches
        ]
1760
1761
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    _PAGING_STEP = 30
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        action = 'action_load_personal_feed' if self._PERSONAL_FEED else 'action_load_system_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Walk the paged feed and return all referenced videos."""
        feed_entries = []
        # The step argument is available only in 2.7 or higher
        for page_idx in itertools.count(0):
            paging = page_idx * self._PAGING_STEP
            info = json.loads(self._download_webpage(
                self._FEED_TEMPLATE % paging,
                u'%s feed' % self._FEED_NAME,
                u'Downloading page %s' % page_idx))
            id_matches = re.finditer(r'"/watch\?v=(.*?)["&]', info['feed_html'])
            for video_id in orderedSet(m.group(1) for m in id_matches):
                feed_entries.append(self.url_result(video_id, 'Youtube'))
            if info['paging'] is None:
                break
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1803
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Extractor for the authenticated user's subscriptions feed."""
    # Fixed missing space in the user-visible description ('keyword(requires').
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
1809
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Extractor for the authenticated user's recommended-videos feed."""
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
1815
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Extractor for the authenticated user's "Watch Later" list."""
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    # Larger page size than the base class default of 30
    _PAGING_STEP = 100
    # Watch-later is per-user, so the personal feed action must be used
    _PERSONAL_FEED = True
1823
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Resolve the favourites page to its underlying playlist."""
        # The favourites page embeds a regular playlist id; once it is known,
        # delegate the real work to the playlist extractor.
        webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, u'favourites playlist id')
        return self.url_result(favourites_playlist_id, 'YoutubePlaylist')