]> jfr.im git - yt-dlp.git/blob - youtube_dl/extractor/youtube.py
Merge pull request #1531 from rg3/no-playlist
[yt-dlp.git] / youtube_dl / extractor / youtube.py
1 # coding: utf-8
2
3 import collections
4 import errno
5 import io
6 import itertools
7 import json
8 import os.path
9 import re
10 import socket
11 import string
12 import struct
13 import traceback
14 import xml.etree.ElementTree
15 import zlib
16
17 from .common import InfoExtractor, SearchInfoExtractor
18 from .subtitles import SubtitlesInfoExtractor
19 from ..utils import (
20 compat_chr,
21 compat_http_client,
22 compat_parse_qs,
23 compat_urllib_error,
24 compat_urllib_parse,
25 compat_urllib_request,
26 compat_urlparse,
27 compat_str,
28
29 clean_html,
30 get_element_by_id,
31 ExtractorError,
32 unescapeHTML,
33 unified_strdate,
34 orderedSet,
35 write_json_file,
36 )
37
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def _set_language(self):
        """Request _LANG_URL so YouTube stores English/US preference cookies.

        Returns True on success, False (after a warning) on network errors.
        """
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return False
        return True

    def _login(self):
        """Log in to YouTube with the configured username/password.

        Returns True on success, False when no credentials are configured or
        the attempt fails.  Raises ExtractorError when _LOGIN_REQUIRED is set
        and no login info is available.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return False

        # Scrape the hidden anti-forgery tokens out of the login form.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)
        # NOTE(review): if either token is absent, galx/dsh stay None and the
        # .encode() call below would raise — confirm whether Google ever omits
        # these fields before hardening.

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'dsh': dsh,
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the response still contains the login form, the credentials
            # were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return False
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return False
        return True

    def _confirm_age(self):
        """Submit the age-verification form.

        Returns True on success; raises ExtractorError on network failure.
        """
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
        return True

    def _real_initialize(self):
        # Set language, log in (when credentials are available) and confirm
        # age before any extraction takes place.  Language failure aborts the
        # rest; a failed/skipped login also skips age confirmation.
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
145
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    # Matches watch pages, embeds, youtu.be short links and bare 11-char IDs;
    # group 1 captures everything before the ID, group 2 is the video ID.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)?    # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/                                          # just youtu.be/xxxx
                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Extracts the next_url query parameter from verify_age-style URLs.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # itags, listed in order of quality (best first)
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
                          # Apple HTTP Live Streaming
                          '96', '95', '94', '93', '92', '132', '151',
                          # 3D
                          '85', '84', '102', '83', '101', '82', '100',
                          # Dash video
                          '138', '137', '248', '136', '247', '135', '246',
                          '245', '244', '134', '243', '133', '242', '160',
                          # Dash audio
                          '141', '172', '140', '171', '139',
                          ]
    # Same itags, but free (WebM) formats ranked above non-free ones of equal quality
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
                                      # Apple HTTP Live Streaming
                                      '96', '95', '94', '93', '92', '132', '151',
                                      # 3D
                                      '85', '102', '84', '101', '83', '100', '82',
                                      # Dash video
                                      '138', '248', '137', '247', '136', '246', '245',
                                      '244', '135', '243', '134', '242', '133', '160',
                                      # Dash audio
                                      '172', '141', '171', '140', '139',
                                      ]
    # container extension -> itags using it, best quality first
    _video_formats_map = {
        'flv': ['35', '34', '6', '5'],
        '3gp': ['36', '17', '13'],
        'mp4': ['38', '37', '22', '18'],
        'webm': ['46', '45', '44', '43'],
    }
    # itag -> container extension
    _video_extensions = {
        '13': '3gp',
        '17': '3gp',
        '18': 'mp4',
        '22': 'mp4',
        '36': '3gp',
        '37': 'mp4',
        '38': 'mp4',
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',

        # 3d videos
        '82': 'mp4',
        '83': 'mp4',
        '84': 'mp4',
        '85': 'mp4',
        '100': 'webm',
        '101': 'webm',
        '102': 'webm',

        # Apple HTTP Live Streaming
        '92': 'mp4',
        '93': 'mp4',
        '94': 'mp4',
        '95': 'mp4',
        '96': 'mp4',
        '132': 'mp4',
        '151': 'mp4',

        # Dash mp4
        '133': 'mp4',
        '134': 'mp4',
        '135': 'mp4',
        '136': 'mp4',
        '137': 'mp4',
        '138': 'mp4',
        '139': 'mp4',
        '140': 'mp4',
        '141': 'mp4',
        '160': 'mp4',

        # Dash webm
        '171': 'webm',
        '172': 'webm',
        '242': 'webm',
        '243': 'webm',
        '244': 'webm',
        '245': 'webm',
        '246': 'webm',
        '247': 'webm',
        '248': 'webm',
    }
    # itag -> display label: "HxW" for legacy formats, vertical resolution
    # (e.g. '720p') for newer ones, bitrate (e.g. '128k') for DASH audio
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '36': '240x320',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
        '82': '360p',
        '83': '480p',
        '84': '720p',
        '85': '1080p',
        '92': '240p',
        '93': '360p',
        '94': '480p',
        '95': '720p',
        '96': '1080p',
        '100': '360p',
        '101': '480p',
        '102': '720p',
        '132': '240p',
        '151': '72p',
        '133': '240p',
        '134': '360p',
        '135': '480p',
        '136': '720p',
        '137': '1080p',
        '138': '>1080p',
        '139': '48k',
        '140': '128k',
        '141': '256k',
        '160': '192p',
        '171': '128k',
        '172': '256k',
        '242': '240p',
        '243': '360p',
        '244': '480p',
        '245': '480p',
        '246': '480p',
        '247': '720p',
        '248': '1080p',
    }
    # itag -> extra annotation shown in format listings
    _special_itags = {
        '82': '3D',
        '83': '3D',
        '84': '3D',
        '85': '3D',
        '100': '3D',
        '101': '3D',
        '102': '3D',
        '133': 'DASH Video',
        '134': 'DASH Video',
        '135': 'DASH Video',
        '136': 'DASH Video',
        '137': 'DASH Video',
        '138': 'DASH Video',
        '139': 'DASH Audio',
        '140': 'DASH Audio',
        '141': 'DASH Audio',
        '160': 'DASH Video',
        '171': 'DASH Audio',
        '172': 'DASH Audio',
        '242': 'DASH Video',
        '243': 'DASH Video',
        '244': 'DASH Video',
        '245': 'DASH Video',
        '246': 'DASH Video',
        '247': 'DASH Video',
        '248': 'DASH Video',
    }

    IE_NAME = u'youtube'
    _TESTS = [
        {
            u"url":  u"http://www.youtube.com/watch?v=BaW_jenozKc",
            u"file":  u"BaW_jenozKc.mp4",
            u"info_dict": {
                u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
                u"uploader": u"Philipp Hagemeister",
                u"uploader_id": u"phihag",
                u"upload_date": u"20121002",
                u"description": u"test chars:  \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
            }
        },
        {
            u"url":  u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
            u"file":  u"1ltcDfZMA3U.flv",
            u"note": u"Test VEVO video (#897)",
            u"info_dict": {
                u"upload_date": u"20070518",
                u"title": u"Maps - It Will Find You",
                u"description": u"Music video by Maps performing It Will Find You.",
                u"uploader": u"MuteUSA",
                u"uploader_id": u"MuteUSA"
            }
        },
        {
            u"url":  u"http://www.youtube.com/watch?v=UxxajLWwzqY",
            u"file":  u"UxxajLWwzqY.mp4",
            u"note": u"Test generic use_cipher_signature video (#897)",
            u"info_dict": {
                u"upload_date": u"20120506",
                u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
                u"description": u"md5:5b292926389560516e384ac437c0ec07",
                u"uploader": u"Icona Pop",
                u"uploader_id": u"IconaPop"
            }
        },
        {
            u"url":  u"https://www.youtube.com/watch?v=07FYdnEawAQ",
            u"file":  u"07FYdnEawAQ.mp4",
            u"note": u"Test VEVO video with age protection (#956)",
            u"info_dict": {
                u"upload_date": u"20130703",
                u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
                u"description": u"md5:64249768eec3bc4276236606ea996373",
                u"uploader": u"justintimberlakeVEVO",
                u"uploader_id": u"justintimberlakeVEVO"
            }
        },
    ]
383
384
385 @classmethod
386 def suitable(cls, url):
387 """Receives a URL and returns True if suitable for this IE."""
388 if YoutubePlaylistIE.suitable(url): return False
389 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
390
    def __init__(self, *args, **kwargs):
        super(YoutubeIE, self).__init__(*args, **kwargs)
        # Maps player_url -> extracted signature-decryption function, so each
        # player script is downloaded and analyzed at most once per run.
        self._player_cache = {}
394
    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)
398
    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)
402
    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)
406
    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))
410
    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')
414
415 def _extract_signature_function(self, video_id, player_url, slen):
416 id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
417 player_url)
418 player_type = id_m.group('ext')
419 player_id = id_m.group('id')
420
421 # Read from filesystem cache
422 func_id = '%s_%s_%d' % (player_type, player_id, slen)
423 assert os.path.basename(func_id) == func_id
424 cache_dir = self._downloader.params.get('cachedir',
425 u'~/.youtube-dl/cache')
426
427 cache_enabled = cache_dir is not None
428 if cache_enabled:
429 cache_fn = os.path.join(os.path.expanduser(cache_dir),
430 u'youtube-sigfuncs',
431 func_id + '.json')
432 try:
433 with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
434 cache_spec = json.load(cachef)
435 return lambda s: u''.join(s[i] for i in cache_spec)
436 except IOError:
437 pass # No cache available
438
439 if player_type == 'js':
440 code = self._download_webpage(
441 player_url, video_id,
442 note=u'Downloading %s player %s' % (player_type, player_id),
443 errnote=u'Download of %s failed' % player_url)
444 res = self._parse_sig_js(code)
445 elif player_type == 'swf':
446 urlh = self._request_webpage(
447 player_url, video_id,
448 note=u'Downloading %s player %s' % (player_type, player_id),
449 errnote=u'Download of %s failed' % player_url)
450 code = urlh.read()
451 res = self._parse_sig_swf(code)
452 else:
453 assert False, 'Invalid player type %r' % player_type
454
455 if cache_enabled:
456 try:
457 test_string = u''.join(map(compat_chr, range(slen)))
458 cache_res = res(test_string)
459 cache_spec = [ord(c) for c in cache_res]
460 try:
461 os.makedirs(os.path.dirname(cache_fn))
462 except OSError as ose:
463 if ose.errno != errno.EEXIST:
464 raise
465 write_json_file(cache_spec, cache_fn)
466 except Exception:
467 tb = traceback.format_exc()
468 self._downloader.report_warning(
469 u'Writing cache to %r failed: %s' % (cache_fn, tb))
470
471 return res
472
    def _print_sig_code(self, func, slen):
        """Print Python source equivalent to the extracted signature function.

        Runs *func* on a probe string of length *slen*, records the index
        permutation it performs and prints it back as compact s[...] slice
        expressions (used by the youtube_print_sig_code option).
        """
        def gen_sig_code(idxs):
            # Yield slice/index expressions covering the index list, merging
            # consecutive runs with step +1/-1 into a single slice.
            def _genslice(start, end, step):
                starts = u'' if start == 0 else str(start)
                ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
                steps = u'' if step == 1 else (u':%d' % step)
                return u's[%s%s%s]' % (starts, ends, steps)

            step = None
            start = '(Never used)'  # Quelch pyflakes warnings - start will be
                                    # set as soon as step is set
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    # Currently inside a run: extend it or close it out.
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # Start of a new ascending/descending run.
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield u's[%d]' % prev
            # Flush the final element or the still-open run.
            if step is None:
                yield u's[%d]' % i
            else:
                yield _genslice(start, i, step)

        test_string = u''.join(map(compat_chr, range(slen)))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = u' + '.join(gen_sig_code(cache_spec))
        code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
        self.to_screen(u'Extracted signature function:\n' + code)
508
    def _parse_sig_js(self, jscode):
        """Extract the signature-decryption function from a JS player.

        Locates the function assigned via ``signature=`` and interprets the
        restricted JavaScript subset it is written in, returning a Python
        callable mapping an encrypted signature string to its decrypted form.
        """
        funcname = self._search_regex(
            r'signature=([a-zA-Z]+)', jscode,
            u'Initial JS player signature function name')

        # Cache of already-interpreted JS functions, keyed by name.
        functions = {}

        def argidx(varname):
            # Map a single-letter name to its alphabet position.
            # NOTE(review): string.lowercase is Python 2 only (Python 3 uses
            # string.ascii_lowercase); also appears unused within this method.
            return string.lowercase.index(varname)

        def interpret_statement(stmt, local_vars, allow_recursion=20):
            # Execute one JS statement (assignment or return) and return its
            # value; allow_recursion bounds nested expression evaluation.
            if allow_recursion < 0:
                raise ExtractorError(u'Recursion limit reached')

            if stmt.startswith(u'var '):
                stmt = stmt[len(u'var '):]
            ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
                             r'=(?P<expr>.*)$', stmt)
            if ass_m:
                if ass_m.groupdict().get('index'):
                    # Indexed assignment: out[index] = expr
                    def assign(val):
                        lvar = local_vars[ass_m.group('out')]
                        idx = interpret_expression(ass_m.group('index'),
                                                   local_vars, allow_recursion)
                        assert isinstance(idx, int)
                        lvar[idx] = val
                        return val
                    expr = ass_m.group('expr')
                else:
                    # Plain assignment: out = expr
                    def assign(val):
                        local_vars[ass_m.group('out')] = val
                        return val
                    expr = ass_m.group('expr')
            elif stmt.startswith(u'return '):
                assign = lambda v: v
                expr = stmt[len(u'return '):]
            else:
                raise ExtractorError(
                    u'Cannot determine left side of statement in %r' % stmt)

            v = interpret_expression(expr, local_vars, allow_recursion)
            return assign(v)

        def interpret_expression(expr, local_vars, allow_recursion):
            # Evaluate a JS expression: literals, variables, member access,
            # indexing, the % operator and calls to other player functions.
            if expr.isdigit():
                return int(expr)

            if expr.isalpha():
                return local_vars[expr]

            m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
            if m:
                # Member access: only the handful of members the player
                # actually uses are supported.
                member = m.group('member')
                val = local_vars[m.group('in')]
                if member == 'split("")':
                    return list(val)
                if member == 'join("")':
                    return u''.join(val)
                if member == 'length':
                    return len(val)
                if member == 'reverse()':
                    return val[::-1]
                slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
                if slice_m:
                    idx = interpret_expression(
                        slice_m.group('idx'), local_vars, allow_recursion-1)
                    return val[idx:]

            m = re.match(
                r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
            if m:
                # Indexing: in[idx]
                val = local_vars[m.group('in')]
                idx = interpret_expression(m.group('idx'), local_vars,
                                           allow_recursion-1)
                return val[idx]

            m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
            if m:
                # Binary modulo: a % b
                a = interpret_expression(m.group('a'),
                                         local_vars, allow_recursion)
                b = interpret_expression(m.group('b'),
                                         local_vars, allow_recursion)
                return a % b

            m = re.match(
                r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
            if m:
                # Function call: extract and interpret the callee on demand.
                fname = m.group('func')
                if fname not in functions:
                    functions[fname] = extract_function(fname)
                argvals = [int(v) if v.isdigit() else local_vars[v]
                           for v in m.group('args').split(',')]
                return functions[fname](argvals)
            raise ExtractorError(u'Unsupported JS expression %r' % expr)

        def extract_function(funcname):
            # Find the named function's source in jscode and wrap it in a
            # Python callable that interprets its statements one by one.
            func_m = re.search(
                r'function ' + re.escape(funcname) +
                r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
                jscode)
            argnames = func_m.group('args').split(',')

            def resf(args):
                local_vars = dict(zip(argnames, args))
                for stmt in func_m.group('code').split(';'):
                    res = interpret_statement(stmt, local_vars)
                # The value of the last statement (the return) is the result.
                return res
            return resf

        initial_function = extract_function(funcname)
        return lambda s: initial_function([s])
620
    def _parse_sig_swf(self, file_contents):
        """Extract the signature-decryption function from an SWF player.

        Parses just enough of the SWF container and the embedded ABC (AVM2
        ActionScript bytecode) to locate the SignatureDecipher class, then
        returns a Python callable that interprets its 'decipher' method.
        """
        if file_contents[1:3] != b'WS':
            raise ExtractorError(
                u'Not an SWF file; header is %r' % file_contents[:3])
        if file_contents[:1] == b'C':
            # 'CWS' signature: body after the 8-byte header is zlib-compressed.
            content = zlib.decompress(file_contents[8:])
        else:
            raise NotImplementedError(u'Unsupported compression format %r' %
                                      file_contents[:1])

        def extract_tags(content):
            # Iterate over (tag_code, tag_body) pairs of the SWF tag stream.
            pos = 0
            while pos < len(content):
                header16 = struct.unpack('<H', content[pos:pos+2])[0]
                pos += 2
                tag_code = header16 >> 6
                tag_len = header16 & 0x3f
                if tag_len == 0x3f:
                    # Long-form tag: the real length follows in 4 bytes.
                    tag_len = struct.unpack('<I', content[pos:pos+4])[0]
                    pos += 4
                assert pos+tag_len <= len(content)
                yield (tag_code, content[pos:pos+tag_len])
                pos += tag_len

        # Tag code 82 (DoABC) carries the ActionScript bytecode; skip the
        # flags and NUL-terminated name that precede the ABC data.
        code_tag = next(tag
                        for tag_code, tag in extract_tags(content)
                        if tag_code == 82)
        p = code_tag.index(b'\0', 4) + 1
        code_reader = io.BytesIO(code_tag[p:])

        # Parse ABC (AVM2 ByteCode)
        def read_int(reader=None):
            # Variable-length integer: 7 data bits per byte, high bit set
            # means another byte follows (at most 5 bytes).
            if reader is None:
                reader = code_reader
            res = 0
            shift = 0
            for _ in range(5):
                buf = reader.read(1)
                assert len(buf) == 1
                b = struct.unpack('<B', buf)[0]
                res = res | ((b & 0x7f) << shift)
                if b & 0x80 == 0:
                    break
                shift += 7
            return res

        def u30(reader=None):
            # 30-bit unsigned variable-length integer.
            res = read_int(reader)
            assert res & 0xf0000000 == 0
            return res
        u32 = read_int

        def s32(reader=None):
            # 32-bit signed variable-length integer (two's complement).
            v = read_int(reader)
            if v & 0x80000000 != 0:
                v = - ((v ^ 0xffffffff) + 1)
            return v

        def read_string(reader=None):
            # Length-prefixed UTF-8 string.
            if reader is None:
                reader = code_reader
            slen = u30(reader)
            resb = reader.read(slen)
            assert len(resb) == slen
            return resb.decode('utf-8')

        def read_bytes(count, reader=None):
            if reader is None:
                reader = code_reader
            resb = reader.read(count)
            assert len(resb) == count
            return resb

        def read_byte(reader=None):
            resb = read_bytes(1, reader=reader)
            res = struct.unpack('<B', resb)[0]
            return res

        # minor_version + major_version
        read_bytes(2 + 2)

        # Constant pool: the counts are one greater than the number of
        # entries actually stored, hence the range(1, ...) loops.
        int_count = u30()
        for _c in range(1, int_count):
            s32()
        uint_count = u30()
        for _c in range(1, uint_count):
            u32()
        double_count = u30()
        read_bytes((double_count-1) * 8)
        string_count = u30()
        constant_strings = [u'']
        for _c in range(1, string_count):
            s = read_string()
            constant_strings.append(s)
        namespace_count = u30()
        for _c in range(1, namespace_count):
            read_bytes(1)  # kind
            u30()  # name
        ns_set_count = u30()
        for _c in range(1, ns_set_count):
            count = u30()
            for _c2 in range(count):
                u30()
        multiname_count = u30()
        # Number of u30 fields each multiname kind carries after its kind byte.
        MULTINAME_SIZES = {
            0x07: 2,  # QName
            0x0d: 2,  # QNameA
            0x0f: 1,  # RTQName
            0x10: 1,  # RTQNameA
            0x11: 0,  # RTQNameL
            0x12: 0,  # RTQNameLA
            0x09: 2,  # Multiname
            0x0e: 2,  # MultinameA
            0x1b: 1,  # MultinameL
            0x1c: 1,  # MultinameLA
        }
        multinames = [u'']
        for _c in range(1, multiname_count):
            kind = u30()
            assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
            if kind == 0x07:
                # Only QNames are resolved to their string name; other kinds
                # are recorded as placeholders and their fields skipped.
                u30()  # namespace_idx
                name_idx = u30()
                multinames.append(constant_strings[name_idx])
            else:
                multinames.append('[MULTINAME kind: %d]' % kind)
                for _c2 in range(MULTINAME_SIZES[kind]):
                    u30()

        # Methods
        method_count = u30()
        MethodInfo = collections.namedtuple(
            'MethodInfo',
            ['NEED_ARGUMENTS', 'NEED_REST'])
        method_infos = []
        for method_id in range(method_count):
            param_count = u30()
            u30()  # return type
            for _ in range(param_count):
                u30()  # param type
            u30()  # name index (always 0 for youtube)
            flags = read_byte()
            if flags & 0x08 != 0:
                # Options present
                option_count = u30()
                for c in range(option_count):
                    u30()  # val
                    read_bytes(1)  # kind
            if flags & 0x80 != 0:
                # Param names present
                for _ in range(param_count):
                    u30()  # param name
            mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
            method_infos.append(mi)

        # Metadata
        metadata_count = u30()
        for _c in range(metadata_count):
            u30()  # name
            item_count = u30()
            for _c2 in range(item_count):
                u30()  # key
                u30()  # value

        def parse_traits_info():
            # Consume one trait record; returns a dict of method traits
            # (trait name -> method index) found in it.
            trait_name_idx = u30()
            kind_full = read_byte()
            kind = kind_full & 0x0f
            attrs = kind_full >> 4
            methods = {}
            if kind in [0x00, 0x06]:  # Slot or Const
                u30()  # Slot id
                u30()  # type_name_idx
                vindex = u30()
                if vindex != 0:
                    read_byte()  # vkind
            elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
                u30()  # disp_id
                method_idx = u30()
                methods[multinames[trait_name_idx]] = method_idx
            elif kind == 0x04:  # Class
                u30()  # slot_id
                u30()  # classi
            elif kind == 0x05:  # Function
                u30()  # slot_id
                function_idx = u30()
                # NOTE(review): key/value order here is the reverse of the
                # method branch above (idx -> name rather than name -> idx);
                # looks suspicious — confirm YouTube players never emit
                # function traits before changing.
                methods[function_idx] = multinames[trait_name_idx]
            else:
                raise ExtractorError(u'Unsupported trait kind %d' % kind)

            if attrs & 0x4 != 0:  # Metadata present
                metadata_count = u30()
                for _c3 in range(metadata_count):
                    u30()  # metadata index

            return methods

        # Classes
        TARGET_CLASSNAME = u'SignatureDecipher'
        searched_idx = multinames.index(TARGET_CLASSNAME)
        searched_class_id = None
        class_count = u30()
        for class_id in range(class_count):
            name_idx = u30()
            if name_idx == searched_idx:
                # We found the class we're looking for!
                searched_class_id = class_id
            u30()  # super_name idx
            flags = read_byte()
            if flags & 0x08 != 0:  # Protected namespace is present
                u30()  # protected_ns_idx
            intrf_count = u30()
            for _c2 in range(intrf_count):
                u30()
            u30()  # iinit
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        if searched_class_id is None:
            raise ExtractorError(u'Target class %r not found' %
                                 TARGET_CLASSNAME)

        # Class (static) trait pass: collect method names/indices belonging
        # to the target class, mapped in both directions.
        method_names = {}
        method_idxs = {}
        for class_id in range(class_count):
            u30()  # cinit
            trait_count = u30()
            for _c2 in range(trait_count):
                trait_methods = parse_traits_info()
                if class_id == searched_class_id:
                    method_names.update(trait_methods.items())
                    method_idxs.update(dict(
                        (idx, name)
                        for name, idx in trait_methods.items()))

        # Scripts
        script_count = u30()
        for _c in range(script_count):
            u30()  # init
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        # Method bodies: keep the bytecode of the methods we care about.
        method_body_count = u30()
        Method = collections.namedtuple('Method', ['code', 'local_count'])
        methods = {}
        for _c in range(method_body_count):
            method_idx = u30()
            u30()  # max_stack
            local_count = u30()
            u30()  # init_scope_depth
            u30()  # max_scope_depth
            code_length = u30()
            code = read_bytes(code_length)
            if method_idx in method_idxs:
                m = Method(code, local_count)
                methods[method_idxs[method_idx]] = m
            exception_count = u30()
            for _c2 in range(exception_count):
                u30()  # from
                u30()  # to
                u30()  # target
                u30()  # exc_type
                u30()  # var_name
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        assert p + code_reader.tell() == len(code_tag)
        assert len(methods) == len(method_idxs)

        method_pyfunctions = {}

        def extract_function(func_name):
            # Return (and memoize) a Python callable interpreting the named
            # AVM2 method.
            if func_name in method_pyfunctions:
                return method_pyfunctions[func_name]
            if func_name not in methods:
                raise ExtractorError(u'Cannot find function %r' % func_name)
            m = methods[func_name]

            def resfunc(args):
                # Minimal AVM2 stack-machine interpreter: only the opcodes
                # YouTube's decipher routine actually uses are implemented.
                registers = ['(this)'] + list(args) + [None] * m.local_count
                stack = []
                coder = io.BytesIO(m.code)
                while True:
                    opcode = struct.unpack('!B', coder.read(1))[0]
                    if opcode == 36:  # pushbyte
                        v = struct.unpack('!B', coder.read(1))[0]
                        stack.append(v)
                    elif opcode == 44:  # pushstring
                        idx = u30(coder)
                        stack.append(constant_strings[idx])
                    elif opcode == 48:  # pushscope
                        # We don't implement the scope register, so we'll just
                        # ignore the popped value
                        stack.pop()
                    elif opcode == 70:  # callproperty
                        index = u30(coder)
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        obj = stack.pop()
                        if mname == u'split':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, compat_str)
                            if args[0] == u'':
                                res = list(obj)
                            else:
                                res = obj.split(args[0])
                            stack.append(res)
                        elif mname == u'slice':
                            assert len(args) == 1
                            assert isinstance(args[0], int)
                            assert isinstance(obj, list)
                            res = obj[args[0]:]
                            stack.append(res)
                        elif mname == u'join':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, list)
                            res = args[0].join(obj)
                            stack.append(res)
                        elif mname in method_pyfunctions:
                            stack.append(method_pyfunctions[mname](args))
                        else:
                            raise NotImplementedError(
                                u'Unsupported property %r on %r'
                                % (mname, obj))
                    elif opcode == 72:  # returnvalue
                        res = stack.pop()
                        return res
                    elif opcode == 79:  # callpropvoid
                        index = u30(coder)
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        obj = stack.pop()
                        if mname == u'reverse':
                            assert isinstance(obj, list)
                            obj.reverse()
                        else:
                            raise NotImplementedError(
                                u'Unsupported (void) property %r on %r'
                                % (mname, obj))
                    elif opcode == 93:  # findpropstrict
                        index = u30(coder)
                        mname = multinames[index]
                        res = extract_function(mname)
                        stack.append(res)
                    elif opcode == 97:  # setproperty
                        index = u30(coder)
                        value = stack.pop()
                        idx = stack.pop()
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        assert isinstance(idx, int)
                        obj[idx] = value
                    elif opcode == 98:  # getlocal
                        index = u30(coder)
                        stack.append(registers[index])
                    elif opcode == 99:  # setlocal
                        index = u30(coder)
                        value = stack.pop()
                        registers[index] = value
                    elif opcode == 102:  # getproperty
                        index = u30(coder)
                        pname = multinames[index]
                        if pname == u'length':
                            obj = stack.pop()
                            assert isinstance(obj, list)
                            stack.append(len(obj))
                        else:  # Assume attribute access
                            idx = stack.pop()
                            assert isinstance(idx, int)
                            obj = stack.pop()
                            assert isinstance(obj, list)
                            stack.append(obj[idx])
                    elif opcode == 128:  # coerce
                        u30(coder)
                    elif opcode == 133:  # coerce_s
                        assert isinstance(stack[-1], (type(None), compat_str))
                    elif opcode == 164:  # modulo
                        value2 = stack.pop()
                        value1 = stack.pop()
                        res = value1 % value2
                        stack.append(res)
                    elif opcode == 208:  # getlocal_0
                        stack.append(registers[0])
                    elif opcode == 209:  # getlocal_1
                        stack.append(registers[1])
                    elif opcode == 210:  # getlocal_2
                        stack.append(registers[2])
                    elif opcode == 211:  # getlocal_3
                        stack.append(registers[3])
                    elif opcode == 214:  # setlocal_2
                        registers[2] = stack.pop()
                    elif opcode == 215:  # setlocal_3
                        registers[3] = stack.pop()
                    else:
                        raise NotImplementedError(
                            u'Unsupported opcode %d' % opcode)

            method_pyfunctions[func_name] = resfunc
            return resfunc

        initial_function = extract_function(u'decipher')
        return lambda s: initial_function([s])
1034
1035 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
1036 """Turn the encrypted s field into a working signature"""
1037
1038 if player_url is not None:
1039 try:
1040 if player_url not in self._player_cache:
1041 func = self._extract_signature_function(
1042 video_id, player_url, len(s)
1043 )
1044 self._player_cache[player_url] = func
1045 func = self._player_cache[player_url]
1046 if self._downloader.params.get('youtube_print_sig_code'):
1047 self._print_sig_code(func, len(s))
1048 return func(s)
1049 except Exception:
1050 tb = traceback.format_exc()
1051 self._downloader.report_warning(
1052 u'Automatic signature extraction failed: ' + tb)
1053
1054 self._downloader.report_warning(
1055 u'Warning: Falling back to static signature algorithm')
1056
1057 return self._static_decrypt_signature(
1058 s, video_id, player_url, age_gate)
1059
1060 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
1061 if age_gate:
1062 # The videos with age protection use another player, so the
1063 # algorithms can be different.
1064 if len(s) == 86:
1065 return s[2:63] + s[82] + s[64:82] + s[63]
1066
1067 if len(s) == 93:
1068 return s[86:29:-1] + s[88] + s[28:5:-1]
1069 elif len(s) == 92:
1070 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
1071 elif len(s) == 91:
1072 return s[84:27:-1] + s[86] + s[26:5:-1]
1073 elif len(s) == 90:
1074 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
1075 elif len(s) == 89:
1076 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
1077 elif len(s) == 88:
1078 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
1079 elif len(s) == 87:
1080 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
1081 elif len(s) == 86:
1082 return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
1083 elif len(s) == 85:
1084 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
1085 elif len(s) == 84:
1086 return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
1087 elif len(s) == 83:
1088 return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
1089 elif len(s) == 82:
1090 return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54]
1091 elif len(s) == 81:
1092 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1093 elif len(s) == 80:
1094 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
1095 elif len(s) == 79:
1096 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1097
1098 else:
1099 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
1100
1101 def _get_available_subtitles(self, video_id):
1102 try:
1103 sub_list = self._download_webpage(
1104 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1105 video_id, note=False)
1106 except ExtractorError as err:
1107 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
1108 return {}
1109 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
1110
1111 sub_lang_list = {}
1112 for l in lang_list:
1113 lang = l[1]
1114 params = compat_urllib_parse.urlencode({
1115 'lang': lang,
1116 'v': video_id,
1117 'fmt': self._downloader.params.get('subtitlesformat'),
1118 })
1119 url = u'http://www.youtube.com/api/timedtext?' + params
1120 sub_lang_list[lang] = url
1121 if not sub_lang_list:
1122 self._downloader.report_warning(u'video doesn\'t have subtitles')
1123 return {}
1124 return sub_lang_list
1125
1126 def _get_available_automatic_caption(self, video_id, webpage):
1127 """We need the webpage for getting the captions url, pass it as an
1128 argument to speed up the process."""
1129 sub_format = self._downloader.params.get('subtitlesformat')
1130 self.to_screen(u'%s: Looking for automatic captions' % video_id)
1131 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
1132 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
1133 if mobj is None:
1134 self._downloader.report_warning(err_msg)
1135 return {}
1136 player_config = json.loads(mobj.group(1))
1137 try:
1138 args = player_config[u'args']
1139 caption_url = args[u'ttsurl']
1140 timestamp = args[u'timestamp']
1141 # We get the available subtitles
1142 list_params = compat_urllib_parse.urlencode({
1143 'type': 'list',
1144 'tlangs': 1,
1145 'asrs': 1,
1146 })
1147 list_url = caption_url + '&' + list_params
1148 list_page = self._download_webpage(list_url, video_id)
1149 caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
1150 original_lang_node = caption_list.find('track')
1151 if original_lang_node.attrib.get('kind') != 'asr' :
1152 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
1153 return {}
1154 original_lang = original_lang_node.attrib['lang_code']
1155
1156 sub_lang_list = {}
1157 for lang_node in caption_list.findall('target'):
1158 sub_lang = lang_node.attrib['lang_code']
1159 params = compat_urllib_parse.urlencode({
1160 'lang': original_lang,
1161 'tlang': sub_lang,
1162 'fmt': sub_format,
1163 'ts': timestamp,
1164 'kind': 'asr',
1165 })
1166 sub_lang_list[sub_lang] = caption_url + '&' + params
1167 return sub_lang_list
1168 # An extractor error can be raise by the download process if there are
1169 # no automatic captions but there are subtitles
1170 except (KeyError, ExtractorError):
1171 self._downloader.report_warning(err_msg)
1172 return {}
1173
1174 def _print_formats(self, formats):
1175 print('Available formats:')
1176 for x in formats:
1177 print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
1178 self._video_dimensions.get(x, '???'),
1179 ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
1180
1181 def _extract_id(self, url):
1182 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1183 if mobj is None:
1184 raise ExtractorError(u'Invalid URL: %s' % url)
1185 video_id = mobj.group(2)
1186 return video_id
1187
    def _get_video_url_list(self, url_map):
        """
        Transform a dictionary in the format {itag:url} to a list of (itag, url)
        with the requested formats.

        Honors the 'format', 'format_limit' and 'prefer_free_formats'
        downloader params. Returns None (after printing) when only
        'listformats' was requested; raises ExtractorError when no
        known/requested format is available.
        """
        req_format = self._downloader.params.get('format', None)
        format_limit = self._downloader.params.get('format_limit', None)
        available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
        # format_limit caps quality: keep only formats at or below it
        # (the lists are assumed ordered best-first — see usage below).
        if format_limit is not None and format_limit in available_formats:
            format_list = available_formats[available_formats.index(format_limit):]
        else:
            format_list = available_formats
        # Intersect with what this particular video actually offers.
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            raise ExtractorError(u'no known formats available for video')
        if self._downloader.params.get('listformats', None):
            self._print_formats(existing_formats)
            return
        if req_format is None or req_format == 'best':
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
        elif req_format in ('-1', 'all'):
            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
        else:
            # Specific formats. We pick the first in a slash-delimited sequence.
            # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
            # available in the specified format. For example,
            # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
            # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
            # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
            req_formats = req_format.split('/')
            video_url_list = None
            for rf in req_formats:
                if rf in url_map:
                    video_url_list = [(rf, url_map[rf])]
                    break
                # rf may name a container (e.g. 'mp4'); scan its itags.
                if rf in self._video_formats_map:
                    for srf in self._video_formats_map[rf]:
                        if srf in url_map:
                            video_url_list = [(srf, url_map[srf])]
                            break
                    else:
                        # No itag of this container available: try the
                        # next requested format (for-else = no break).
                        continue
                    # Inner break fired: a match was found, stop scanning.
                    break
            if video_url_list is None:
                raise ExtractorError(u'requested format not available')
        return video_url_list
1236
1237 def _extract_from_m3u8(self, manifest_url, video_id):
1238 url_map = {}
1239 def _get_urls(_manifest):
1240 lines = _manifest.split('\n')
1241 urls = filter(lambda l: l and not l.startswith('#'),
1242 lines)
1243 return urls
1244 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1245 formats_urls = _get_urls(manifest)
1246 for format_url in formats_urls:
1247 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1248 url_map[itag] = format_url
1249 return url_map
1250
    def _real_extract(self, url):
        """Extract metadata and downloadable format URLs for one video.

        Downloads the watch page and get_video_info, decrypts signatures
        when needed, and returns a list of one info-dict per selected
        format. Raises ExtractorError on unsupported/undownloadable
        videos.
        """
        # Common shell mistake: an unquoted URL loses everything after '&'.
        if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
            self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')

        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Un-escape the JS-escaped slashes in the matched URL.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        self.report_video_info_webpage_download(video_id)
        if re.search(r'player-age-gate-content">', video_webpage) is not None:
            self.report_age_confirmation()
            age_gate = True
            # We simulate the access to the video from www.youtube.com/v/{video_id}
            # this can be viewed without login into Youtube
            # NOTE(review): 'sts' looks like a hard-coded player
            # timestamp the endpoint expects — confirm if it goes stale.
            data = compat_urllib_parse.urlencode({'video_id': video_id,
                                                  'el': 'embedded',
                                                  'gl': 'US',
                                                  'hl': 'en',
                                                  'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                                                  'asv': 3,
                                                  'sts':'1588',
                                                  })
            video_info_url = 'https://www.youtube.com/get_video_info?' + data
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
        else:
            age_gate = False
            # Try several 'el' variants; stop at the first response that
            # carries a 'token' (i.e. is actually playable).
            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                        % (video_id, el_type))
                video_info_webpage = self._download_webpage(video_info_url, video_id,
                                        note=False,
                                        errnote='unable to download video info webpage')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
            else:
                raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            raise ExtractorError(u'Unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        # We try first to get a high quality image:
        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                            video_webpage, re.DOTALL)
        if m_thumb is not None:
            video_thumbnail = m_thumb.group(1)
        elif 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = None
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalize separators/whitespace before parsing the date.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            # Fall back to the <meta name="description"> tag.
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, video_webpage)

        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, video_webpage)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # Decide which formats to download

        try:
            mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
            if not mobj:
                raise ValueError('Could not find vevo ID')
            info = json.loads(mobj.group(1))
            args = info['args']
            # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
            # this signatures are encrypted
            if 'url_encoded_fmt_stream_map' not in args:
                raise ValueError(u'No stream_map present')  # caught below
            m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
            if m_s is not None:
                # Prefer the page's stream map over get_video_info's,
                # since the latter lacks usable encrypted signatures.
                self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
                video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
            m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
            if m_s is not None:
                # Merge the adaptive (DASH) formats into the stream map.
                if 'url_encoded_fmt_stream_map' in video_info:
                    video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
                else:
                    video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
            elif 'adaptive_fmts' in video_info:
                if 'url_encoded_fmt_stream_map' in video_info:
                    video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
                else:
                    video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']
        except ValueError:
            # Best-effort: fall through with whatever video_info holds.
            pass

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
                raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
            url_map = {}
            # The stream map is a comma-separated list of URL-encoded
            # per-format query strings.
            for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    url = url_data['url'][0]
                    if 'sig' in url_data:
                        # Plain (already decrypted) signature.
                        url += '&signature=' + url_data['sig'][0]
                    elif 's' in url_data:
                        encrypted_sig = url_data['s'][0]
                        if self._downloader.params.get('verbose'):
                            # Describe which player served the signature,
                            # to help debugging decryption failures.
                            if age_gate:
                                if player_url is None:
                                    player_version = 'unknown'
                                else:
                                    player_version = self._search_regex(
                                        r'-(.+)\.swf$', player_url,
                                        u'flash player', fatal=False)
                                player_desc = 'flash player %s' % player_version
                            else:
                                player_version = self._search_regex(
                                    r'html5player-(.+?)\.js', video_webpage,
                                    'html5 player', fatal=False)
                                player_desc = u'html5 player %s' % player_version

                            parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                            self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                                (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                        if not age_gate:
                            jsplayer_url_json = self._search_regex(
                                r'"assets":.+?"js":\s*("[^"]+")',
                                video_webpage, u'JS player URL')
                            player_url = json.loads(jsplayer_url_json)

                        signature = self._decrypt_signature(
                            encrypted_sig, video_id, player_url, age_gate)
                        url += '&signature=' + signature
                    if 'ratebypass' not in url:
                        url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url
            video_url_list = self._get_video_url_list(url_map)
            if not video_url_list:
                # --list-formats was requested; nothing to download.
                return
        elif video_info.get('hlsvp'):
            # Live/HLS streams expose an m3u8 manifest instead.
            manifest_url = video_info['hlsvp'][0]
            url_map = self._extract_from_m3u8(manifest_url, video_id)
            video_url_list = self._get_video_url_list(url_map)
            if not video_url_list:
                return

        else:
            raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'),
                                              ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
1498
class YoutubePlaylistIE(InfoExtractor):
    """Extractor for YouTube playlists, paging through the gdata API."""
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result('https://www.youtube.com/watch?v=' + video_id, 'Youtube')
            else:
                # BUG FIX: playlist_id already carries its PL/EC/UU/FL
                # prefix (captured by _VALID_URL), so the old hard-coded
                # 'PL%s' printed a doubled prefix like 'PLPLxxxx'.
                self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        # Download playlist videos from API
        videos = []

        for page_num in itertools.count(1):
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            if start_index >= 1000:
                # The gdata API rejects start-index values past 1000.
                self._downloader.report_warning(u'Max number of results reached')
                break
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            for entry in response['feed']['entry']:
                index = entry['yt$position']['$t']
                if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
                    videos.append((
                        index,
                        'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']
                    ))

        # Order by playlist position, then drop the position index.
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1576
1577
class YoutubeChannelIE(InfoExtractor):
    """Extractor for all videos of a YouTube channel."""
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the watch-link video ids found in *page*, in first-seen
        order, without duplicates."""
        found = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            vid = mobj.group(1)
            if vid not in found:
                found.append(vid)
        return found

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        channel_id = mobj.group(1)

        # The first page is plain HTML.
        pagenum = 1
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)
        video_ids = []
        video_ids.extend(self.extract_videos_from_page(page))

        # Any further pages come from the json-based channel_ajax query.
        if self._MORE_PAGES_INDICATOR in page:
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)
                page = json.loads(page)
                video_ids.extend(self.extract_videos_from_page(page['content_html']))
                # Stop once the widget no longer offers a "load more" link.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
        return [self.playlist_result(url_entries, channel_id)]
1632
1633
class YoutubeUserIE(InfoExtractor):
    """Extractor for all uploads of a YouTube user, via the gdata API."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        # Our regex is too permissive, so defer to any other youtube
        # extractor that also matches this URL.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        username = mobj.group(1)

        # The Data API caps each response at _GDATA_PAGE_SIZE results,
        # so page through until a response comes back short or empty.
        video_ids = []
        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            # The id field looks like '.../<video_id>'; keep the tail.
            ids_in_page = [entry['id']['$t'].split('/')[-1]
                           for entry in response['feed']['entry']]
            video_ids.extend(ids_in_page)

            # A page shorter than the page size must be the last one.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
1698
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor backed by the gdata videos API."""
    IE_DESC = u'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        video_ids = []
        limit = n
        pagenum = 0

        # 50 results per API page; stop once we have covered the limit.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50 * pagenum) + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            video_ids.extend(video['id'] for video in api_response['items'])

            # The service may hold fewer results than requested.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return self.playlist_result(videos, query)
1740
1741
class YoutubeShowIE(InfoExtractor):
    """Extractor for multi-season shows; yields one playlist per season."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
        return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
                for season in m_seasons]
1755
1756
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    _PAGING_STEP = 30
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        feed_entries = []
        # The step argument is available only in 2.7 or higher
        for i in itertools.count(0):
            paging = i * self._PAGING_STEP
            info = self._download_webpage(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % i)
            info = json.loads(info)
            # Dedupe ids while keeping first-seen order.
            ids = orderedSet(m.group(1) for m in
                             re.finditer(r'"/watch\?v=(.*?)["&]', info['feed_html']))
            feed_entries.extend(self.url_result(id, 'Youtube') for id in ids)
            # A null 'paging' token means this was the last page.
            if info['paging'] is None:
                break
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1798
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's subscriptions."""
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
1804
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's recommended videos."""
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
1810
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's watch-later list.

    Uses the personal-feed action and a larger paging step than the
    other feeds."""
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    _PAGING_STEP = 100
    _PERSONAL_FEED = True
1818
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor for the logged-in user's favourites list."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds the id of the backing playlist;
        # hand extraction over to YoutubePlaylistIE.
        webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, u'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')