]> jfr.im git - yt-dlp.git/blame_incremental - youtube_dl/extractor/youtube.py
[youtube] Add filesystem signature cache
[yt-dlp.git] / youtube_dl / extractor / youtube.py
... / ...
CommitLineData
1# coding: utf-8
2
3import collections
4import itertools
5import io
6import json
7import operator
8import os.path
9import re
10import shutil
11import socket
12import string
13import struct
14import traceback
15import zlib
16
17from .common import InfoExtractor, SearchInfoExtractor
18from .subtitles import SubtitlesInfoExtractor
from ..utils import (
    compat_chr,
    compat_http_client,
    compat_parse_qs,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_request,
    compat_str,

    clean_html,
    get_element_by_id,
    ExtractorError,
    unescapeHTML,
    unified_strdate,
    orderedSet,
    write_json_file,
)
34
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def _set_language(self):
        """Request the language page so subsequent requests use English/US.

        Returns True on success, False (after a warning) on network failure.
        """
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return False
        return True

    def _login(self):
        """Log in to YouTube with the configured credentials.

        Returns True if login succeeded, False otherwise (also when no
        credentials were supplied and _LOGIN_REQUIRED is False).
        Raises ExtractorError when credentials are missing but required.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return False

        # Hidden anti-forgery tokens embedded in the login form
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Without the GALX token the login POST cannot succeed; bail out
        # with a warning instead of crashing on None.encode() below.
        if galx is None:
            self._downloader.report_warning(u'unable to log in: could not find GALX token in login page')
            return False
        if dsh is None:
            # dsh is not always present in the form; send it empty then.
            dsh = u''

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'dsh': dsh,
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is served again, the credentials were rejected
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return False
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return False
        return True

    def _confirm_age(self):
        """Submit the age confirmation form.

        Returns True on success; raises ExtractorError on network failure.
        """
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
        return True

    def _real_initialize(self):
        """Set language, log in (if possible) and confirm age before extraction."""
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
141
142
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    # Matches bare video IDs as well as watch/embed/v URLs and youtu.be links.
    # Verbose regex: internal whitespace and #-comments are ignored by re.VERBOSE.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)?    # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/                                          # just youtu.be/xxxx
                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Query parameter carrying the real target of an age-verification redirect
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
                          # Apple HTTP Live Streaming
                          '96', '95', '94', '93', '92', '132', '151',
                          # 3D
                          '85', '84', '102', '83', '101', '82', '100',
                          # Dash video
                          '138', '137', '248', '136', '247', '135', '246',
                          '245', '244', '134', '243', '133', '242', '160',
                          # Dash audio
                          '141', '172', '140', '171', '139',
                          ]
    # Same itags, but free (WebM) formats ranked above proprietary ones of equal quality
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
                                      # Apple HTTP Live Streaming
                                      '96', '95', '94', '93', '92', '132', '151',
                                      # 3D
                                      '85', '102', '84', '101', '83', '100', '82',
                                      # Dash video
                                      '138', '248', '137', '247', '136', '246', '245',
                                      '244', '135', '243', '134', '242', '133', '160',
                                      # Dash audio
                                      '172', '141', '171', '140', '139',
                                      ]
    # Container -> itags for that container, best quality first
    _video_formats_map = {
        'flv': ['35', '34', '6', '5'],
        '3gp': ['36', '17', '13'],
        'mp4': ['38', '37', '22', '18'],
        'webm': ['46', '45', '44', '43'],
    }
    # itag -> file extension
    _video_extensions = {
        '13': '3gp',
        '17': '3gp',
        '18': 'mp4',
        '22': 'mp4',
        '36': '3gp',
        '37': 'mp4',
        '38': 'mp4',
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',

        # 3d videos
        '82': 'mp4',
        '83': 'mp4',
        '84': 'mp4',
        '85': 'mp4',
        '100': 'webm',
        '101': 'webm',
        '102': 'webm',

        # Apple HTTP Live Streaming
        '92': 'mp4',
        '93': 'mp4',
        '94': 'mp4',
        '95': 'mp4',
        '96': 'mp4',
        '132': 'mp4',
        '151': 'mp4',

        # Dash mp4
        '133': 'mp4',
        '134': 'mp4',
        '135': 'mp4',
        '136': 'mp4',
        '137': 'mp4',
        '138': 'mp4',
        '139': 'mp4',
        '140': 'mp4',
        '141': 'mp4',
        '160': 'mp4',

        # Dash webm
        '171': 'webm',
        '172': 'webm',
        '242': 'webm',
        '243': 'webm',
        '244': 'webm',
        '245': 'webm',
        '246': 'webm',
        '247': 'webm',
        '248': 'webm',
    }
    # itag -> dimensions ("height x width"), quality label, or audio bitrate.
    # NOTE(review): '151': '72p' looks like a typo for '720p' upstream — confirm
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '36': '240x320',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
        '82': '360p',
        '83': '480p',
        '84': '720p',
        '85': '1080p',
        '92': '240p',
        '93': '360p',
        '94': '480p',
        '95': '720p',
        '96': '1080p',
        '100': '360p',
        '101': '480p',
        '102': '720p',
        '132': '240p',
        '151': '72p',
        '133': '240p',
        '134': '360p',
        '135': '480p',
        '136': '720p',
        '137': '1080p',
        '138': '>1080p',
        '139': '48k',
        '140': '128k',
        '141': '256k',
        '160': '192p',
        '171': '128k',
        '172': '256k',
        '242': '240p',
        '243': '360p',
        '244': '480p',
        '245': '480p',
        '246': '480p',
        '247': '720p',
        '248': '1080p',
    }
    # itag -> extra note appended for 3D and DASH formats
    _special_itags = {
        '82': '3D',
        '83': '3D',
        '84': '3D',
        '85': '3D',
        '100': '3D',
        '101': '3D',
        '102': '3D',
        '133': 'DASH Video',
        '134': 'DASH Video',
        '135': 'DASH Video',
        '136': 'DASH Video',
        '137': 'DASH Video',
        '138': 'DASH Video',
        '139': 'DASH Audio',
        '140': 'DASH Audio',
        '141': 'DASH Audio',
        '160': 'DASH Video',
        '171': 'DASH Audio',
        '172': 'DASH Audio',
        '242': 'DASH Video',
        '243': 'DASH Video',
        '244': 'DASH Video',
        '245': 'DASH Video',
        '246': 'DASH Video',
        '247': 'DASH Video',
        '248': 'DASH Video',
    }

    IE_NAME = u'youtube'
    _TESTS = [
        {
            u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
            u"file": u"BaW_jenozKc.mp4",
            u"info_dict": {
                u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
                u"uploader": u"Philipp Hagemeister",
                u"uploader_id": u"phihag",
                u"upload_date": u"20121002",
                u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
            u"file": u"1ltcDfZMA3U.flv",
            u"note": u"Test VEVO video (#897)",
            u"info_dict": {
                u"upload_date": u"20070518",
                u"title": u"Maps - It Will Find You",
                u"description": u"Music video by Maps performing It Will Find You.",
                u"uploader": u"MuteUSA",
                u"uploader_id": u"MuteUSA"
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
            u"file": u"UxxajLWwzqY.mp4",
            u"note": u"Test generic use_cipher_signature video (#897)",
            u"info_dict": {
                u"upload_date": u"20120506",
                u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
                u"description": u"md5:3e2666e0a55044490499ea45fe9037b7",
                u"uploader": u"Icona Pop",
                u"uploader_id": u"IconaPop"
            }
        },
        {
            u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
            u"file": u"07FYdnEawAQ.mp4",
            u"note": u"Test VEVO video with age protection (#956)",
            u"info_dict": {
                u"upload_date": u"20130703",
                u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
                u"description": u"md5:64249768eec3bc4276236606ea996373",
                u"uploader": u"justintimberlakeVEVO",
                u"uploader_id": u"justintimberlakeVEVO"
            }
        },
        {
            u'url': u'https://www.youtube.com/watch?v=TGi3HqYrWHE',
            u'file': u'TGi3HqYrWHE.mp4',
            u'note': u'm3u8 video',
            u'info_dict': {
                u'title': u'Triathlon - Men - London 2012 Olympic Games',
                u'description': u'- Men - TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games',
                u'uploader': u'olympic',
                u'upload_date': u'20120807',
                u'uploader_id': u'olympic',
            },
            u'params': {
                u'skip_download': True,
            },
        },
    ]
395
396
397 @classmethod
398 def suitable(cls, url):
399 """Receives a URL and returns True if suitable for this IE."""
400 if YoutubePlaylistIE.suitable(url): return False
401 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
402
403 def __init__(self, *args, **kwargs):
404 super(YoutubeIE, self).__init__(*args, **kwargs)
405 self._player_cache = {}
406
    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')
426
427 def _extract_signature_function(self, video_id, player_url, slen):
428 id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
429 player_url)
430 player_type = id_m.group('ext')
431 player_id = id_m.group('id')
432
433 # Read from filesystem cache
434 func_id = '%s_%s_%d' % (player_type, player_id, slen)
435 assert os.path.basename(func_id) == func_id
436 cache_dir = self.downloader.params.get('cachedir',
437 u'~/.youtube-dl/cache')
438
439 if cache_dir is not False:
440 cache_fn = os.path.join(os.path.expanduser(cache_dir),
441 u'youtube-sigfuncs',
442 func_id + '.json')
443 try:
444 with io.open(cache_fn, '', encoding='utf-8') as cachef:
445 cache_spec = json.load(cachef)
446 return lambda s: u''.join(s[i] for i in cache_spec)
447 except OSError:
448 pass # No cache available
449
450 if player_type == 'js':
451 code = self._download_webpage(
452 player_url, video_id,
453 note=u'Downloading %s player %s' % (player_type, player_id),
454 errnote=u'Download of %s failed' % player_url)
455 res = self._parse_sig_js(code)
456 elif player_type == 'swf':
457 urlh = self._request_webpage(
458 player_url, video_id,
459 note=u'Downloading %s player %s' % (player_type, player_id),
460 errnote=u'Download of %s failed' % player_url)
461 code = urlh.read()
462 res = self._parse_sig_swf(code)
463 else:
464 assert False, 'Invalid player type %r' % player_type
465
466 if cache_dir is not False:
467 cache_res = res(map(compat_chr, range(slen)))
468 cache_spec = [ord(c) for c in cache_res]
469 shutil.makedirs(os.path.dirname(cache_fn))
470 write_json_file(cache_spec, cache_fn)
471
472 return res
473
474 def _parse_sig_js(self, jscode):
475 funcname = self._search_regex(
476 r'signature=([a-zA-Z]+)', jscode,
477 u'Initial JS player signature function name')
478
479 functions = {}
480
481 def argidx(varname):
482 return string.lowercase.index(varname)
483
484 def interpret_statement(stmt, local_vars, allow_recursion=20):
485 if allow_recursion < 0:
486 raise ExctractorError(u'Recursion limit reached')
487
488 if stmt.startswith(u'var '):
489 stmt = stmt[len(u'var '):]
490 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
491 r'=(?P<expr>.*)$', stmt)
492 if ass_m:
493 if ass_m.groupdict().get('index'):
494 def assign(val):
495 lvar = local_vars[ass_m.group('out')]
496 idx = interpret_expression(ass_m.group('index'),
497 local_vars, allow_recursion)
498 assert isinstance(idx, int)
499 lvar[idx] = val
500 return val
501 expr = ass_m.group('expr')
502 else:
503 def assign(val):
504 local_vars[ass_m.group('out')] = val
505 return val
506 expr = ass_m.group('expr')
507 elif stmt.startswith(u'return '):
508 assign = lambda v: v
509 expr = stmt[len(u'return '):]
510 else:
511 raise ExtractorError(
512 u'Cannot determine left side of statement in %r' % stmt)
513
514 v = interpret_expression(expr, local_vars, allow_recursion)
515 return assign(v)
516
517 def interpret_expression(expr, local_vars, allow_recursion):
518 if expr.isdigit():
519 return int(expr)
520
521 if expr.isalpha():
522 return local_vars[expr]
523
524 m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
525 if m:
526 member = m.group('member')
527 val = local_vars[m.group('in')]
528 if member == 'split("")':
529 return list(val)
530 if member == 'join("")':
531 return u''.join(val)
532 if member == 'length':
533 return len(val)
534 if member == 'reverse()':
535 return val[::-1]
536 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
537 if slice_m:
538 idx = interpret_expression(
539 slice_m.group('idx'), local_vars, allow_recursion-1)
540 return val[idx:]
541
542 m = re.match(
543 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
544 if m:
545 val = local_vars[m.group('in')]
546 idx = interpret_expression(m.group('idx'), local_vars,
547 allow_recursion-1)
548 return val[idx]
549
550 m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
551 if m:
552 a = interpret_expression(m.group('a'),
553 local_vars, allow_recursion)
554 b = interpret_expression(m.group('b'),
555 local_vars, allow_recursion)
556 return a % b
557
558 m = re.match(
559 r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
560 if m:
561 fname = m.group('func')
562 if fname not in functions:
563 functions[fname] = extract_function(fname)
564 argvals = [int(v) if v.isdigit() else local_vars[v]
565 for v in m.group('args').split(',')]
566 return functions[fname](argvals)
567 raise ExtractorError(u'Unsupported JS expression %r' % expr)
568
569 def extract_function(funcname):
570 func_m = re.search(
571 r'function ' + re.escape(funcname) +
572 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
573 jscode)
574 argnames = func_m.group('args').split(',')
575
576 def resf(args):
577 local_vars = dict(zip(argnames, args))
578 for stmt in func_m.group('code').split(';'):
579 res = interpret_statement(stmt, local_vars)
580 return res
581 return resf
582
583 initial_function = extract_function(funcname)
584 return lambda s: initial_function([s])
585
    def _parse_sig_swf(self, file_contents):
        """Extract the signature-decryption function from a Flash (SWF) player.

        Decompresses the SWF, locates the DoABC tag (tag code 82), parses the
        contained ABC (AVM2) bytecode just far enough to find the
        'SignatureDecipher' class, and builds Python equivalents of its
        methods by interpreting the subset of AVM2 opcodes they use.

        Returns a function that maps an encrypted signature string to its
        decrypted form (by invoking the player's 'decipher' method).
        """
        # SWF header is 'FWS' (uncompressed) or 'CWS' (zlib-compressed)
        if file_contents[1:3] != b'WS':
            raise ExtractorError(
                u'Not an SWF file; header is %r' % file_contents[:3])
        if file_contents[:1] == b'C':
            content = zlib.decompress(file_contents[8:])
        else:
            raise NotImplementedError(u'Unsupported compression format %r' %
                                      file_contents[:1])

        def extract_tags(content):
            # Yield (tag_code, tag_body) for each SWF tag in the body
            pos = 0
            while pos < len(content):
                header16 = struct.unpack('<H', content[pos:pos+2])[0]
                pos += 2
                tag_code = header16 >> 6
                tag_len = header16 & 0x3f
                if tag_len == 0x3f:
                    # Long-form tag: real length follows in 4 bytes
                    tag_len = struct.unpack('<I', content[pos:pos+4])[0]
                    pos += 4
                assert pos+tag_len <= len(content)
                yield (tag_code, content[pos:pos+tag_len])
                pos += tag_len

        # Tag code 82 is DoABC (embedded ActionScript 3 bytecode)
        code_tag = next(tag
                        for tag_code, tag in extract_tags(content)
                        if tag_code == 82)
        # Skip the DoABC flags (4 bytes) and NUL-terminated name
        p = code_tag.index(b'\0', 4) + 1
        code_reader = io.BytesIO(code_tag[p:])

        # Parse ABC (AVM2 ByteCode)
        def read_int(reader=None):
            # Variable-length unsigned integer, 7 bits per byte, LSB first
            if reader is None:
                reader = code_reader
            res = 0
            shift = 0
            for _ in range(5):
                buf = reader.read(1)
                assert len(buf) == 1
                b = struct.unpack('<B', buf)[0]
                res = res | ((b & 0x7f) << shift)
                if b & 0x80 == 0:
                    break
                shift += 7
            return res

        def u30(reader=None):
            # Unsigned 30-bit integer
            res = read_int(reader)
            assert res & 0xf0000000 == 0
            return res
        u32 = read_int

        def s32(reader=None):
            # Signed 32-bit integer (two's complement)
            v = read_int(reader)
            if v & 0x80000000 != 0:
                v = - ((v ^ 0xffffffff) + 1)
            return v

        def string(reader=None):
            # Length-prefixed UTF-8 string
            if reader is None:
                reader = code_reader
            slen = u30(reader)
            resb = reader.read(slen)
            assert len(resb) == slen
            return resb.decode('utf-8')

        def read_bytes(count, reader=None):
            if reader is None:
                reader = code_reader
            resb = reader.read(count)
            assert len(resb) == count
            return resb

        def read_byte(reader=None):
            resb = read_bytes(1, reader=reader)
            res = struct.unpack('<B', resb)[0]
            return res

        # minor_version + major_version
        _ = read_bytes(2 + 2)

        # Constant pool
        # (all pools are 1-based; index 0 is an implicit default entry)
        int_count = u30()
        for _c in range(1, int_count):
            _ = s32()
        uint_count = u30()
        for _c in range(1, uint_count):
            _ = u32()
        double_count = u30()
        _ = read_bytes((double_count-1) * 8)
        string_count = u30()
        constant_strings = [u'']
        for _c in range(1, string_count):
            s = string()
            constant_strings.append(s)
        namespace_count = u30()
        for _c in range(1, namespace_count):
            _ = read_bytes(1)  # kind
            _ = u30()  # name
        ns_set_count = u30()
        for _c in range(1, ns_set_count):
            count = u30()
            for _c2 in range(count):
                _ = u30()
        multiname_count = u30()
        # Number of trailing u30 fields to skip for each multiname kind
        MULTINAME_SIZES = {
            0x07: 2,  # QName
            0x0d: 2,  # QNameA
            0x0f: 1,  # RTQName
            0x10: 1,  # RTQNameA
            0x11: 0,  # RTQNameL
            0x12: 0,  # RTQNameLA
            0x09: 2,  # Multiname
            0x0e: 2,  # MultinameA
            0x1b: 1,  # MultinameL
            0x1c: 1,  # MultinameLA
        }
        multinames = [u'']
        for _c in range(1, multiname_count):
            kind = u30()
            assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
            if kind == 0x07:
                # Only QNames are resolved to their plain string name
                namespace_idx = u30()
                name_idx = u30()
                multinames.append(constant_strings[name_idx])
            else:
                multinames.append('[MULTINAME kind: %d]' % kind)
                for _c2 in range(MULTINAME_SIZES[kind]):
                    _ = u30()

        # Methods
        method_count = u30()
        MethodInfo = collections.namedtuple(
            'MethodInfo',
            ['NEED_ARGUMENTS', 'NEED_REST'])
        method_infos = []
        for method_id in range(method_count):
            param_count = u30()
            _ = u30()  # return type
            for _ in range(param_count):
                _ = u30()  # param type
            _ = u30()  # name index (always 0 for youtube)
            flags = read_byte()
            if flags & 0x08 != 0:
                # Options present
                option_count = u30()
                for c in range(option_count):
                    _ = u30()  # val
                    _ = read_bytes(1)  # kind
            if flags & 0x80 != 0:
                # Param names present
                for _ in range(param_count):
                    _ = u30()  # param name
            mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
            method_infos.append(mi)

        # Metadata
        metadata_count = u30()
        for _c in range(metadata_count):
            _ = u30()  # name
            item_count = u30()
            for _c2 in range(item_count):
                _ = u30()  # key
                _ = u30()  # value

        def parse_traits_info():
            # Parse one trait entry; returns {trait name: method index}
            # for method-like traits (empty dict otherwise)
            trait_name_idx = u30()
            kind_full = read_byte()
            kind = kind_full & 0x0f
            attrs = kind_full >> 4
            methods = {}
            if kind in [0x00, 0x06]:  # Slot or Const
                _ = u30()  # Slot id
                type_name_idx = u30()
                vindex = u30()
                if vindex != 0:
                    _ = read_byte()  # vkind
            elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
                _ = u30()  # disp_id
                method_idx = u30()
                methods[multinames[trait_name_idx]] = method_idx
            elif kind == 0x04:  # Class
                _ = u30()  # slot_id
                _ = u30()  # classi
            elif kind == 0x05:  # Function
                _ = u30()  # slot_id
                function_idx = u30()
                methods[function_idx] = multinames[trait_name_idx]
            else:
                raise ExtractorError(u'Unsupported trait kind %d' % kind)

            if attrs & 0x4 != 0:  # Metadata present
                metadata_count = u30()
                for _c3 in range(metadata_count):
                    _ = u30()

            return methods

        # Classes
        TARGET_CLASSNAME = u'SignatureDecipher'
        searched_idx = multinames.index(TARGET_CLASSNAME)
        searched_class_id = None
        class_count = u30()
        for class_id in range(class_count):
            name_idx = u30()
            if name_idx == searched_idx:
                # We found the class we're looking for!
                searched_class_id = class_id
            _ = u30()  # super_name idx
            flags = read_byte()
            if flags & 0x08 != 0:  # Protected namespace is present
                protected_ns_idx = u30()
            intrf_count = u30()
            for _c2 in range(intrf_count):
                _ = u30()
            _ = u30()  # iinit
            trait_count = u30()
            for _c2 in range(trait_count):
                _ = parse_traits_info()

        if searched_class_id is None:
            raise ExtractorError(u'Target class %r not found' %
                                 TARGET_CLASSNAME)

        # Second pass over classes: collect the target class' method traits
        method_names = {}
        method_idxs = {}
        for class_id in range(class_count):
            _ = u30()  # cinit
            trait_count = u30()
            for _c2 in range(trait_count):
                trait_methods = parse_traits_info()
                if class_id == searched_class_id:
                    method_names.update(trait_methods.items())
                    method_idxs.update(dict(
                        (idx, name)
                        for name, idx in trait_methods.items()))

        # Scripts
        script_count = u30()
        for _c in range(script_count):
            _ = u30()  # init
            trait_count = u30()
            for _c2 in range(trait_count):
                _ = parse_traits_info()

        # Method bodies
        method_body_count = u30()
        Method = collections.namedtuple('Method', ['code', 'local_count'])
        methods = {}
        for _c in range(method_body_count):
            method_idx = u30()
            max_stack = u30()
            local_count = u30()
            init_scope_depth = u30()
            max_scope_depth = u30()
            code_length = u30()
            code = read_bytes(code_length)
            if method_idx in method_idxs:
                # Keep only the bodies of the target class' methods
                m = Method(code, local_count)
                methods[method_idxs[method_idx]] = m
            exception_count = u30()
            for _c2 in range(exception_count):
                _ = u30()  # from
                _ = u30()  # to
                _ = u30()  # target
                _ = u30()  # exc_type
                _ = u30()  # var_name
            trait_count = u30()
            for _c2 in range(trait_count):
                _ = parse_traits_info()

        # Sanity checks: we consumed the whole tag and found every method body
        assert p + code_reader.tell() == len(code_tag)
        assert len(methods) == len(method_idxs)

        method_pyfunctions = {}

        def extract_function(func_name):
            # Translate a named method of the target class into a Python
            # function by interpreting its AVM2 bytecode; results are memoized.
            if func_name in method_pyfunctions:
                return method_pyfunctions[func_name]
            if func_name not in methods:
                raise ExtractorError(u'Cannot find function %r' % func_name)
            m = methods[func_name]

            def resfunc(args):
                # register 0 is 'this'; the rest are arguments then locals
                registers = ['(this)'] + list(args) + [None] * m.local_count
                stack = []
                coder = io.BytesIO(m.code)
                while True:
                    opcode = struct.unpack('!B', coder.read(1))[0]
                    if opcode == 36:  # pushbyte
                        v = struct.unpack('!B', coder.read(1))[0]
                        stack.append(v)
                    elif opcode == 44:  # pushstring
                        idx = u30(coder)
                        stack.append(constant_strings[idx])
                    elif opcode == 48:  # pushscope
                        # We don't implement the scope register, so we'll just
                        # ignore the popped value
                        stack.pop()
                    elif opcode == 70:  # callproperty
                        index = u30(coder)
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        obj = stack.pop()
                        if mname == u'split':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, compat_str)
                            if args[0] == u'':
                                res = list(obj)
                            else:
                                res = obj.split(args[0])
                            stack.append(res)
                        elif mname == u'slice':
                            assert len(args) == 1
                            assert isinstance(args[0], int)
                            assert isinstance(obj, list)
                            res = obj[args[0]:]
                            stack.append(res)
                        elif mname == u'join':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, list)
                            res = args[0].join(obj)
                            stack.append(res)
                        elif mname in method_pyfunctions:
                            stack.append(method_pyfunctions[mname](args))
                        else:
                            raise NotImplementedError(
                                u'Unsupported property %r on %r'
                                % (mname, obj))
                    elif opcode == 72:  # returnvalue
                        res = stack.pop()
                        return res
                    elif opcode == 79:  # callpropvoid
                        index = u30(coder)
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        obj = stack.pop()
                        if mname == u'reverse':
                            assert isinstance(obj, list)
                            obj.reverse()
                        else:
                            raise NotImplementedError(
                                u'Unsupported (void) property %r on %r'
                                % (mname, obj))
                    elif opcode == 93:  # findpropstrict
                        index = u30(coder)
                        mname = multinames[index]
                        res = extract_function(mname)
                        stack.append(res)
                    elif opcode == 97:  # setproperty
                        index = u30(coder)
                        value = stack.pop()
                        idx = stack.pop()
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        assert isinstance(idx, int)
                        obj[idx] = value
                    elif opcode == 98:  # getlocal
                        index = u30(coder)
                        stack.append(registers[index])
                    elif opcode == 99:  # setlocal
                        index = u30(coder)
                        value = stack.pop()
                        registers[index] = value
                    elif opcode == 102:  # getproperty
                        index = u30(coder)
                        pname = multinames[index]
                        if pname == u'length':
                            obj = stack.pop()
                            assert isinstance(obj, list)
                            stack.append(len(obj))
                        else:  # Assume attribute access
                            idx = stack.pop()
                            assert isinstance(idx, int)
                            obj = stack.pop()
                            assert isinstance(obj, list)
                            stack.append(obj[idx])
                    elif opcode == 128:  # coerce
                        _ = u30(coder)
                    elif opcode == 133:  # coerce_s
                        assert isinstance(stack[-1], (type(None), compat_str))
                    elif opcode == 164:  # modulo
                        value2 = stack.pop()
                        value1 = stack.pop()
                        res = value1 % value2
                        stack.append(res)
                    elif opcode == 208:  # getlocal_0
                        stack.append(registers[0])
                    elif opcode == 209:  # getlocal_1
                        stack.append(registers[1])
                    elif opcode == 210:  # getlocal_2
                        stack.append(registers[2])
                    elif opcode == 211:  # getlocal_3
                        stack.append(registers[3])
                    elif opcode == 214:  # setlocal_2
                        registers[2] = stack.pop()
                    elif opcode == 215:  # setlocal_3
                        registers[3] = stack.pop()
                    else:
                        raise NotImplementedError(
                            u'Unsupported opcode %d' % opcode)

            method_pyfunctions[func_name] = resfunc
            return resfunc

        initial_function = extract_function(u'decipher')
        return lambda s: initial_function([s])
999
1000 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
1001 """Turn the encrypted s field into a working signature"""
1002
1003 if player_url is not None:
1004 try:
1005 if player_url not in self._player_cache:
1006 func = self._extract_signature_function(
1007 video_id, player_url, len(s)
1008 )
1009 self._player_cache[player_url] = func
1010 return self._player_cache[player_url](s)
1011 except Exception as e:
1012 tb = traceback.format_exc()
1013 self._downloader.report_warning(
1014 u'Automatic signature extraction failed: ' + tb)
1015
1016 self._downloader.report_warning(
1017 u'Warning: Falling back to static signature algorithm')
1018 return self._static_decrypt_signature(
1019 s, video_id, player_url, age_gate)
1020
1021 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
1022 if age_gate:
1023 # The videos with age protection use another player, so the
1024 # algorithms can be different.
1025 if len(s) == 86:
1026 return s[2:63] + s[82] + s[64:82] + s[63]
1027
1028 if len(s) == 92:
1029 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
1030 elif len(s) == 90:
1031 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
1032 elif len(s) == 89:
1033 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
1034 elif len(s) == 88:
1035 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
1036 elif len(s) == 87:
1037 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
1038 elif len(s) == 86:
1039 return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53]
1040 elif len(s) == 85:
1041 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
1042 elif len(s) == 84:
1043 return s[81:36:-1] + s[0] + s[35:2:-1]
1044 elif len(s) == 83:
1045 return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]
1046 elif len(s) == 82:
1047 return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54]
1048 elif len(s) == 81:
1049 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1050 elif len(s) == 80:
1051 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
1052 elif len(s) == 79:
1053 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1054
1055 else:
1056 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
1057
1058 def _decrypt_signature_age_gate(self, s):
1059 # The videos with age protection use another player, so the algorithms
1060 # can be different.
1061 if len(s) == 86:
1062 return s[2:63] + s[82] + s[64:82] + s[63]
1063 else:
1064 # Fallback to the other algortihms
1065 return self._decrypt_signature(s)
1066
1067 def _get_available_subtitles(self, video_id):
1068 try:
1069 sub_list = self._download_webpage(
1070 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1071 video_id, note=False)
1072 except ExtractorError as err:
1073 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
1074 return {}
1075 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
1076
1077 sub_lang_list = {}
1078 for l in lang_list:
1079 lang = l[1]
1080 params = compat_urllib_parse.urlencode({
1081 'lang': lang,
1082 'v': video_id,
1083 'fmt': self._downloader.params.get('subtitlesformat'),
1084 })
1085 url = u'http://www.youtube.com/api/timedtext?' + params
1086 sub_lang_list[lang] = url
1087 if not sub_lang_list:
1088 self._downloader.report_warning(u'video doesn\'t have subtitles')
1089 return {}
1090 return sub_lang_list
1091
1092 def _get_available_automatic_caption(self, video_id, webpage):
1093 """We need the webpage for getting the captions url, pass it as an
1094 argument to speed up the process."""
1095 sub_format = self._downloader.params.get('subtitlesformat')
1096 self.to_screen(u'%s: Looking for automatic captions' % video_id)
1097 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
1098 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
1099 if mobj is None:
1100 self._downloader.report_warning(err_msg)
1101 return {}
1102 player_config = json.loads(mobj.group(1))
1103 try:
1104 args = player_config[u'args']
1105 caption_url = args[u'ttsurl']
1106 timestamp = args[u'timestamp']
1107 # We get the available subtitles
1108 list_params = compat_urllib_parse.urlencode({
1109 'type': 'list',
1110 'tlangs': 1,
1111 'asrs': 1,
1112 })
1113 list_url = caption_url + '&' + list_params
1114 list_page = self._download_webpage(list_url, video_id)
1115 caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
1116 original_lang_node = caption_list.find('track')
1117 if original_lang_node.attrib.get('kind') != 'asr' :
1118 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
1119 return {}
1120 original_lang = original_lang_node.attrib['lang_code']
1121
1122 sub_lang_list = {}
1123 for lang_node in caption_list.findall('target'):
1124 sub_lang = lang_node.attrib['lang_code']
1125 params = compat_urllib_parse.urlencode({
1126 'lang': original_lang,
1127 'tlang': sub_lang,
1128 'fmt': sub_format,
1129 'ts': timestamp,
1130 'kind': 'asr',
1131 })
1132 sub_lang_list[sub_lang] = caption_url + '&' + params
1133 return sub_lang_list
1134 # An extractor error can be raise by the download process if there are
1135 # no automatic captions but there are subtitles
1136 except (KeyError, ExtractorError):
1137 self._downloader.report_warning(err_msg)
1138 return {}
1139
1140 def _print_formats(self, formats):
1141 print('Available formats:')
1142 for x in formats:
1143 print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
1144 self._video_dimensions.get(x, '???'),
1145 ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
1146
1147 def _extract_id(self, url):
1148 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1149 if mobj is None:
1150 raise ExtractorError(u'Invalid URL: %s' % url)
1151 video_id = mobj.group(2)
1152 return video_id
1153
    def _get_video_url_list(self, url_map):
        """
        Transform a dictionary in the format {itag:url} to a list of (itag, url)
        with the requested formats.

        Returns None when --list-formats was requested (the formats are
        printed instead); raises ExtractorError when nothing matches.
        """
        # User-requested format spec: None/'best', 'worst', '-1'/'all',
        # or a slash-delimited preference list such as '22/mp4/18'.
        req_format = self._downloader.params.get('format', None)
        format_limit = self._downloader.params.get('format_limit', None)
        available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
        # With a format_limit, keep only formats from the limit downwards
        # (the class-level lists are ordered best-first).
        if format_limit is not None and format_limit in available_formats:
            format_list = available_formats[available_formats.index(format_limit):]
        else:
            format_list = available_formats
        # Intersect the preference-ordered list with what the video offers.
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            raise ExtractorError(u'no known formats available for video')
        if self._downloader.params.get('listformats', None):
            # --list-formats: print the table and return None so the caller
            # aborts the extraction.
            self._print_formats(existing_formats)
            return
        if req_format is None or req_format == 'best':
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
        elif req_format in ('-1', 'all'):
            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
        else:
            # Specific formats. We pick the first in a slash-delimeted sequence.
            # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
            # available in the specified format. For example,
            # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
            # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
            # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
            req_formats = req_format.split('/')
            video_url_list = None
            for rf in req_formats:
                if rf in url_map:
                    # Exact itag match: take it and stop searching.
                    video_url_list = [(rf, url_map[rf])]
                    break
                if rf in self._video_formats_map:
                    # rf is a container alias (e.g. 'mp4'): try its itags in
                    # quality order. The inner for/else reaches 'continue'
                    # only when none of the alias's itags is available, in
                    # which case the outer loop moves on to the next rf;
                    # otherwise the outer 'break' ends the search.
                    for srf in self._video_formats_map[rf]:
                        if srf in url_map:
                            video_url_list = [(srf, url_map[srf])]
                            break
                    else:
                        continue
                    break
            if video_url_list is None:
                raise ExtractorError(u'requested format not available')
        return video_url_list
1202
1203 def _extract_from_m3u8(self, manifest_url, video_id):
1204 url_map = {}
1205 def _get_urls(_manifest):
1206 lines = _manifest.split('\n')
1207 urls = filter(lambda l: l and not l.startswith('#'),
1208 lines)
1209 return urls
1210 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1211 formats_urls = _get_urls(manifest)
1212 for format_url in formats_urls:
1213 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1214 url_map[itag] = format_url
1215 return url_map
1216
    def _real_extract(self, url):
        """Extract metadata and downloadable format URLs for one video.

        Returns a list of info dicts, one per selected format, or None when
        a listing option (--list-formats / --list-subs) was requested.
        """
        # Warn the user about shell metacharacters eating part of the URL.
        if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
            self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')

        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Un-escape the backslash-escaped URL found in the page's JS.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        self.report_video_info_webpage_download(video_id)
        if re.search(r'player-age-gate-content">', video_webpage) is not None:
            self.report_age_confirmation()
            age_gate = True
            # We simulate the access to the video from www.youtube.com/v/{video_id}
            # this can be viewed without login into Youtube
            data = compat_urllib_parse.urlencode({'video_id': video_id,
                                                  'el': 'embedded',
                                                  'gl': 'US',
                                                  'hl': 'en',
                                                  'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                                                  'asv': 3,
                                                  'sts':'1588',
                                                  })
            video_info_url = 'https://www.youtube.com/get_video_info?' + data
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
        else:
            age_gate = False
            # Try several 'el' values in turn until one returns a token.
            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                        % (video_id, el_type))
                video_info_webpage = self._download_webpage(video_info_url, video_id,
                                        note=False,
                                        errnote='unable to download video info webpage')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
            else:
                raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            raise ExtractorError(u'Unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        # We try first to get a high quality image:
        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                            video_webpage, re.DOTALL)
        if m_thumb is not None:
            video_thumbnail = m_thumb.group(1)
        elif 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalize separators to spaces before parsing the date.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            # Fall back to the <meta name="description"> tag.
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, video_webpage)

        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, video_webpage)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # Decide which formats to download

        # Merge the stream maps from the embedded player config into
        # video_info; a ValueError anywhere in this probe is non-fatal.
        try:
            mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
            if not mobj:
                raise ValueError('Could not find vevo ID')
            info = json.loads(mobj.group(1))
            args = info['args']
            # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
            # this signatures are encrypted
            m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
            if m_s is not None:
                self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
                video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
            m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
            if m_s is not None:
                if 'url_encoded_fmt_stream_map' in video_info:
                    video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
                else:
                    video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
            elif 'adaptive_fmts' in video_info:
                if 'url_encoded_fmt_stream_map' in video_info:
                    video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
                else:
                    video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']
        except ValueError:
            pass

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            # RTMP stream: single URL, no itag.
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
                raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
            url_map = {}
            # Each comma-separated chunk is a querystring describing one format.
            for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    url = url_data['url'][0]
                    if 'sig' in url_data:
                        # Plain signature: append as-is.
                        url += '&signature=' + url_data['sig'][0]
                    elif 's' in url_data:
                        # Encrypted signature: must be descrambled first.
                        encrypted_sig = url_data['s'][0]
                        if self._downloader.params.get('verbose'):
                            if age_gate:
                                player_version = self._search_regex(
                                    r'-(.+)\.swf$',
                                    player_url if player_url else None,
                                    'flash player', fatal=False)
                                player_desc = 'flash player %s' % player_version
                            else:
                                player_version = self._search_regex(
                                    r'html5player-(.+?)\.js', video_webpage,
                                    'html5 player', fatal=False)
                                player_desc = u'html5 player %s' % player_version

                            parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                            self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                                (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                        if not age_gate:
                            # Non-age-gated videos use the HTML5 player; its
                            # JS URL replaces the SWF player_url from above.
                            jsplayer_url_json = self._search_regex(
                                r'"assets":.+?"js":\s*("[^"]+")',
                                video_webpage, u'JS player URL')
                            player_url = json.loads(jsplayer_url_json)

                        signature = self._decrypt_signature(
                            encrypted_sig, video_id, player_url, age_gate)
                        url += '&signature=' + signature
                    if 'ratebypass' not in url:
                        url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url
            video_url_list = self._get_video_url_list(url_map)
            if not video_url_list:
                # --list-formats was handled inside _get_video_url_list.
                return
        elif video_info.get('hlsvp'):
            # HLS (m3u8) delivery, e.g. for live streams.
            manifest_url = video_info['hlsvp'][0]
            url_map = self._extract_from_m3u8(manifest_url, video_id)
            video_url_list = self._get_video_url_list(url_map)
            if not video_url_list:
                return

        else:
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one result dict per selected format.
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
                                                 self._video_dimensions.get(format_param, '???'),
                                                 ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')

            results.append({
                'id': video_id,
                'url': video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
            })
        return results
1460
class YoutubePlaylistIE(InfoExtractor):
    """Extractor for YouTube playlists, paging through the v2 GData API."""
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The playlist id may come from either alternative of the pattern.
        playlist_id = match.group(1) or match.group(2)

        # Page through the API, collecting (position, watch-url) pairs.
        video_tuples = []
        for page_num in itertools.count(1):
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            if start_index >= 1000:
                # The API refuses start indices beyond this point.
                self._downloader.report_warning(u'Max number of results reached')
                break
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            for entry in response['feed']['entry']:
                position = entry['yt$position']['$t']
                if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
                    video_tuples.append((
                        position,
                        'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']
                    ))

        # Order by playlist position, then keep just the URLs.
        ordered_urls = [pair[1] for pair in sorted(video_tuples)]

        url_results = [self.url_result(vurl, 'Youtube') for vurl in ordered_urls]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1528
1529
class YoutubeChannelIE(InfoExtractor):
    """Extractor for all of a YouTube channel's videos."""
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids found on *page*, deduplicated, in order of
        first appearance."""
        found = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = match.group(1)
            if video_id not in found:
                found.append(video_id)
        return found

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        channel_id = match.group(1)

        # The first page is plain HTML.
        pagenum = 1
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)
        video_ids = self.extract_videos_from_page(page)

        # Any further pages come from the JSON-based channel_ajax query.
        if self._MORE_PAGES_INDICATOR in page:
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)

                page = json.loads(page)

                video_ids.extend(self.extract_videos_from_page(page['content_html']))

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                       for video_id in video_ids]
        return [self.playlist_result(url_entries, channel_id)]
1584
1585
class YoutubeUserIE(InfoExtractor):
    """Extractor for a user's uploaded videos (URL or "ytuser" keyword)."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted with other youtube
        # extractor, the regex would is too permissive and it would match.
        other_ies = iter(klass for (name, klass) in globals().items()
                         if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = match.group(1)

        # The Data API caps each response (currently at 50 videos), so we
        # keep requesting pages until one comes back short or empty.
        video_ids = []

        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            # Extract video identifiers
            ids_in_page = [entry['id']['$t'].split('/')[-1]
                           for entry in response['feed']['entry']]
            video_ids.extend(ids_in_page)

            # A short page means there are no further ids to fetch.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

        url_results = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                       for video_id in video_ids]
        return [self.playlist_result(url_results, playlist_title = username)]
1650
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor ("ytsearch") backed by the GData videos API."""
    IE_DESC = u'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        video_ids = []
        limit = n
        pagenum = 0

        # Request 50-result pages until the limit is covered; the limit can
        # shrink once the API reports the true total.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50 * pagenum) + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            video_ids.extend(video['id'] for video in api_response['items'])

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                  for video_id in video_ids]
        return self.playlist_result(videos, query)
1692
1693
class YoutubeShowIE(InfoExtractor):
    """Extractor for multi-season YouTube shows (one playlist per season)."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(seasons)))
        return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
                for season in seasons]
1707
1708
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    _PAGING_STEP = 30
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template for one feed page (the paging offset fills %s)."""
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        feed_entries = []
        # The step argument is available only in 2.7 or higher
        for page_idx in itertools.count(0):
            paging = page_idx * self._PAGING_STEP
            info = self._download_webpage(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % page_idx)
            info = json.loads(info)
            feed_html = info['feed_html']
            matches = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            ids = orderedSet(match.group(1) for match in matches)
            feed_entries.extend(self.url_result(video_id, 'Youtube') for video_id in ids)
            if info['paging'] is None:
                # The server signals the last page with a null paging value.
                break
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1750
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the authenticated user's subscriptions feed."""
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
1756
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the authenticated user's recommended videos."""
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
1762
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the authenticated user's "Watch Later" list."""
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    # Larger page size, and the list is per-user so it must go through the
    # personal-feed action (see YoutubeFeedsInfoExtractor._FEED_TEMPLATE).
    _PAGING_STEP = 100
    _PERSONAL_FEED = True
1770
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor for the authenticated user's favourites list ("ytfav")."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The favourites page embeds the id of its backing playlist; defer
        # to the playlist extractor for the actual videos.
        playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, u'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')