]> jfr.im git - yt-dlp.git/blob - youtube_dl/extractor/youtube.py
Clarify a couple of calls
[yt-dlp.git] / youtube_dl / extractor / youtube.py
1 # coding: utf-8
2
import collections
import io
import itertools
import json
import netrc
import re
import socket
import string
import struct
import traceback
import xml.etree.ElementTree
import zlib

from .common import InfoExtractor, SearchInfoExtractor
from .subtitles import SubtitlesInfoExtractor
from ..utils import (
    compat_http_client,
    compat_parse_qs,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_request,
    compat_str,

    clean_html,
    get_element_by_id,
    ExtractorError,
    unescapeHTML,
    unified_strdate,
    orderedSet,
)
32
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def _set_language(self):
        """Request the language page so subsequent pages are served in English.

        Returns True on success, False (after a warning) on network errors.
        """
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return False
        return True

    def _login(self):
        """Log into YouTube with the configured credentials.

        Returns True if login succeeded, False otherwise.  Raises
        ExtractorError when _LOGIN_REQUIRED is set but no credentials
        are available.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return False

        # Hidden anti-CSRF tokens that must be echoed back in the form.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)
        # Fix: a missing GALX/dsh token used to leave None in the form dict
        # and crash below with AttributeError on None.encode('utf-8'); fail
        # gracefully like the other login error paths instead.
        if galx is None or dsh is None:
            self._downloader.report_warning(u'unable to log in: could not find login form tokens')
            return False

        # Log in
        login_form_strs = {
                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is served again, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return False
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return False
        return True

    def _confirm_age(self):
        """POST the age-confirmation form; raises ExtractorError on failure."""
        age_form = {
                'next_url': '/',
                'action_confirm': 'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
        return True

    def _real_initialize(self):
        """Prepare the session: set language, log in, confirm age."""
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
139
140
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    # Verbose regex (matched with re.VERBOSE) covering every known watch-URL
    # shape; the 11-character video ID is captured by the last-but-one group,
    # and since everything before it is optional a naked ID also matches.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)?    # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/                                          # just youtu.be/xxxx
                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Extracts the next_url query parameter (seen on verify_age redirect URLs).
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
                          # Apple HTTP Live Streaming
                          '96', '95', '94', '93', '92', '132', '151',
                          # 3D
                          '85', '84', '102', '83', '101', '82', '100',
                          # Dash video
                          '138', '137', '248', '136', '247', '135', '246',
                          '245', '244', '134', '243', '133', '242', '160',
                          # Dash audio
                          '141', '172', '140', '171', '139',
                          ]
    # Same itags, but re-ordered so that free (WebM) formats rank above
    # proprietary ones of comparable quality (compare e.g. '46' vs '37').
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
                                      # Apple HTTP Live Streaming
                                      '96', '95', '94', '93', '92', '132', '151',
                                      # 3D
                                      '85', '102', '84', '101', '83', '100', '82',
                                      # Dash video
                                      '138', '248', '137', '247', '136', '246', '245',
                                      '244', '135', '243', '134', '242', '133', '160',
                                      # Dash audio
                                      '172', '141', '171', '140', '139',
                                      ]
    # Container -> itags using that container, best first.
    _video_formats_map = {
        'flv': ['35', '34', '6', '5'],
        '3gp': ['36', '17', '13'],
        'mp4': ['38', '37', '22', '18'],
        'webm': ['46', '45', '44', '43'],
    }
    # itag -> file extension / container.
    _video_extensions = {
        '13': '3gp',
        '17': '3gp',
        '18': 'mp4',
        '22': 'mp4',
        '36': '3gp',
        '37': 'mp4',
        '38': 'mp4',
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',

        # 3d videos
        '82': 'mp4',
        '83': 'mp4',
        '84': 'mp4',
        '85': 'mp4',
        '100': 'webm',
        '101': 'webm',
        '102': 'webm',

        # Apple HTTP Live Streaming
        '92': 'mp4',
        '93': 'mp4',
        '94': 'mp4',
        '95': 'mp4',
        '96': 'mp4',
        '132': 'mp4',
        '151': 'mp4',

        # Dash mp4
        '133': 'mp4',
        '134': 'mp4',
        '135': 'mp4',
        '136': 'mp4',
        '137': 'mp4',
        '138': 'mp4',
        '139': 'mp4',
        '140': 'mp4',
        '141': 'mp4',
        '160': 'mp4',

        # Dash webm
        '171': 'webm',
        '172': 'webm',
        '242': 'webm',
        '243': 'webm',
        '244': 'webm',
        '245': 'webm',
        '246': 'webm',
        '247': 'webm',
        '248': 'webm',
    }
    # itag -> display string: video dimensions ('HxW' or 'Np'), or the audio
    # bitrate for the audio-only DASH itags (139/140/141/171/172).
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '36': '240x320',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
        '82': '360p',
        '83': '480p',
        '84': '720p',
        '85': '1080p',
        '92': '240p',
        '93': '360p',
        '94': '480p',
        '95': '720p',
        '96': '1080p',
        '100': '360p',
        '101': '480p',
        '102': '720p',
        '132': '240p',
        '151': '72p',
        '133': '240p',
        '134': '360p',
        '135': '480p',
        '136': '720p',
        '137': '1080p',
        '138': '>1080p',
        '139': '48k',
        '140': '128k',
        '141': '256k',
        '160': '192p',
        '171': '128k',
        '172': '256k',
        '242': '240p',
        '243': '360p',
        '244': '480p',
        '245': '480p',
        '246': '480p',
        '247': '720p',
        '248': '1080p',
    }
    # itag -> extra annotation shown for 3D and DASH-only formats.
    _special_itags = {
        '82': '3D',
        '83': '3D',
        '84': '3D',
        '85': '3D',
        '100': '3D',
        '101': '3D',
        '102': '3D',
        '133': 'DASH Video',
        '134': 'DASH Video',
        '135': 'DASH Video',
        '136': 'DASH Video',
        '137': 'DASH Video',
        '138': 'DASH Video',
        '139': 'DASH Audio',
        '140': 'DASH Audio',
        '141': 'DASH Audio',
        '160': 'DASH Video',
        '171': 'DASH Audio',
        '172': 'DASH Audio',
        '242': 'DASH Video',
        '243': 'DASH Video',
        '244': 'DASH Video',
        '245': 'DASH Video',
        '246': 'DASH Video',
        '247': 'DASH Video',
        '248': 'DASH Video',
    }
327
    IE_NAME = u'youtube'
    # Test cases: each entry pairs a URL with the file name and metadata the
    # extractor is expected to produce for it.
    _TESTS = [
        {
            u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
            u"file": u"BaW_jenozKc.mp4",
            u"info_dict": {
                u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
                u"uploader": u"Philipp Hagemeister",
                u"uploader_id": u"phihag",
                u"upload_date": u"20121002",
                u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
            u"file": u"1ltcDfZMA3U.flv",
            u"note": u"Test VEVO video (#897)",
            u"info_dict": {
                u"upload_date": u"20070518",
                u"title": u"Maps - It Will Find You",
                u"description": u"Music video by Maps performing It Will Find You.",
                u"uploader": u"MuteUSA",
                u"uploader_id": u"MuteUSA"
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
            u"file": u"UxxajLWwzqY.mp4",
            u"note": u"Test generic use_cipher_signature video (#897)",
            u"info_dict": {
                u"upload_date": u"20120506",
                u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
                u"description": u"md5:3e2666e0a55044490499ea45fe9037b7",
                u"uploader": u"Icona Pop",
                u"uploader_id": u"IconaPop"
            }
        },
        {
            u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
            u"file": u"07FYdnEawAQ.mp4",
            u"note": u"Test VEVO video with age protection (#956)",
            u"info_dict": {
                u"upload_date": u"20130703",
                u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
                u"description": u"md5:64249768eec3bc4276236606ea996373",
                u"uploader": u"justintimberlakeVEVO",
                u"uploader_id": u"justintimberlakeVEVO"
            }
        },
        {
            u'url': u'https://www.youtube.com/watch?v=TGi3HqYrWHE',
            u'file': u'TGi3HqYrWHE.mp4',
            u'note': u'm3u8 video',
            u'info_dict': {
                u'title': u'Triathlon - Men - London 2012 Olympic Games',
                u'description': u'- Men - TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games',
                u'uploader': u'olympic',
                u'upload_date': u'20120807',
                u'uploader_id': u'olympic',
            },
            u'params': {
                u'skip_download': True,
            },
        },
    ]
393
394
395 @classmethod
396 def suitable(cls, url):
397 """Receives a URL and returns True if suitable for this IE."""
398 if YoutubePlaylistIE.suitable(url): return False
399 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
400
    def __init__(self, *args, **kwargs):
        super(YoutubeIE, self).__init__(*args, **kwargs)
        # Maps a player URL to the signature-decryption function extracted
        # from it, so each player is downloaded and parsed at most once.
        self._player_cache = {}
404
    def report_video_webpage_download(self, video_id):
        """Report attempt to download the video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)
408
    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download the video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)
412
    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)
416
    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))
420
    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')
424
425 def _extract_signature_function(self, video_id, player_url):
426 id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9]+)\.(?P<ext>[a-z]+)$',
427 player_url)
428 player_type = id_m.group('ext')
429 player_id = id_m.group('id')
430
431 # TODO read from filesystem cache
432
433 if player_type == 'js':
434 code = self._download_webpage(
435 player_url, video_id,
436 note=u'Downloading %s player %s' % (player_type, player_id),
437 errnote=u'Download of %s failed' % player_url)
438 res = self._parse_sig_js(code)
439 elif player_tpye == 'swf':
440 urlh = self._request_webpage(
441 player_url, video_id,
442 note=u'Downloading %s player %s' % (player_type, player_id),
443 errnote=u'Download of %s failed' % player_url)
444 code = urlh.read()
445 res = self._parse_sig_swf(code)
446 else:
447 assert False, 'Invalid player type %r' % player_type
448
449 # TODO write cache
450
451 return res
452
453 def _parse_sig_js(self, jscode):
454 funcname = self._search_regex(
455 r'signature=([a-zA-Z]+)', jscode,
456 u'Initial JS player signature function name')
457
458 functions = {}
459
460 def argidx(varname):
461 return string.lowercase.index(varname)
462
463 def interpret_statement(stmt, local_vars, allow_recursion=20):
464 if allow_recursion < 0:
465 raise ExctractorError(u'Recursion limit reached')
466
467 if stmt.startswith(u'var '):
468 stmt = stmt[len(u'var '):]
469 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
470 r'=(?P<expr>.*)$', stmt)
471 if ass_m:
472 if ass_m.groupdict().get('index'):
473 def assign(val):
474 lvar = local_vars[ass_m.group('out')]
475 idx = interpret_expression(ass_m.group('index'),
476 local_vars, allow_recursion)
477 assert isinstance(idx, int)
478 lvar[idx] = val
479 return val
480 expr = ass_m.group('expr')
481 else:
482 def assign(val):
483 local_vars[ass_m.group('out')] = val
484 return val
485 expr = ass_m.group('expr')
486 elif stmt.startswith(u'return '):
487 assign = lambda v: v
488 expr = stmt[len(u'return '):]
489 else:
490 raise ExtractorError(
491 u'Cannot determine left side of statement in %r' % stmt)
492
493 v = interpret_expression(expr, local_vars, allow_recursion)
494 return assign(v)
495
496 def interpret_expression(expr, local_vars, allow_recursion):
497 if expr.isdigit():
498 return int(expr)
499
500 if expr.isalpha():
501 return local_vars[expr]
502
503 m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
504 if m:
505 member = m.group('member')
506 val = local_vars[m.group('in')]
507 if member == 'split("")':
508 return list(val)
509 if member == 'join("")':
510 return u''.join(val)
511 if member == 'length':
512 return len(val)
513 if member == 'reverse()':
514 return val[::-1]
515 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
516 if slice_m:
517 idx = interpret_expression(
518 slice_m.group('idx'), local_vars, allow_recursion-1)
519 return val[idx:]
520
521 m = re.match(
522 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
523 if m:
524 val = local_vars[m.group('in')]
525 idx = interpret_expression(m.group('idx'), local_vars,
526 allow_recursion-1)
527 return val[idx]
528
529 m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
530 if m:
531 a = interpret_expression(m.group('a'),
532 local_vars, allow_recursion)
533 b = interpret_expression(m.group('b'),
534 local_vars, allow_recursion)
535 return a % b
536
537 m = re.match(
538 r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
539 if m:
540 fname = m.group('func')
541 if fname not in functions:
542 functions[fname] = extract_function(fname)
543 argvals = [int(v) if v.isdigit() else local_vars[v]
544 for v in m.group('args').split(',')]
545 return functions[fname](argvals)
546 raise ExtractorError(u'Unsupported JS expression %r' % expr)
547
548 def extract_function(funcname):
549 func_m = re.search(
550 r'function ' + re.escape(funcname) +
551 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
552 jscode)
553 argnames = func_m.group('args').split(',')
554
555 def resf(args):
556 local_vars = dict(zip(argnames, args))
557 for stmt in func_m.group('code').split(';'):
558 res = interpret_statement(stmt, local_vars)
559 return res
560 return resf
561
562 initial_function = extract_function(funcname)
563 return lambda s: initial_function([s])
564
    def _parse_sig_swf(self, file_contents):
        """Extract the signature-decryption routine from a Flash player.

        Decompresses the SWF container, locates the DoABC tag (code 82)
        and interprets just enough AVM2 bytecode of the
        ``SignatureDecipher`` class to run its ``decipher`` method in
        Python.  Returns a function mapping an encrypted signature string
        to the decrypted one.  Raises ExtractorError/NotImplementedError
        on unrecognized input.
        """
        # 'CWS' = zlib-compressed SWF; only byte 0 differs between the
        # compressed and uncompressed variants.
        if file_contents[1:3] != b'WS':
            raise ExtractorError(
                u'Not an SWF file; header is %r' % file_contents[:3])
        if file_contents[:1] == b'C':
            content = zlib.decompress(file_contents[8:])
        else:
            raise NotImplementedError(u'Unsupported compression format %r' %
                                      file_contents[:1])

        def extract_tags(content):
            # Yield (tag_code, tag_body) for each SWF tag; a 6-bit length of
            # 0x3f signals an additional 32-bit extended length field.
            pos = 0
            while pos < len(content):
                header16 = struct.unpack('<H', content[pos:pos+2])[0]
                pos += 2
                tag_code = header16 >> 6
                tag_len = header16 & 0x3f
                if tag_len == 0x3f:
                    tag_len = struct.unpack('<I', content[pos:pos+4])[0]
                    pos += 4
                assert pos+tag_len <= len(content)
                yield (tag_code, content[pos:pos+tag_len])
                pos += tag_len

        # Tag code 82 is DoABC (embedded ActionScript 3 bytecode).
        code_tag = next(tag
                        for tag_code, tag in extract_tags(content)
                        if tag_code == 82)
        # Skip the DoABC flags (4 bytes) and the NUL-terminated name.
        p = code_tag.index(b'\0', 4) + 1
        code_reader = io.BytesIO(code_tag[p:])

        # Parse ABC (AVM2 ByteCode)
        def read_int(reader=None):
            # Variable-length (LEB128-style) integer: up to 5 bytes of
            # 7 data bits each, high bit = continuation.
            if reader is None:
                reader = code_reader
            res = 0
            shift = 0
            for _ in range(5):
                buf = reader.read(1)
                assert len(buf) == 1
                b = struct.unpack('<B', buf)[0]
                res = res | ((b & 0x7f) << shift)
                if b & 0x80 == 0:
                    break
                shift += 7
            return res

        def u30(reader=None):
            # Unsigned 30-bit value (top 2 bits must be clear).
            res = read_int(reader)
            assert res & 0xf0000000 == 0
            return res
        u32 = read_int

        def s32(reader=None):
            # Signed 32-bit value, sign-extended from the unsigned read.
            v = read_int(reader)
            if v & 0x80000000 != 0:
                v = - ((v ^ 0xffffffff) + 1)
            return v

        def string(reader=None):
            # Length-prefixed UTF-8 string.
            if reader is None:
                reader = code_reader
            slen = u30(reader)
            resb = reader.read(slen)
            assert len(resb) == slen
            return resb.decode('utf-8')

        def read_bytes(count, reader=None):
            if reader is None:
                reader = code_reader
            resb = reader.read(count)
            assert len(resb) == count
            return resb

        def read_byte(reader=None):
            resb = read_bytes(1, reader=reader)
            res = struct.unpack('<B', resb)[0]
            return res

        # minor_version + major_version
        _ = read_bytes(2 + 2)

        # Constant pool: counts include a phantom entry 0, hence range(1, n).
        int_count = u30()
        for _c in range(1, int_count):
            _ = s32()
        uint_count = u30()
        for _c in range(1, uint_count):
            _ = u32()
        double_count = u30()
        _ = read_bytes((double_count-1) * 8)
        string_count = u30()
        constant_strings = [u'']
        for _c in range(1, string_count):
            s = string()
            constant_strings.append(s)
        namespace_count = u30()
        for _c in range(1, namespace_count):
            _ = read_bytes(1)  # kind
            _ = u30()  # name
        ns_set_count = u30()
        for _c in range(1, ns_set_count):
            count = u30()
            for _c2 in range(count):
                _ = u30()
        multiname_count = u30()
        # Number of trailing u30 fields per multiname kind (only QName,
        # kind 0x07, is actually resolved to a string below).
        MULTINAME_SIZES = {
            0x07: 2,  # QName
            0x0d: 2,  # QNameA
            0x0f: 1,  # RTQName
            0x10: 1,  # RTQNameA
            0x11: 0,  # RTQNameL
            0x12: 0,  # RTQNameLA
            0x09: 2,  # Multiname
            0x0e: 2,  # MultinameA
            0x1b: 1,  # MultinameL
            0x1c: 1,  # MultinameLA
        }
        multinames = [u'']
        for _c in range(1, multiname_count):
            kind = u30()
            assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
            if kind == 0x07:
                namespace_idx = u30()
                name_idx = u30()
                multinames.append(constant_strings[name_idx])
            else:
                multinames.append('[MULTINAME kind: %d]' % kind)
                for _c2 in range(MULTINAME_SIZES[kind]):
                    _ = u30()

        # Methods
        method_count = u30()
        MethodInfo = collections.namedtuple(
            'MethodInfo',
            ['NEED_ARGUMENTS', 'NEED_REST'])
        method_infos = []
        for method_id in range(method_count):
            param_count = u30()
            _ = u30()  # return type
            for _ in range(param_count):
                _ = u30()  # param type
            _ = u30()  # name index (always 0 for youtube)
            flags = read_byte()
            if flags & 0x08 != 0:
                # Options present
                option_count = u30()
                for c in range(option_count):
                    _ = u30()  # val
                    _ = read_bytes(1)  # kind
            if flags & 0x80 != 0:
                # Param names present
                for _ in range(param_count):
                    _ = u30()  # param name
            mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
            method_infos.append(mi)

        # Metadata
        metadata_count = u30()
        for _c in range(metadata_count):
            _ = u30()  # name
            item_count = u30()
            for _c2 in range(item_count):
                _ = u30()  # key
                _ = u30()  # value

        def parse_traits_info():
            # Consume one trait entry; return {trait name: method index}
            # for method-like traits (empty dict otherwise).
            trait_name_idx = u30()
            kind_full = read_byte()
            kind = kind_full & 0x0f
            attrs = kind_full >> 4
            methods = {}
            if kind in [0x00, 0x06]:  # Slot or Const
                _ = u30()  # Slot id
                type_name_idx = u30()
                vindex = u30()
                if vindex != 0:
                    _ = read_byte()  # vkind
            elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
                _ = u30()  # disp_id
                method_idx = u30()
                methods[multinames[trait_name_idx]] = method_idx
            elif kind == 0x04:  # Class
                _ = u30()  # slot_id
                _ = u30()  # classi
            elif kind == 0x05:  # Function
                _ = u30()  # slot_id
                function_idx = u30()
                methods[function_idx] = multinames[trait_name_idx]
            else:
                raise ExtractorError(u'Unsupported trait kind %d' % kind)

            if attrs & 0x4 != 0:  # Metadata present
                metadata_count = u30()
                for _c3 in range(metadata_count):
                    _ = u30()

            return methods

        # Classes: first pass over instance_info to find the target class id.
        TARGET_CLASSNAME = u'SignatureDecipher'
        searched_idx = multinames.index(TARGET_CLASSNAME)
        searched_class_id = None
        class_count = u30()
        for class_id in range(class_count):
            name_idx = u30()
            if name_idx == searched_idx:
                # We found the class we're looking for!
                searched_class_id = class_id
            _ = u30()  # super_name idx
            flags = read_byte()
            if flags & 0x08 != 0:  # Protected namespace is present
                protected_ns_idx = u30()
            intrf_count = u30()
            for _c2 in range(intrf_count):
                _ = u30()
            _ = u30()  # iinit
            trait_count = u30()
            for _c2 in range(trait_count):
                _ = parse_traits_info()

        if searched_class_id is None:
            raise ExtractorError(u'Target class %r not found' %
                                 TARGET_CLASSNAME)

        # Second pass (class_info): collect name<->index maps for the
        # methods of the target class only.
        method_names = {}
        method_idxs = {}
        for class_id in range(class_count):
            _ = u30()  # cinit
            trait_count = u30()
            for _c2 in range(trait_count):
                trait_methods = parse_traits_info()
                if class_id == searched_class_id:
                    method_names.update(trait_methods.items())
                    method_idxs.update(dict(
                        (idx, name)
                        for name, idx in trait_methods.items()))

        # Scripts
        script_count = u30()
        for _c in range(script_count):
            _ = u30()  # init
            trait_count = u30()
            for _c2 in range(trait_count):
                _ = parse_traits_info()

        # Method bodies: keep only the bytecode of the methods we mapped.
        method_body_count = u30()
        Method = collections.namedtuple('Method', ['code', 'local_count'])
        methods = {}
        for _c in range(method_body_count):
            method_idx = u30()
            max_stack = u30()
            local_count = u30()
            init_scope_depth = u30()
            max_scope_depth = u30()
            code_length = u30()
            code = read_bytes(code_length)
            if method_idx in method_idxs:
                m = Method(code, local_count)
                methods[method_idxs[method_idx]] = m
            exception_count = u30()
            for _c2 in range(exception_count):
                _ = u30()  # from
                _ = u30()  # to
                _ = u30()  # target
                _ = u30()  # exc_type
                _ = u30()  # var_name
            trait_count = u30()
            for _c2 in range(trait_count):
                _ = parse_traits_info()

        # Sanity: the whole DoABC body must have been consumed, and every
        # mapped method must have a body.
        assert p + code_reader.tell() == len(code_tag)
        assert len(methods) == len(method_idxs)

        method_pyfunctions = {}

        def extract_function(func_name):
            # Lazily compile the named ABC method into a Python callable by
            # interpreting the small subset of AVM2 opcodes it uses.
            if func_name in method_pyfunctions:
                return method_pyfunctions[func_name]
            if func_name not in methods:
                raise ExtractorError(u'Cannot find function %r' % func_name)
            m = methods[func_name]

            def resfunc(args):
                registers = ['(this)'] + list(args) + [None] * m.local_count
                stack = []
                coder = io.BytesIO(m.code)
                while True:
                    opcode = struct.unpack('!B', coder.read(1))[0]
                    if opcode == 36:  # pushbyte
                        v = struct.unpack('!B', coder.read(1))[0]
                        stack.append(v)
                    elif opcode == 44:  # pushstring
                        idx = u30(coder)
                        stack.append(constant_strings[idx])
                    elif opcode == 48:  # pushscope
                        # We don't implement the scope register, so we'll just
                        # ignore the popped value
                        stack.pop()
                    elif opcode == 70:  # callproperty
                        index = u30(coder)
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        obj = stack.pop()
                        if mname == u'split':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, compat_str)
                            if args[0] == u'':
                                res = list(obj)
                            else:
                                res = obj.split(args[0])
                            stack.append(res)
                        elif mname == u'slice':
                            assert len(args) == 1
                            assert isinstance(args[0], int)
                            assert isinstance(obj, list)
                            res = obj[args[0]:]
                            stack.append(res)
                        elif mname == u'join':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, list)
                            res = args[0].join(obj)
                            stack.append(res)
                        elif mname in method_pyfunctions:
                            stack.append(method_pyfunctions[mname](args))
                        else:
                            raise NotImplementedError(
                                u'Unsupported property %r on %r'
                                % (mname, obj))
                    elif opcode == 72:  # returnvalue
                        res = stack.pop()
                        return res
                    elif opcode == 79:  # callpropvoid
                        index = u30(coder)
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        obj = stack.pop()
                        if mname == u'reverse':
                            assert isinstance(obj, list)
                            obj.reverse()
                        else:
                            raise NotImplementedError(
                                u'Unsupported (void) property %r on %r'
                                % (mname, obj))
                    elif opcode == 93:  # findpropstrict
                        index = u30(coder)
                        mname = multinames[index]
                        res = extract_function(mname)
                        stack.append(res)
                    elif opcode == 97:  # setproperty
                        index = u30(coder)
                        value = stack.pop()
                        idx = stack.pop()
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        assert isinstance(idx, int)
                        obj[idx] = value
                    elif opcode == 98:  # getlocal
                        index = u30(coder)
                        stack.append(registers[index])
                    elif opcode == 99:  # setlocal
                        index = u30(coder)
                        value = stack.pop()
                        registers[index] = value
                    elif opcode == 102:  # getproperty
                        index = u30(coder)
                        pname = multinames[index]
                        if pname == u'length':
                            obj = stack.pop()
                            assert isinstance(obj, list)
                            stack.append(len(obj))
                        else:  # Assume attribute access
                            idx = stack.pop()
                            assert isinstance(idx, int)
                            obj = stack.pop()
                            assert isinstance(obj, list)
                            stack.append(obj[idx])
                    elif opcode == 128:  # coerce
                        _ = u30(coder)
                    elif opcode == 133:  # coerce_s
                        assert isinstance(stack[-1], (type(None), compat_str))
                    elif opcode == 164:  # modulo
                        value2 = stack.pop()
                        value1 = stack.pop()
                        res = value1 % value2
                        stack.append(res)
                    elif opcode == 208:  # getlocal_0
                        stack.append(registers[0])
                    elif opcode == 209:  # getlocal_1
                        stack.append(registers[1])
                    elif opcode == 210:  # getlocal_2
                        stack.append(registers[2])
                    elif opcode == 211:  # getlocal_3
                        stack.append(registers[3])
                    elif opcode == 214:  # setlocal_2
                        registers[2] = stack.pop()
                    elif opcode == 215:  # setlocal_3
                        registers[3] = stack.pop()
                    else:
                        raise NotImplementedError(
                            u'Unsupported opcode %d' % opcode)

            method_pyfunctions[func_name] = resfunc
            return resfunc

        initial_function = extract_function(u'decipher')
        return lambda s: initial_function([s])
978
979 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
980 """Turn the encrypted s field into a working signature"""
981
982 if player_url is not None:
983 try:
984 if player_url not in self._player_cache:
985 func = self._extract_signature_function(
986 video_id, player_url
987 )
988 self._player_cache[player_url] = func
989 return self._player_cache[player_url](s)
990 except Exception as e:
991 tb = traceback.format_exc()
992 self._downloader.report_warning(
993 u'Automatic signature extraction failed: ' + tb)
994
995 self._downloader.report_warning(
996 u'Warning: Falling back to static signature algorithm')
997 return self._static_decrypt_signature(
998 s, video_id, player_url, age_gate)
999
1000 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
1001 if age_gate:
1002 # The videos with age protection use another player, so the
1003 # algorithms can be different.
1004 if len(s) == 86:
1005 return s[2:63] + s[82] + s[64:82] + s[63]
1006
1007 if len(s) == 92:
1008 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
1009 elif len(s) == 90:
1010 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
1011 elif len(s) == 89:
1012 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
1013 elif len(s) == 88:
1014 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
1015 elif len(s) == 87:
1016 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
1017 elif len(s) == 86:
1018 return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53]
1019 elif len(s) == 85:
1020 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
1021 elif len(s) == 84:
1022 return s[81:36:-1] + s[0] + s[35:2:-1]
1023 elif len(s) == 83:
1024 return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]
1025 elif len(s) == 82:
1026 return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54]
1027 elif len(s) == 81:
1028 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1029 elif len(s) == 80:
1030 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
1031 elif len(s) == 79:
1032 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1033
1034 else:
1035 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
1036
1037 def _decrypt_signature_age_gate(self, s):
1038 # The videos with age protection use another player, so the algorithms
1039 # can be different.
1040 if len(s) == 86:
1041 return s[2:63] + s[82] + s[64:82] + s[63]
1042 else:
1043 # Fallback to the other algortihms
1044 return self._decrypt_signature(s)
1045
1046 def _get_available_subtitles(self, video_id):
1047 try:
1048 sub_list = self._download_webpage(
1049 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1050 video_id, note=False)
1051 except ExtractorError as err:
1052 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
1053 return {}
1054 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
1055
1056 sub_lang_list = {}
1057 for l in lang_list:
1058 lang = l[1]
1059 params = compat_urllib_parse.urlencode({
1060 'lang': lang,
1061 'v': video_id,
1062 'fmt': self._downloader.params.get('subtitlesformat'),
1063 })
1064 url = u'http://www.youtube.com/api/timedtext?' + params
1065 sub_lang_list[lang] = url
1066 if not sub_lang_list:
1067 self._downloader.report_warning(u'video doesn\'t have subtitles')
1068 return {}
1069 return sub_lang_list
1070
1071 def _get_available_automatic_caption(self, video_id, webpage):
1072 """We need the webpage for getting the captions url, pass it as an
1073 argument to speed up the process."""
1074 sub_format = self._downloader.params.get('subtitlesformat')
1075 self.to_screen(u'%s: Looking for automatic captions' % video_id)
1076 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
1077 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
1078 if mobj is None:
1079 self._downloader.report_warning(err_msg)
1080 return {}
1081 player_config = json.loads(mobj.group(1))
1082 try:
1083 args = player_config[u'args']
1084 caption_url = args[u'ttsurl']
1085 timestamp = args[u'timestamp']
1086 # We get the available subtitles
1087 list_params = compat_urllib_parse.urlencode({
1088 'type': 'list',
1089 'tlangs': 1,
1090 'asrs': 1,
1091 })
1092 list_url = caption_url + '&' + list_params
1093 list_page = self._download_webpage(list_url, video_id)
1094 caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
1095 original_lang_node = caption_list.find('track')
1096 if original_lang_node.attrib.get('kind') != 'asr' :
1097 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
1098 return {}
1099 original_lang = original_lang_node.attrib['lang_code']
1100
1101 sub_lang_list = {}
1102 for lang_node in caption_list.findall('target'):
1103 sub_lang = lang_node.attrib['lang_code']
1104 params = compat_urllib_parse.urlencode({
1105 'lang': original_lang,
1106 'tlang': sub_lang,
1107 'fmt': sub_format,
1108 'ts': timestamp,
1109 'kind': 'asr',
1110 })
1111 sub_lang_list[sub_lang] = caption_url + '&' + params
1112 return sub_lang_list
1113 # An extractor error can be raise by the download process if there are
1114 # no automatic captions but there are subtitles
1115 except (KeyError, ExtractorError):
1116 self._downloader.report_warning(err_msg)
1117 return {}
1118
1119 def _print_formats(self, formats):
1120 print('Available formats:')
1121 for x in formats:
1122 print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
1123 self._video_dimensions.get(x, '???'),
1124 ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
1125
1126 def _extract_id(self, url):
1127 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1128 if mobj is None:
1129 raise ExtractorError(u'Invalid URL: %s' % url)
1130 video_id = mobj.group(2)
1131 return video_id
1132
    def _get_video_url_list(self, url_map):
        """
        Transform a dictionary in the format {itag:url} to a list of (itag, url)
        with the requested formats.

        Returns None (after printing) when the user only asked to list the
        available formats; raises ExtractorError when nothing matches.
        """
        # 'format' is the user-requested format string, e.g. '22', 'mp4' or '18/22'.
        req_format = self._downloader.params.get('format', None)
        format_limit = self._downloader.params.get('format_limit', None)
        available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
        # Restrict to the formats at or after format_limit in the preference list.
        if format_limit is not None and format_limit in available_formats:
            format_list = available_formats[available_formats.index(format_limit):]
        else:
            format_list = available_formats
        # Keep only the formats the server actually offered, in preference order.
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            raise ExtractorError(u'no known formats available for video')
        if self._downloader.params.get('listformats', None):
            self._print_formats(existing_formats)
            return
        if req_format is None or req_format == 'best':
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
        elif req_format in ('-1', 'all'):
            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
        else:
            # Specific formats. We pick the first in a slash-delimeted sequence.
            # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
            # available in the specified format. For example,
            # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
            # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
            # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
            req_formats = req_format.split('/')
            video_url_list = None
            for rf in req_formats:
                if rf in url_map:
                    video_url_list = [(rf, url_map[rf])]
                    break
                if rf in self._video_formats_map:
                    # rf names a container (e.g. 'mp4'): scan its itags and
                    # take the first one that is actually offered.
                    for srf in self._video_formats_map[rf]:
                        if srf in url_map:
                            video_url_list = [(srf, url_map[srf])]
                            break
                    else:
                        # No itag of this container is available: try the next
                        # entry of the slash-delimited request.
                        continue
                    # The inner loop broke, i.e. a match was found: stop.
                    break
            if video_url_list is None:
                raise ExtractorError(u'requested format not available')
        return video_url_list
1181
1182 def _extract_from_m3u8(self, manifest_url, video_id):
1183 url_map = {}
1184 def _get_urls(_manifest):
1185 lines = _manifest.split('\n')
1186 urls = filter(lambda l: l and not l.startswith('#'),
1187 lines)
1188 return urls
1189 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1190 formats_urls = _get_urls(manifest)
1191 for format_url in formats_urls:
1192 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1193 url_map[itag] = format_url
1194 return url_map
1195
    def _real_extract(self, url):
        """Extract metadata and format URLs for a single YouTube video.

        Returns a list of info dictionaries, one per selected format.
        """
        # Catch URLs that were truncated by an unquoted '&' in the shell.
        if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
            self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')

        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Unescape the JSON-escaped slashes in the player URL.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        self.report_video_info_webpage_download(video_id)
        if re.search(r'player-age-gate-content">', video_webpage) is not None:
            self.report_age_confirmation()
            age_gate = True
            # We simulate the access to the video from www.youtube.com/v/{video_id}
            # this can be viewed without login into Youtube
            # NOTE(review): 'sts' looks like a hard-coded player timestamp for
            # the embedded player — confirm it is still the expected value.
            data = compat_urllib_parse.urlencode({'video_id': video_id,
                                                  'el': 'embedded',
                                                  'gl': 'US',
                                                  'hl': 'en',
                                                  'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                                                  'asv': 3,
                                                  'sts':'1588',
                                                  })
            video_info_url = 'https://www.youtube.com/get_video_info?' + data
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                    note=False,
                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
        else:
            age_gate = False
            # Try several 'el' values in turn; we stop at the first response
            # that carries a 'token'.
            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                        % (video_id, el_type))
                video_info_webpage = self._download_webpage(video_info_url, video_id,
                        note=False,
                        errnote='unable to download video info webpage')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
            else:
                raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            raise ExtractorError(u'Unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        # We try first to get a high quality image:
        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                            video_webpage, re.DOTALL)
        if m_thumb is not None:
            video_thumbnail = m_thumb.group(1)
        elif 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else: # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalize the date separators to spaces before parsing.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            # Fall back to the <meta name="description"> tag.
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, video_webpage)

        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, video_webpage)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # Decide which formats to download

        try:
            mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
            if not mobj:
                raise ValueError('Could not find vevo ID')
            info = json.loads(mobj.group(1))
            args = info['args']
            # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
            # this signatures are encrypted
            m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
            if m_s is not None:
                self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
                video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
            m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
            if m_s is not None:
                if 'url_encoded_fmt_stream_map' in video_info:
                    video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
                else:
                    video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
            elif 'adaptive_fmts' in video_info:
                if 'url_encoded_fmt_stream_map' in video_info:
                    video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
                else:
                    video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']
        except ValueError:
            # No usable ytplayer.config on the page: keep whatever data
            # get_video_info already put into video_info.
            pass

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
                raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
            url_map = {}
            for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    url = url_data['url'][0]
                    if 'sig' in url_data:
                        # Plain, already-usable signature.
                        url += '&signature=' + url_data['sig'][0]
                    elif 's' in url_data:
                        encrypted_sig = url_data['s'][0]
                        if self._downloader.params.get('verbose'):
                            if age_gate:
                                # NOTE(review): when player_url is missing this
                                # passes None to _search_regex — looks like it
                                # would crash; confirm intended behavior.
                                player_version = self._search_regex(
                                    r'-(.+)\.swf$',
                                    player_url if player_url else None,
                                    'flash player', fatal=False)
                                player_desc = 'flash player %s' % player_version
                            else:
                                player_version = self._search_regex(
                                    r'html5player-(.+?)\.js', video_webpage,
                                    'html5 player', fatal=False)
                                player_desc = u'html5 player %s' % player_version

                            parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                            self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                                           (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                        if not age_gate:
                            jsplayer_url_json = self._search_regex(
                                r'"assets":.+?"js":\s*("[^"]+")',
                                video_webpage, u'JS player URL')
                            player_url = json.loads(jsplayer_url_json)

                        signature = self._decrypt_signature(
                            encrypted_sig, video_id, player_url, age_gate)
                        url += '&signature=' + signature
                    if 'ratebypass' not in url:
                        url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url
            video_url_list = self._get_video_url_list(url_map)
            if not video_url_list:
                return
        elif video_info.get('hlsvp'):
            # Formats are listed in an m3u8 manifest instead.
            manifest_url = video_info['hlsvp'][0]
            url_map = self._extract_from_m3u8(manifest_url, video_id)
            video_url_list = self._get_video_url_list(url_map)
            if not video_url_list:
                return

        else:
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
                                                 self._video_dimensions.get(format_param, '???'),
                                                 ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')

            results.append({
                'id': video_id,
                'url': video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
            })
        return results
1439
class YoutubePlaylistIE(InfoExtractor):
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                           |  p/
                        )
                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Return True when *url* matches the (verbose) playlist pattern."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Page through the GData playlist API and return a playlist result."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The id is captured by one of the two alternatives of the pattern.
        playlist_id = mobj.group(1) or mobj.group(2)
        indexed_videos = []

        page_num = 0
        while True:
            page_num += 1
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            if start_index >= 1000:
                self._downloader.report_warning(u'Max number of results reached')
                break
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            for entry in response['feed']['entry']:
                position = entry['yt$position']['$t']
                if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
                    indexed_videos.append((
                        position,
                        'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t'],
                    ))

        # Restore the playlist order, then drop the position keys.
        ordered_urls = [pair[1] for pair in sorted(indexed_videos)]
        url_results = [self.url_result(vurl, 'Youtube') for vurl in ordered_urls]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1507
1508
class YoutubeChannelIE(InfoExtractor):
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from *page*, first-seen order, no duplicates."""
        seen_ids = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = match.group(1)
            if video_id not in seen_ids:
                seen_ids.append(video_id)
        return seen_ids

    def _real_extract(self, url):
        """Gather every video id of the channel and return them as a playlist."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        channel_id = mobj.group(1)

        # The first page is served as plain HTML.
        pagenum = 1
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        video_ids = self.extract_videos_from_page(page)

        # Subsequent pages come from the json-based channel_ajax endpoint.
        if self._MORE_PAGES_INDICATOR in page:
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)

                page = json.loads(page)

                video_ids.extend(self.extract_videos_from_page(page['content_html']))

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % vid for vid in video_ids]
        url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
        return [self.playlist_result(url_entries, channel_id)]
1563
1564
class YoutubeUserIE(InfoExtractor):
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        """Defer to any other *IE class in this module that also matches.

        Our pattern is very permissive, so every sibling extractor gets the
        first shot at the URL.
        """
        for name, klass in globals().items():
            if name.endswith('IE') and klass is not cls and klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Page through the GData API and return all uploads of a user."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # The Data API caps each response at _GDATA_PAGE_SIZE entries, so we
        # request consecutive windows until one comes back short.
        video_ids = []

        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            ids_in_page = [entry['id']['$t'].split('/')[-1] for entry in response['feed']['entry']]
            video_ids.extend(ids_in_page)

            # A page that is not completely full must be the last one; no
            # need to query again.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(watch_url, 'Youtube') for watch_url in urls]
        return [self.playlist_result(url_results, playlist_title=username)]
1629
class YoutubeSearchIE(SearchInfoExtractor):
    IE_DESC = u'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages through the GData search API (50 results per request) until
        either *n* ids are collected or the API has no more items; raises
        ExtractorError on download failure or empty results.
        """
        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50 * pagenum) + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            # Idiomatic membership test (was: `if not 'items' in ...`).
            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            video_ids.extend(video['id'] for video in api_response['items'])

            # The API may report fewer total results than we asked for, so
            # tighten the limit after every page.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube') for video_id in video_ids]
        return self.playlist_result(videos, query)
1671
1672
class YoutubeShowIE(InfoExtractor):
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        """Return one playlist result per season of the show."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
        season_urls = ['https://www.youtube.com' + season.group(1) for season in m_seasons]
        return [self.url_result(season_url, 'YoutubePlaylist') for season_url in season_urls]
1686
1687
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    _PAGING_STEP = 30
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template for one page of the feed; '%s' takes the paging offset."""
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        # Feeds are per-account, so a login is mandatory (_LOGIN_REQUIRED).
        self._login()

    def _real_extract(self, url):
        """Walk the paged feed_ajax endpoint and collect every linked video."""
        feed_entries = []
        # The step argument is available only in 2.7 or higher
        for page_index in itertools.count(0):
            paging = page_index * self._PAGING_STEP
            info = self._download_webpage(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % page_index)
            info = json.loads(info)
            feed_html = info['feed_html']
            matches = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            video_ids = orderedSet(m.group(1) for m in matches)
            feed_entries.extend(self.url_result(vid, 'Youtube') for vid in video_ids)
            if info['paging'] is None:
                break
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1729
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Extractor for the logged-in user's subscriptions feed."""
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    # Feed name interpolated into _FEED_TEMPLATE by the base class.
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
1735
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Extractor for the logged-in user's recommended-videos feed."""
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    # Feed name interpolated into _FEED_TEMPLATE by the base class.
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
1741
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Extractor for the logged-in user's "watch later" list."""
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    # Larger paging window than the base class default of 30.
    _PAGING_STEP = 100
    # Watch-later is account-specific: use the personal-feed ajax action.
    _PERSONAL_FEED = True
1749
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Resolve the favourites page to the playlist that backs it."""
        favourites_url = 'https://www.youtube.com/my_favorites'
        webpage = self._download_webpage(favourites_url, 'Youtube Favourites videos')
        playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, u'favourites playlist id')
        # Favourites are an ordinary playlist: hand the id to YoutubePlaylistIE.
        return self.url_result(playlist_id, 'YoutubePlaylist')