]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[youtube] Add filesystem signature cache
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
e0df6211
PH
3import collections
4import itertools
5import io
c5e8d7af 6import json
c4417ddb
PH
7import operator
8import os.path
c5e8d7af 9import re
c4417ddb 10import shutil
c5e8d7af 11import socket
e0df6211
PH
12import string
13import struct
14import traceback
15import zlib
c5e8d7af 16
b05654f0 17from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 18from .subtitles import SubtitlesInfoExtractor
c5e8d7af
PH
19from ..utils import (
20 compat_http_client,
21 compat_parse_qs,
22 compat_urllib_error,
23 compat_urllib_parse,
24 compat_urllib_request,
25 compat_str,
26
27 clean_html,
28 get_element_by_id,
29 ExtractorError,
30 unescapeHTML,
31 unified_strdate,
04cc9617 32 orderedSet,
c5e8d7af
PH
33)
34
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def _set_language(self):
        """Request _LANG_URL to set the site language.

        Returns True on success. On a network error a warning is reported
        and False is returned.
        """
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return False
        return True

    def _login(self):
        """Log in with the credentials from _get_login_info().

        Returns True when the login request succeeded and False when no
        credentials are available or any step failed (a warning is reported
        for failures).

        Raises ExtractorError when no credentials are available and
        _LOGIN_REQUIRED is set.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return False

        # Hidden form fields that have to be echoed back in the login request
        galx = None
        dsh = None
        match = re.search(r'<input.+?name="GALX".+?value="(.+?)"', login_page, re.DOTALL)
        if match:
            galx = match.group(1)
        match = re.search(r'<input.+?name="dsh".+?value="(.+?)"', login_page, re.DOTALL)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'dsh': dsh,
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode.
        # Fields whose value could not be determined (None) are left out
        # instead of crashing on None.encode().
        login_form = dict(
            (k.encode('utf-8'), v.encode('utf-8'))
            for k, v in login_form_strs.items()
            if v is not None)
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # Being served the login form again means the login was rejected
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return False
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return False
        return True

    def _confirm_age(self):
        """Submit the age confirmation form to _AGE_URL.

        Returns True on success; raises ExtractorError on a network error.
        """
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        # POST data has to be bytes on Python 3 (same handling as in _login)
        age_data = compat_urllib_parse.urlencode(age_form).encode('ascii')
        request = compat_urllib_request.Request(self._AGE_URL, age_data)
        try:
            self.report_age_confirmation()
            compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
        return True

    def _real_initialize(self):
        """Set the language, log in and confirm age, stopping at the first step that fails."""
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
c5e8d7af 141
8377574c 142
de7f3446 143class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
0f818663 144 IE_DESC = u'YouTube.com'
c5e8d7af
PH
145 _VALID_URL = r"""^
146 (
147 (?:https?://)? # http(s):// (optional)
f4b05232 148 (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
e69ae5b9
JMF
149 tube\.majestyc\.net/|
150 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
151 (?:.*?\#/)? # handle anchor (#/) redirect urls
152 (?: # the various things that can precede the ID:
153 (?:(?:v|embed|e)/) # v/ or embed/ or e/
154 |(?: # or the v= param in all its forms
d741e55a 155 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
156 (?:\?|\#!?) # the params delimiter ? or # or #!
157 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
158 v=
159 )
f4b05232
JMF
160 ))
161 |youtu\.be/ # just youtu.be/xxxx
162 )
c5e8d7af 163 )? # all until now is optional -> you can pass the naked ID
8963d9c2 164 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
c5e8d7af
PH
165 (?(1).+)? # if we found the ID, everything can follow
166 $"""
c5e8d7af 167 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
c5e8d7af 168 # Listed in order of quality
bdc6b3fc 169 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
96fb5605 170 # Apple HTTP Live Streaming
bdc6b3fc 171 '96', '95', '94', '93', '92', '132', '151',
939fbd26
JMF
172 # 3D
173 '85', '84', '102', '83', '101', '82', '100',
174 # Dash video
175 '138', '137', '248', '136', '247', '135', '246',
176 '245', '244', '134', '243', '133', '242', '160',
177 # Dash audio
178 '141', '172', '140', '171', '139',
1d043b93 179 ]
bdc6b3fc 180 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
96fb5605 181 # Apple HTTP Live Streaming
bdc6b3fc
AZ
182 '96', '95', '94', '93', '92', '132', '151',
183 # 3D
86fe61c8 184 '85', '102', '84', '101', '83', '100', '82',
939fbd26
JMF
185 # Dash video
186 '138', '248', '137', '247', '136', '246', '245',
187 '244', '135', '243', '134', '242', '133', '160',
188 # Dash audio
189 '172', '141', '171', '140', '139',
1d043b93 190 ]
bdc6b3fc
AZ
191 _video_formats_map = {
192 'flv': ['35', '34', '6', '5'],
193 '3gp': ['36', '17', '13'],
194 'mp4': ['38', '37', '22', '18'],
195 'webm': ['46', '45', '44', '43'],
196 }
c5e8d7af
PH
197 _video_extensions = {
198 '13': '3gp',
bdc6b3fc 199 '17': '3gp',
c5e8d7af
PH
200 '18': 'mp4',
201 '22': 'mp4',
bdc6b3fc 202 '36': '3gp',
c5e8d7af 203 '37': 'mp4',
d69cf69a 204 '38': 'mp4',
c5e8d7af
PH
205 '43': 'webm',
206 '44': 'webm',
207 '45': 'webm',
208 '46': 'webm',
1d043b93 209
86fe61c8
AZ
210 # 3d videos
211 '82': 'mp4',
212 '83': 'mp4',
213 '84': 'mp4',
214 '85': 'mp4',
215 '100': 'webm',
216 '101': 'webm',
217 '102': 'webm',
836a086c 218
96fb5605 219 # Apple HTTP Live Streaming
1d043b93
JMF
220 '92': 'mp4',
221 '93': 'mp4',
222 '94': 'mp4',
223 '95': 'mp4',
224 '96': 'mp4',
225 '132': 'mp4',
226 '151': 'mp4',
836a086c
AZ
227
228 # Dash mp4
229 '133': 'mp4',
230 '134': 'mp4',
231 '135': 'mp4',
232 '136': 'mp4',
233 '137': 'mp4',
234 '138': 'mp4',
235 '139': 'mp4',
236 '140': 'mp4',
237 '141': 'mp4',
238 '160': 'mp4',
239
240 # Dash webm
241 '171': 'webm',
242 '172': 'webm',
243 '242': 'webm',
244 '243': 'webm',
245 '244': 'webm',
246 '245': 'webm',
247 '246': 'webm',
248 '247': 'webm',
249 '248': 'webm',
c5e8d7af
PH
250 }
251 _video_dimensions = {
252 '5': '240x400',
253 '6': '???',
254 '13': '???',
255 '17': '144x176',
256 '18': '360x640',
257 '22': '720x1280',
258 '34': '360x640',
259 '35': '480x854',
bdc6b3fc 260 '36': '240x320',
c5e8d7af
PH
261 '37': '1080x1920',
262 '38': '3072x4096',
263 '43': '360x640',
264 '44': '480x854',
265 '45': '720x1280',
266 '46': '1080x1920',
86fe61c8
AZ
267 '82': '360p',
268 '83': '480p',
269 '84': '720p',
270 '85': '1080p',
1d043b93
JMF
271 '92': '240p',
272 '93': '360p',
273 '94': '480p',
274 '95': '720p',
275 '96': '1080p',
86fe61c8
AZ
276 '100': '360p',
277 '101': '480p',
836a086c 278 '102': '720p',
1d043b93
JMF
279 '132': '240p',
280 '151': '72p',
836a086c
AZ
281 '133': '240p',
282 '134': '360p',
283 '135': '480p',
284 '136': '720p',
285 '137': '1080p',
286 '138': '>1080p',
287 '139': '48k',
288 '140': '128k',
289 '141': '256k',
290 '160': '192p',
291 '171': '128k',
292 '172': '256k',
293 '242': '240p',
294 '243': '360p',
295 '244': '480p',
296 '245': '480p',
297 '246': '480p',
298 '247': '720p',
299 '248': '1080p',
c5e8d7af 300 }
836a086c
AZ
301 _special_itags = {
302 '82': '3D',
303 '83': '3D',
304 '84': '3D',
305 '85': '3D',
306 '100': '3D',
307 '101': '3D',
308 '102': '3D',
309 '133': 'DASH Video',
310 '134': 'DASH Video',
311 '135': 'DASH Video',
312 '136': 'DASH Video',
313 '137': 'DASH Video',
314 '138': 'DASH Video',
315 '139': 'DASH Audio',
316 '140': 'DASH Audio',
317 '141': 'DASH Audio',
318 '160': 'DASH Video',
319 '171': 'DASH Audio',
320 '172': 'DASH Audio',
321 '242': 'DASH Video',
322 '243': 'DASH Video',
323 '244': 'DASH Video',
324 '245': 'DASH Video',
325 '246': 'DASH Video',
326 '247': 'DASH Video',
327 '248': 'DASH Video',
c5e8d7af 328 }
836a086c 329
c5e8d7af 330 IE_NAME = u'youtube'
2eb88d95
PH
331 _TESTS = [
332 {
0e853ca4
PH
333 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
334 u"file": u"BaW_jenozKc.mp4",
335 u"info_dict": {
336 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
337 u"uploader": u"Philipp Hagemeister",
338 u"uploader_id": u"phihag",
339 u"upload_date": u"20121002",
340 u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
2eb88d95 341 }
0e853ca4
PH
342 },
343 {
344 u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
345 u"file": u"1ltcDfZMA3U.flv",
346 u"note": u"Test VEVO video (#897)",
347 u"info_dict": {
348 u"upload_date": u"20070518",
349 u"title": u"Maps - It Will Find You",
350 u"description": u"Music video by Maps performing It Will Find You.",
351 u"uploader": u"MuteUSA",
352 u"uploader_id": u"MuteUSA"
2eb88d95 353 }
0e853ca4
PH
354 },
355 {
356 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
357 u"file": u"UxxajLWwzqY.mp4",
358 u"note": u"Test generic use_cipher_signature video (#897)",
359 u"info_dict": {
360 u"upload_date": u"20120506",
361 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
c7bf7366 362 u"description": u"md5:3e2666e0a55044490499ea45fe9037b7",
45ed795c 363 u"uploader": u"Icona Pop",
0e853ca4 364 u"uploader_id": u"IconaPop"
2eb88d95 365 }
c108eb73
JMF
366 },
367 {
368 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
369 u"file": u"07FYdnEawAQ.mp4",
370 u"note": u"Test VEVO video with age protection (#956)",
371 u"info_dict": {
372 u"upload_date": u"20130703",
373 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
374 u"description": u"md5:64249768eec3bc4276236606ea996373",
375 u"uploader": u"justintimberlakeVEVO",
376 u"uploader_id": u"justintimberlakeVEVO"
377 }
378 },
1d043b93
JMF
379 {
380 u'url': u'https://www.youtube.com/watch?v=TGi3HqYrWHE',
381 u'file': u'TGi3HqYrWHE.mp4',
382 u'note': u'm3u8 video',
383 u'info_dict': {
384 u'title': u'Triathlon - Men - London 2012 Olympic Games',
385 u'description': u'- Men - TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games',
386 u'uploader': u'olympic',
387 u'upload_date': u'20120807',
388 u'uploader_id': u'olympic',
389 },
390 u'params': {
391 u'skip_download': True,
392 },
393 },
2eb88d95
PH
394 ]
395
c5e8d7af
PH
396
@classmethod
def suitable(cls, url):
    """Tell whether this extractor should handle the given URL.

    URLs accepted by YoutubePlaylistIE are always rejected here; any other
    URL is accepted when it matches _VALID_URL (a verbose regex).
    """
    if YoutubePlaylistIE.suitable(url):
        return False
    matched = re.match(cls._VALID_URL, url, re.VERBOSE)
    return matched is not None
402
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Initialise the parent extractor and start with an empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Signature functions already extracted by this instance
    # (filled in by _decrypt_signature)
    self._player_cache = dict()
e0df6211 406
c5e8d7af
PH
def report_video_webpage_download(self, video_id):
    """Announce on screen that the video webpage is being downloaded."""
    message = u'%s: Downloading video webpage' % video_id
    self.to_screen(message)
410
def report_video_info_webpage_download(self, video_id):
    """Announce on screen that the video info webpage is being downloaded."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
414
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Announce on screen that video information is being extracted."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
418
def report_unavailable_format(self, video_id, format):
    """Announce on screen that the requested format is not available."""
    message = u'%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
422
def report_rtmp_download(self):
    """Indicate the download will use the RTMP protocol."""
    message = u'RTMP download detected'
    self.to_screen(message)
426
c4417ddb
PH
427 def _extract_signature_function(self, video_id, player_url, slen):
428 id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
83799698 429 player_url)
e0df6211
PH
430 player_type = id_m.group('ext')
431 player_id = id_m.group('id')
432
c4417ddb
PH
433 # Read from filesystem cache
434 func_id = '%s_%s_%d' % (player_type, player_id, slen)
435 assert os.path.basename(func_id) == func_id
436 cache_dir = self.downloader.params.get('cachedir',
437 u'~/.youtube-dl/cache')
438
439 if cache_dir is not False:
440 cache_fn = os.path.join(os.path.expanduser(cache_dir),
441 u'youtube-sigfuncs',
442 func_id + '.json')
443 try:
444 with io.open(cache_fn, '', encoding='utf-8') as cachef:
445 cache_spec = json.load(cachef)
446 return lambda s: u''.join(s[i] for i in cache_spec)
447 except OSError:
448 pass # No cache available
83799698 449
e0df6211
PH
450 if player_type == 'js':
451 code = self._download_webpage(
452 player_url, video_id,
83799698 453 note=u'Downloading %s player %s' % (player_type, player_id),
e0df6211 454 errnote=u'Download of %s failed' % player_url)
83799698 455 res = self._parse_sig_js(code)
c4417ddb 456 elif player_type == 'swf':
e0df6211
PH
457 urlh = self._request_webpage(
458 player_url, video_id,
83799698 459 note=u'Downloading %s player %s' % (player_type, player_id),
e0df6211
PH
460 errnote=u'Download of %s failed' % player_url)
461 code = urlh.read()
83799698 462 res = self._parse_sig_swf(code)
e0df6211
PH
463 else:
464 assert False, 'Invalid player type %r' % player_type
465
c4417ddb
PH
466 if cache_dir is not False:
467 cache_res = res(map(compat_chr, range(slen)))
468 cache_spec = [ord(c) for c in cache_res]
469 shutil.makedirs(os.path.dirname(cache_fn))
470 write_json_file(cache_spec, cache_fn)
83799698
PH
471
472 return res
473
e0df6211
PH
474 def _parse_sig_js(self, jscode):
475 funcname = self._search_regex(
476 r'signature=([a-zA-Z]+)', jscode,
477 u'Initial JS player signature function name')
478
479 functions = {}
480
481 def argidx(varname):
482 return string.lowercase.index(varname)
483
484 def interpret_statement(stmt, local_vars, allow_recursion=20):
485 if allow_recursion < 0:
486 raise ExctractorError(u'Recursion limit reached')
487
488 if stmt.startswith(u'var '):
489 stmt = stmt[len(u'var '):]
490 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
491 r'=(?P<expr>.*)$', stmt)
492 if ass_m:
493 if ass_m.groupdict().get('index'):
494 def assign(val):
495 lvar = local_vars[ass_m.group('out')]
496 idx = interpret_expression(ass_m.group('index'),
497 local_vars, allow_recursion)
498 assert isinstance(idx, int)
499 lvar[idx] = val
500 return val
501 expr = ass_m.group('expr')
502 else:
503 def assign(val):
504 local_vars[ass_m.group('out')] = val
505 return val
506 expr = ass_m.group('expr')
507 elif stmt.startswith(u'return '):
508 assign = lambda v: v
509 expr = stmt[len(u'return '):]
510 else:
511 raise ExtractorError(
512 u'Cannot determine left side of statement in %r' % stmt)
513
514 v = interpret_expression(expr, local_vars, allow_recursion)
515 return assign(v)
516
517 def interpret_expression(expr, local_vars, allow_recursion):
518 if expr.isdigit():
519 return int(expr)
520
521 if expr.isalpha():
522 return local_vars[expr]
523
524 m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
525 if m:
526 member = m.group('member')
527 val = local_vars[m.group('in')]
528 if member == 'split("")':
529 return list(val)
530 if member == 'join("")':
531 return u''.join(val)
532 if member == 'length':
533 return len(val)
534 if member == 'reverse()':
535 return val[::-1]
536 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
537 if slice_m:
538 idx = interpret_expression(
539 slice_m.group('idx'), local_vars, allow_recursion-1)
540 return val[idx:]
541
542 m = re.match(
543 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
544 if m:
545 val = local_vars[m.group('in')]
546 idx = interpret_expression(m.group('idx'), local_vars,
547 allow_recursion-1)
548 return val[idx]
549
550 m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
551 if m:
552 a = interpret_expression(m.group('a'),
553 local_vars, allow_recursion)
554 b = interpret_expression(m.group('b'),
555 local_vars, allow_recursion)
556 return a % b
557
558 m = re.match(
559 r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
560 if m:
561 fname = m.group('func')
562 if fname not in functions:
563 functions[fname] = extract_function(fname)
564 argvals = [int(v) if v.isdigit() else local_vars[v]
565 for v in m.group('args').split(',')]
566 return functions[fname](argvals)
567 raise ExtractorError(u'Unsupported JS expression %r' % expr)
568
569 def extract_function(funcname):
570 func_m = re.search(
571 r'function ' + re.escape(funcname) +
572 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
573 jscode)
574 argnames = func_m.group('args').split(',')
575
576 def resf(args):
577 local_vars = dict(zip(argnames, args))
578 for stmt in func_m.group('code').split(';'):
579 res = interpret_statement(stmt, local_vars)
580 return res
581 return resf
582
583 initial_function = extract_function(funcname)
584 return lambda s: initial_function([s])
585
586 def _parse_sig_swf(self, file_contents):
587 if file_contents[1:3] != b'WS':
588 raise ExtractorError(
589 u'Not an SWF file; header is %r' % file_contents[:3])
590 if file_contents[:1] == b'C':
591 content = zlib.decompress(file_contents[8:])
592 else:
593 raise NotImplementedError(u'Unsupported compression format %r' %
594 file_contents[:1])
595
596 def extract_tags(content):
597 pos = 0
598 while pos < len(content):
599 header16 = struct.unpack('<H', content[pos:pos+2])[0]
600 pos += 2
601 tag_code = header16 >> 6
602 tag_len = header16 & 0x3f
603 if tag_len == 0x3f:
604 tag_len = struct.unpack('<I', content[pos:pos+4])[0]
605 pos += 4
606 assert pos+tag_len <= len(content)
607 yield (tag_code, content[pos:pos+tag_len])
608 pos += tag_len
609
610 code_tag = next(tag
611 for tag_code, tag in extract_tags(content)
612 if tag_code == 82)
613 p = code_tag.index(b'\0', 4) + 1
ba552f54 614 code_reader = io.BytesIO(code_tag[p:])
e0df6211
PH
615
616 # Parse ABC (AVM2 ByteCode)
ba552f54
PH
617 def read_int(reader=None):
618 if reader is None:
619 reader = code_reader
e0df6211
PH
620 res = 0
621 shift = 0
622 for _ in range(5):
ba552f54
PH
623 buf = reader.read(1)
624 assert len(buf) == 1
625 b = struct.unpack('<B', buf)[0]
e0df6211
PH
626 res = res | ((b & 0x7f) << shift)
627 if b & 0x80 == 0:
628 break
629 shift += 7
ba552f54
PH
630 return res
631
632 def u30(reader=None):
633 res = read_int(reader)
634 assert res & 0xf0000000 == 0
e0df6211
PH
635 return res
636 u32 = read_int
637
ba552f54
PH
638 def s32(reader=None):
639 v = read_int(reader)
e0df6211
PH
640 if v & 0x80000000 != 0:
641 v = - ((v ^ 0xffffffff) + 1)
ba552f54
PH
642 return v
643
644 def string(reader=None):
645 if reader is None:
646 reader = code_reader
647 slen = u30(reader)
648 resb = reader.read(slen)
649 assert len(resb) == slen
650 return resb.decode('utf-8')
651
652 def read_bytes(count, reader=None):
653 if reader is None:
654 reader = code_reader
655 resb = reader.read(count)
656 assert len(resb) == count
657 return resb
658
659 def read_byte(reader=None):
660 resb = read_bytes(1, reader=reader)
661 res = struct.unpack('<B', resb)[0]
662 return res
e0df6211
PH
663
664 # minor_version + major_version
2f2ffea9 665 _ = read_bytes(2 + 2)
e0df6211
PH
666
667 # Constant pool
ba552f54 668 int_count = u30()
e0df6211 669 for _c in range(1, int_count):
ba552f54
PH
670 _ = s32()
671 uint_count = u30()
e0df6211 672 for _c in range(1, uint_count):
ba552f54
PH
673 _ = u32()
674 double_count = u30()
675 _ = read_bytes((double_count-1) * 8)
676 string_count = u30()
e0df6211
PH
677 constant_strings = [u'']
678 for _c in range(1, string_count):
ba552f54 679 s = string()
e0df6211 680 constant_strings.append(s)
ba552f54 681 namespace_count = u30()
e0df6211 682 for _c in range(1, namespace_count):
ba552f54
PH
683 _ = read_bytes(1) # kind
684 _ = u30() # name
685 ns_set_count = u30()
e0df6211 686 for _c in range(1, ns_set_count):
ba552f54 687 count = u30()
e0df6211 688 for _c2 in range(count):
ba552f54
PH
689 _ = u30()
690 multiname_count = u30()
e0df6211
PH
691 MULTINAME_SIZES = {
692 0x07: 2, # QName
693 0x0d: 2, # QNameA
694 0x0f: 1, # RTQName
695 0x10: 1, # RTQNameA
696 0x11: 0, # RTQNameL
697 0x12: 0, # RTQNameLA
698 0x09: 2, # Multiname
699 0x0e: 2, # MultinameA
700 0x1b: 1, # MultinameL
701 0x1c: 1, # MultinameLA
702 }
703 multinames = [u'']
704 for _c in range(1, multiname_count):
ba552f54 705 kind = u30()
e0df6211
PH
706 assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
707 if kind == 0x07:
ba552f54
PH
708 namespace_idx = u30()
709 name_idx = u30()
e0df6211
PH
710 multinames.append(constant_strings[name_idx])
711 else:
712 multinames.append('[MULTINAME kind: %d]' % kind)
713 for _c2 in range(MULTINAME_SIZES[kind]):
ba552f54 714 _ = u30()
e0df6211
PH
715
716 # Methods
ba552f54 717 method_count = u30()
e0df6211
PH
718 MethodInfo = collections.namedtuple(
719 'MethodInfo',
720 ['NEED_ARGUMENTS', 'NEED_REST'])
721 method_infos = []
722 for method_id in range(method_count):
ba552f54
PH
723 param_count = u30()
724 _ = u30() # return type
e0df6211 725 for _ in range(param_count):
ba552f54
PH
726 _ = u30() # param type
727 _ = u30() # name index (always 0 for youtube)
728 flags = read_byte()
e0df6211
PH
729 if flags & 0x08 != 0:
730 # Options present
ba552f54 731 option_count = u30()
e0df6211 732 for c in range(option_count):
ba552f54
PH
733 _ = u30() # val
734 _ = read_bytes(1) # kind
e0df6211
PH
735 if flags & 0x80 != 0:
736 # Param names present
737 for _ in range(param_count):
ba552f54 738 _ = u30() # param name
e0df6211
PH
739 mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
740 method_infos.append(mi)
741
742 # Metadata
ba552f54 743 metadata_count = u30()
e0df6211 744 for _c in range(metadata_count):
ba552f54
PH
745 _ = u30() # name
746 item_count = u30()
e0df6211 747 for _c2 in range(item_count):
ba552f54
PH
748 _ = u30() # key
749 _ = u30() # value
750
751 def parse_traits_info():
752 trait_name_idx = u30()
753 kind_full = read_byte()
e0df6211
PH
754 kind = kind_full & 0x0f
755 attrs = kind_full >> 4
756 methods = {}
757 if kind in [0x00, 0x06]: # Slot or Const
ba552f54
PH
758 _ = u30() # Slot id
759 type_name_idx = u30()
760 vindex = u30()
e0df6211 761 if vindex != 0:
ba552f54 762 _ = read_byte() # vkind
e0df6211 763 elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
ba552f54
PH
764 _ = u30() # disp_id
765 method_idx = u30()
e0df6211
PH
766 methods[multinames[trait_name_idx]] = method_idx
767 elif kind == 0x04: # Class
ba552f54
PH
768 _ = u30() # slot_id
769 _ = u30() # classi
e0df6211 770 elif kind == 0x05: # Function
ba552f54
PH
771 _ = u30() # slot_id
772 function_idx = u30()
e0df6211
PH
773 methods[function_idx] = multinames[trait_name_idx]
774 else:
775 raise ExtractorError(u'Unsupported trait kind %d' % kind)
776
777 if attrs & 0x4 != 0: # Metadata present
ba552f54 778 metadata_count = u30()
e0df6211 779 for _c3 in range(metadata_count):
ba552f54 780 _ = u30()
e0df6211 781
ba552f54 782 return methods
e0df6211
PH
783
784 # Classes
785 TARGET_CLASSNAME = u'SignatureDecipher'
786 searched_idx = multinames.index(TARGET_CLASSNAME)
787 searched_class_id = None
ba552f54 788 class_count = u30()
e0df6211 789 for class_id in range(class_count):
ba552f54 790 name_idx = u30()
e0df6211
PH
791 if name_idx == searched_idx:
792 # We found the class we're looking for!
793 searched_class_id = class_id
ba552f54
PH
794 _ = u30() # super_name idx
795 flags = read_byte()
e0df6211 796 if flags & 0x08 != 0: # Protected namespace is present
ba552f54
PH
797 protected_ns_idx = u30()
798 intrf_count = u30()
e0df6211 799 for _c2 in range(intrf_count):
ba552f54
PH
800 _ = u30()
801 _ = u30() # iinit
802 trait_count = u30()
e0df6211 803 for _c2 in range(trait_count):
ba552f54 804 _ = parse_traits_info()
e0df6211
PH
805
806 if searched_class_id is None:
807 raise ExtractorError(u'Target class %r not found' %
808 TARGET_CLASSNAME)
809
810 method_names = {}
811 method_idxs = {}
812 for class_id in range(class_count):
ba552f54
PH
813 _ = u30() # cinit
814 trait_count = u30()
e0df6211 815 for _c2 in range(trait_count):
ba552f54 816 trait_methods = parse_traits_info()
e0df6211
PH
817 if class_id == searched_class_id:
818 method_names.update(trait_methods.items())
819 method_idxs.update(dict(
820 (idx, name)
821 for name, idx in trait_methods.items()))
822
823 # Scripts
ba552f54 824 script_count = u30()
e0df6211 825 for _c in range(script_count):
ba552f54
PH
826 _ = u30() # init
827 trait_count = u30()
e0df6211 828 for _c2 in range(trait_count):
ba552f54 829 _ = parse_traits_info()
e0df6211
PH
830
831 # Method bodies
ba552f54 832 method_body_count = u30()
e0df6211
PH
833 Method = collections.namedtuple('Method', ['code', 'local_count'])
834 methods = {}
835 for _c in range(method_body_count):
ba552f54
PH
836 method_idx = u30()
837 max_stack = u30()
838 local_count = u30()
839 init_scope_depth = u30()
840 max_scope_depth = u30()
841 code_length = u30()
842 code = read_bytes(code_length)
e0df6211 843 if method_idx in method_idxs:
ba552f54 844 m = Method(code, local_count)
e0df6211 845 methods[method_idxs[method_idx]] = m
ba552f54 846 exception_count = u30()
e0df6211 847 for _c2 in range(exception_count):
ba552f54
PH
848 _ = u30() # from
849 _ = u30() # to
850 _ = u30() # target
851 _ = u30() # exc_type
852 _ = u30() # var_name
853 trait_count = u30()
e0df6211 854 for _c2 in range(trait_count):
ba552f54 855 _ = parse_traits_info()
e0df6211 856
ba552f54 857 assert p + code_reader.tell() == len(code_tag)
e0df6211
PH
858 assert len(methods) == len(method_idxs)
859
860 method_pyfunctions = {}
861
862 def extract_function(func_name):
863 if func_name in method_pyfunctions:
864 return method_pyfunctions[func_name]
865 if func_name not in methods:
866 raise ExtractorError(u'Cannot find function %r' % func_name)
867 m = methods[func_name]
868
869 def resfunc(args):
e0df6211
PH
870 registers = ['(this)'] + list(args) + [None] * m.local_count
871 stack = []
872 coder = io.BytesIO(m.code)
873 while True:
874 opcode = struct.unpack('!B', coder.read(1))[0]
a7177865 875 if opcode == 36: # pushbyte
e0df6211
PH
876 v = struct.unpack('!B', coder.read(1))[0]
877 stack.append(v)
878 elif opcode == 44: # pushstring
879 idx = u30(coder)
880 stack.append(constant_strings[idx])
881 elif opcode == 48: # pushscope
882 # We don't implement the scope register, so we'll just
883 # ignore the popped value
884 stack.pop()
885 elif opcode == 70: # callproperty
886 index = u30(coder)
887 mname = multinames[index]
888 arg_count = u30(coder)
889 args = list(reversed(
890 [stack.pop() for _ in range(arg_count)]))
891 obj = stack.pop()
892 if mname == u'split':
893 assert len(args) == 1
894 assert isinstance(args[0], compat_str)
895 assert isinstance(obj, compat_str)
896 if args[0] == u'':
897 res = list(obj)
898 else:
899 res = obj.split(args[0])
900 stack.append(res)
a7177865
PH
901 elif mname == u'slice':
902 assert len(args) == 1
903 assert isinstance(args[0], int)
904 assert isinstance(obj, list)
905 res = obj[args[0]:]
906 stack.append(res)
907 elif mname == u'join':
908 assert len(args) == 1
909 assert isinstance(args[0], compat_str)
910 assert isinstance(obj, list)
911 res = args[0].join(obj)
912 stack.append(res)
e0df6211
PH
913 elif mname in method_pyfunctions:
914 stack.append(method_pyfunctions[mname](args))
915 else:
916 raise NotImplementedError(
917 u'Unsupported property %r on %r'
918 % (mname, obj))
a7177865
PH
919 elif opcode == 72: # returnvalue
920 res = stack.pop()
921 return res
922 elif opcode == 79: # callpropvoid
923 index = u30(coder)
924 mname = multinames[index]
925 arg_count = u30(coder)
926 args = list(reversed(
927 [stack.pop() for _ in range(arg_count)]))
928 obj = stack.pop()
929 if mname == u'reverse':
930 assert isinstance(obj, list)
931 obj.reverse()
932 else:
933 raise NotImplementedError(
934 u'Unsupported (void) property %r on %r'
935 % (mname, obj))
e0df6211
PH
936 elif opcode == 93: # findpropstrict
937 index = u30(coder)
938 mname = multinames[index]
939 res = extract_function(mname)
940 stack.append(res)
941 elif opcode == 97: # setproperty
942 index = u30(coder)
943 value = stack.pop()
944 idx = stack.pop()
945 obj = stack.pop()
946 assert isinstance(obj, list)
947 assert isinstance(idx, int)
948 obj[idx] = value
949 elif opcode == 98: # getlocal
950 index = u30(coder)
951 stack.append(registers[index])
952 elif opcode == 99: # setlocal
953 index = u30(coder)
954 value = stack.pop()
955 registers[index] = value
956 elif opcode == 102: # getproperty
957 index = u30(coder)
958 pname = multinames[index]
959 if pname == u'length':
960 obj = stack.pop()
961 assert isinstance(obj, list)
962 stack.append(len(obj))
963 else: # Assume attribute access
964 idx = stack.pop()
965 assert isinstance(idx, int)
966 obj = stack.pop()
967 assert isinstance(obj, list)
968 stack.append(obj[idx])
969 elif opcode == 128: # coerce
970 _ = u30(coder)
971 elif opcode == 133: # coerce_s
972 assert isinstance(stack[-1], (type(None), compat_str))
973 elif opcode == 164: # modulo
974 value2 = stack.pop()
975 value1 = stack.pop()
976 res = value1 % value2
977 stack.append(res)
a7177865
PH
978 elif opcode == 208: # getlocal_0
979 stack.append(registers[0])
980 elif opcode == 209: # getlocal_1
981 stack.append(registers[1])
982 elif opcode == 210: # getlocal_2
983 stack.append(registers[2])
984 elif opcode == 211: # getlocal_3
985 stack.append(registers[3])
e0df6211
PH
986 elif opcode == 214: # setlocal_2
987 registers[2] = stack.pop()
988 elif opcode == 215: # setlocal_3
989 registers[3] = stack.pop()
990 else:
991 raise NotImplementedError(
992 u'Unsupported opcode %d' % opcode)
993
994 method_pyfunctions[func_name] = resfunc
995 return resfunc
996
997 initial_function = extract_function(u'decipher')
998 return lambda s: initial_function([s])
999
83799698 1000 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 1001 """Turn the encrypted s field into a working signature"""
6b37f0be 1002
83799698 1003 if player_url is not None:
e0df6211 1004 try:
83799698
PH
1005 if player_url not in self._player_cache:
1006 func = self._extract_signature_function(
c4417ddb 1007 video_id, player_url, len(s)
e0df6211 1008 )
83799698
PH
1009 self._player_cache[player_url] = func
1010 return self._player_cache[player_url](s)
e0df6211
PH
1011 except Exception as e:
1012 tb = traceback.format_exc()
83799698
PH
1013 self._downloader.report_warning(
1014 u'Automatic signature extraction failed: ' + tb)
e0df6211 1015
83799698
PH
1016 self._downloader.report_warning(
1017 u'Warning: Falling back to static signature algorithm')
2f2ffea9
PH
1018 return self._static_decrypt_signature(
1019 s, video_id, player_url, age_gate)
e0df6211 1020
2f2ffea9 1021 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
e0df6211
PH
1022 if age_gate:
1023 # The videos with age protection use another player, so the
1024 # algorithms can be different.
1025 if len(s) == 86:
1026 return s[2:63] + s[82] + s[64:82] + s[63]
1027
1028 if len(s) == 92:
444b1165
JMF
1029 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
1030 elif len(s) == 90:
1031 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
8a9d86a2 1032 elif len(s) == 89:
1033 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
444b1165 1034 elif len(s) == 88:
3e223834 1035 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
be547e1d 1036 elif len(s) == 87:
3a725669 1037 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
be547e1d 1038 elif len(s) == 86:
1cf911bc 1039 return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53]
be547e1d 1040 elif len(s) == 85:
6ae8ee3f 1041 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
be547e1d 1042 elif len(s) == 84:
23b00bc0 1043 return s[81:36:-1] + s[0] + s[35:2:-1]
be547e1d 1044 elif len(s) == 83:
e1842025 1045 return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]
be547e1d 1046 elif len(s) == 82:
ce85f022 1047 return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54]
be547e1d 1048 elif len(s) == 81:
aedd6bb9 1049 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
066090dd
JMF
1050 elif len(s) == 80:
1051 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
5c468ca8
JMF
1052 elif len(s) == 79:
1053 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
be547e1d
PH
1054
1055 else:
1056 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
c5e8d7af 1057
75952c6e
JMF
1058 def _decrypt_signature_age_gate(self, s):
1059 # The videos with age protection use another player, so the algorithms
1060 # can be different.
1061 if len(s) == 86:
1062 return s[2:63] + s[82] + s[64:82] + s[63]
1063 else:
1064 # Fallback to the other algortihms
b072a9de 1065 return self._decrypt_signature(s)
c5e8d7af 1066
de7f3446 1067 def _get_available_subtitles(self, video_id):
de7f3446 1068 try:
7fad1c63
JMF
1069 sub_list = self._download_webpage(
1070 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1071 video_id, note=False)
1072 except ExtractorError as err:
de7f3446
JMF
1073 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
1074 return {}
1075 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
1076
1077 sub_lang_list = {}
1078 for l in lang_list:
1079 lang = l[1]
1080 params = compat_urllib_parse.urlencode({
1081 'lang': lang,
1082 'v': video_id,
1083 'fmt': self._downloader.params.get('subtitlesformat'),
1084 })
1085 url = u'http://www.youtube.com/api/timedtext?' + params
1086 sub_lang_list[lang] = url
1087 if not sub_lang_list:
1088 self._downloader.report_warning(u'video doesn\'t have subtitles')
1089 return {}
1090 return sub_lang_list
1091
055e6f36 1092 def _get_available_automatic_caption(self, video_id, webpage):
de7f3446
JMF
1093 """We need the webpage for getting the captions url, pass it as an
1094 argument to speed up the process."""
de7f3446
JMF
1095 sub_format = self._downloader.params.get('subtitlesformat')
1096 self.to_screen(u'%s: Looking for automatic captions' % video_id)
1097 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
055e6f36 1098 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
de7f3446
JMF
1099 if mobj is None:
1100 self._downloader.report_warning(err_msg)
1101 return {}
1102 player_config = json.loads(mobj.group(1))
1103 try:
1104 args = player_config[u'args']
1105 caption_url = args[u'ttsurl']
1106 timestamp = args[u'timestamp']
055e6f36
JMF
1107 # We get the available subtitles
1108 list_params = compat_urllib_parse.urlencode({
1109 'type': 'list',
1110 'tlangs': 1,
1111 'asrs': 1,
de7f3446 1112 })
055e6f36
JMF
1113 list_url = caption_url + '&' + list_params
1114 list_page = self._download_webpage(list_url, video_id)
1115 caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
e3dc22ca
JMF
1116 original_lang_node = caption_list.find('track')
1117 if original_lang_node.attrib.get('kind') != 'asr' :
1118 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
1119 return {}
1120 original_lang = original_lang_node.attrib['lang_code']
055e6f36
JMF
1121
1122 sub_lang_list = {}
1123 for lang_node in caption_list.findall('target'):
1124 sub_lang = lang_node.attrib['lang_code']
1125 params = compat_urllib_parse.urlencode({
1126 'lang': original_lang,
1127 'tlang': sub_lang,
1128 'fmt': sub_format,
1129 'ts': timestamp,
1130 'kind': 'asr',
1131 })
1132 sub_lang_list[sub_lang] = caption_url + '&' + params
1133 return sub_lang_list
de7f3446
JMF
1134 # An extractor error can be raise by the download process if there are
1135 # no automatic captions but there are subtitles
1136 except (KeyError, ExtractorError):
1137 self._downloader.report_warning(err_msg)
1138 return {}
1139
c5e8d7af
PH
1140 def _print_formats(self, formats):
1141 print('Available formats:')
1142 for x in formats:
03cc7c20
JMF
1143 print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
1144 self._video_dimensions.get(x, '???'),
836a086c 1145 ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
c5e8d7af
PH
1146
1147 def _extract_id(self, url):
1148 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1149 if mobj is None:
1150 raise ExtractorError(u'Invalid URL: %s' % url)
1151 video_id = mobj.group(2)
1152 return video_id
1153
1d043b93
JMF
1154 def _get_video_url_list(self, url_map):
1155 """
1156 Transform a dictionary in the format {itag:url} to a list of (itag, url)
1157 with the requested formats.
1158 """
1159 req_format = self._downloader.params.get('format', None)
1160 format_limit = self._downloader.params.get('format_limit', None)
1161 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1162 if format_limit is not None and format_limit in available_formats:
1163 format_list = available_formats[available_formats.index(format_limit):]
1164 else:
1165 format_list = available_formats
1166 existing_formats = [x for x in format_list if x in url_map]
1167 if len(existing_formats) == 0:
1168 raise ExtractorError(u'no known formats available for video')
1169 if self._downloader.params.get('listformats', None):
1170 self._print_formats(existing_formats)
1171 return
1172 if req_format is None or req_format == 'best':
1173 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1174 elif req_format == 'worst':
1175 video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
1176 elif req_format in ('-1', 'all'):
1177 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1178 else:
1179 # Specific formats. We pick the first in a slash-delimeted sequence.
bdc6b3fc
AZ
1180 # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
1181 # available in the specified format. For example,
1182 # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1183 # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
1184 # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
1d043b93
JMF
1185 req_formats = req_format.split('/')
1186 video_url_list = None
1187 for rf in req_formats:
1188 if rf in url_map:
1189 video_url_list = [(rf, url_map[rf])]
1190 break
bdc6b3fc
AZ
1191 if rf in self._video_formats_map:
1192 for srf in self._video_formats_map[rf]:
1193 if srf in url_map:
1194 video_url_list = [(srf, url_map[srf])]
1195 break
1196 else:
1197 continue
1198 break
1d043b93
JMF
1199 if video_url_list is None:
1200 raise ExtractorError(u'requested format not available')
1201 return video_url_list
1202
1203 def _extract_from_m3u8(self, manifest_url, video_id):
1204 url_map = {}
1205 def _get_urls(_manifest):
1206 lines = _manifest.split('\n')
1207 urls = filter(lambda l: l and not l.startswith('#'),
1208 lines)
1209 return urls
1210 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1211 formats_urls = _get_urls(manifest)
1212 for format_url in formats_urls:
890f62e8 1213 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1d043b93
JMF
1214 url_map[itag] = format_url
1215 return url_map
1216
def _real_extract(self, url):
    """Extract the metadata and the download URLs of a single video.

    Returns a list with one info dict per selected format, or None when
    only a listing was requested ('listsubtitles' / 'listformats').
    Raises ExtractorError when the webpage or the video info cannot be
    obtained, or when no usable format information is found.
    """
    if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
        self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self._extract_id(url)

    # Get video webpage
    self.report_video_webpage_download(video_id)
    url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
    request = compat_urllib_request.Request(url)
    try:
        video_webpage_bytes = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))

    video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        # Drop the backslash escapes of the embedded JSON string
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # Get video info
    self.report_video_info_webpage_download(video_id)
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        self.report_age_confirmation()
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        data = compat_urllib_parse.urlencode({'video_id': video_id,
                                              'el': 'embedded',
                                              'gl': 'US',
                                              'hl': 'en',
                                              'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                                              'asv': 3,
                                              'sts':'1588',
                                              })
        video_info_url = 'https://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(video_info_url, video_id,
                                note=False,
                                errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
    else:
        age_gate = False
        # Try the different 'el' variants until one response carries a token
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
    if 'token' not in video_info:
        if 'reason' in video_info:
            raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
        else:
            raise ExtractorError(u'"token" parameter not in video info for unknown reason')

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError(u'"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError(u'Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning(u'unable to extract uploader nickname')

    # title
    if 'title' not in video_info:
        raise ExtractorError(u'Unable to extract video title')
    video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning(u'unable to extract video thumbnail')
        video_thumbnail = ''
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = None
    mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
    if mobj is not None:
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        video_description = clean_html(video_description)
    else:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = u''

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)

    if self._downloader.params.get('listsubtitles', False):
        self._list_available_subtitles(video_id, video_webpage)
        return

    if 'length_seconds' not in video_info:
        self._downloader.report_warning(u'unable to extract video duration')
        video_duration = ''
    else:
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

    # Decide which formats to download

    # Prefer the stream maps embedded in the webpage's player config when
    # they carry encrypted signatures.  Everything that can go wrong here
    # is signalled with ValueError and silently skipped, leaving the
    # get_video_info data untouched.
    try:
        mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
        if not mobj:
            raise ValueError('Could not find vevo ID')
        info = json.loads(mobj.group(1))
        args = info.get('args')
        # A config without these keys must be skipped like a missing
        # config; indexing directly would raise an uncaught KeyError.
        if not args or 'url_encoded_fmt_stream_map' not in args:
            raise ValueError('No stream_map present')
        # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
        # this signatures are encrypted
        m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
        if m_s is not None:
            self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
            video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
        m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
        if m_s is not None:
            if 'url_encoded_fmt_stream_map' in video_info:
                video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
            else:
                video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
        elif 'adaptive_fmts' in video_info:
            if 'url_encoded_fmt_stream_map' in video_info:
                video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
            else:
                video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']
    except ValueError:
        pass

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        video_url_list = [(None, video_info['conn'][0])]
    elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
        if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' in url_data and 'url' in url_data:
                url = url_data['url'][0]
                if 'sig' in url_data:
                    url += '&signature=' + url_data['sig'][0]
                elif 's' in url_data:
                    encrypted_sig = url_data['s'][0]
                    if self._downloader.params.get('verbose'):
                        if age_gate:
                            if player_url:
                                player_version = self._search_regex(
                                    r'-(.+)\.swf$', player_url,
                                    'flash player', fatal=False)
                            else:
                                # No SWF player URL was found; searching
                                # None would raise a TypeError.
                                player_version = None
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-(.+?)\.js', video_webpage,
                                'html5 player', fatal=False)
                            player_desc = u'html5 player %s' % player_version

                        parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                        self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                            (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                    if not age_gate:
                        # Without the age gate the JS player named in the
                        # webpage is used instead of the SWF one
                        jsplayer_url_json = self._search_regex(
                            r'"assets":.+?"js":\s*("[^"]+")',
                            video_webpage, u'JS player URL')
                        player_url = json.loads(jsplayer_url_json)

                    signature = self._decrypt_signature(
                        encrypted_sig, video_id, player_url, age_gate)
                    url += '&signature=' + signature
                if 'ratebypass' not in url:
                    url += '&ratebypass=yes'
                url_map[url_data['itag'][0]] = url
        video_url_list = self._get_video_url_list(url_map)
        if not video_url_list:
            return
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        video_url_list = self._get_video_url_list(url_map)
        if not video_url_list:
            return

    else:
        raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

    results = []
    for format_param, video_real_url in video_url_list:
        # Extension
        video_extension = self._video_extensions.get(format_param, 'flv')

        video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
                                          self._video_dimensions.get(format_param, '???'),
                                          ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')

        results.append({
            'id':       video_id,
            'url':      video_real_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'format':   video_format,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
            'player_url':   player_url,
            'subtitles':    video_subtitles,
            'duration':     video_duration
        })
    return results
1460
class YoutubePlaylistIE(InfoExtractor):
    """Extractor for playlists, backed by the paged gdata playlist feed."""
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose syntax, hence the explicit flag
        matched = re.match(cls._VALID_URL, url, re.VERBOSE)
        return matched is not None

    def _real_extract(self, url):
        """Return a one-element list holding the playlist result for url."""
        # Extract playlist id (either of the two alternatives may match)
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Download playlist videos from API, one page at a time
        indexed_urls = []
        page_num = 1
        while True:
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            if start_index >= 1000:
                self._downloader.report_warning(u'Max number of results reached')
                break
            api_url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(api_url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            feed = response['feed']
            playlist_title = feed['title']['$t']
            if 'entry' not in feed:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            for entry in feed['entry']:
                position = entry['yt$position']['$t']
                # Entries without a video id are skipped
                if 'media$group' not in entry or 'yt$videoid' not in entry['media$group']:
                    continue
                watch_url = 'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']
                indexed_urls.append((position, watch_url))

            page_num += 1

        # Order the videos by their position in the playlist
        ordered_urls = [pair[1] for pair in sorted(indexed_urls)]

        url_results = [self.url_result(video_url, 'Youtube') for video_url in ordered_urls]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1528
1529
class YoutubeChannelIE(InfoExtractor):
    """Extractor that collects every video linked from a channel's pages."""
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids linked in page, first occurrence order."""
        found = []
        for link in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            candidate = link.group(1)
            if candidate in found:
                continue
            found.append(candidate)
        return found

    def _real_extract(self, url):
        """Return a one-element list holding the channel playlist result."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        channel_id = mobj.group(1)

        # Download channel page
        pagenum = 1
        page = self._download_webpage(
            self._TEMPLATE_URL % (channel_id, pagenum), channel_id,
            u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        video_ids = list(self.extract_videos_from_page(page))

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            pagenum = 1
            while True:
                more_url = self._MORE_PAGES_URL % (pagenum, channel_id)
                raw_page = self._download_webpage(more_url, channel_id,
                                                  u'Downloading page #%s' % pagenum)
                ajax_page = json.loads(raw_page)

                video_ids.extend(self.extract_videos_from_page(ajax_page['content_html']))

                # Stop once the response no longer offers a "load more" widget
                if self._MORE_PAGES_INDICATOR not in ajax_page['load_more_widget_html']:
                    break
                pagenum += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [
            self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
            for video_id in video_ids]
        return [self.playlist_result(url_entries, channel_id)]
1584
1585
class YoutubeUserIE(InfoExtractor):
    """Extractor for all uploads of a user, via the paged gdata feed."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        """Return True only if no other *IE class in this module claims url."""
        # Don't return True if the url can be extracted with another youtube
        # extractor; this regex is too permissive and would match as well.
        for name, klass in globals().items():
            if not name.endswith('IE') or klass is cls:
                continue
            if klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a one-element list holding the user's uploads playlist."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        page_size = self._GDATA_PAGE_SIZE
        video_ids = []
        pagenum = 0
        while True:
            start_index = pagenum * page_size + 1

            gdata_url = self._GDATA_URL % (username, page_size, start_index)
            note = u'Downloading video ids from %d to %d' % (start_index, start_index + page_size)
            page = self._download_webpage(gdata_url, username, note)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of the page size
                break

            # Extract video identifiers: the last path component of each entry id
            ids_in_page = [entry['id']['$t'].split('/')[-1]
                           for entry in response['feed']['entry']]
            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < page_size:
                break
            pagenum += 1

        url_results = [
            self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
            for video_id in video_ids]
        return [self.playlist_result(url_results, playlist_title = username)]
b05654f0
PH
1650
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor backed by the gdata videos feed (50 results per page)."""
    IE_DESC = u'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        message = u'[youtube] query "%s": Downloading page %s' % (query, pagenum)
        self._downloader.to_screen(message)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        collected_ids = []
        limit = n
        pagenum = 0

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            first_index = (50*pagenum)+1
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), first_index)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            collected_ids.extend([video['id'] for video in api_response['items']])

            # Never ask for more pages than the API says exist
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may have pushed us past the requested amount
        if len(collected_ids) > n:
            collected_ids = collected_ids[:n]
        videos = []
        for video_id in collected_ids:
            videos.append(self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube'))
        return self.playlist_result(videos, query)
75dff0ee
JMF
1692
1693
class YoutubeShowIE(InfoExtractor):
    """Extractor that expands a show page into its per-season playlists."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        """Return one YoutubePlaylist url result per season link found."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        season_paths = [m.group(1) for m in re.finditer(r'href="(/playlist\?list=.*?)"', webpage)]
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_paths)))
        results = []
        for season_path in season_paths:
            results.append(self.url_result('https://www.youtube.com' + season_path, 'YoutubePlaylist'))
        return results
04cc9617
JMF
1707
1708
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    _PAGING_STEP = 30
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template for this feed; takes the paging offset as '%s'."""
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Page through the feed and return a playlist of all linked videos."""
        feed_entries = []
        # Counted by hand and multiplied, because the step argument of
        # itertools.count is available only in 2.7 or higher
        page_index = 0
        while True:
            paging = page_index*self._PAGING_STEP
            raw_info = self._download_webpage(self._FEED_TEMPLATE % paging,
                                              u'%s feed' % self._FEED_NAME,
                                              u'Downloading page %s' % page_index)
            info = json.loads(raw_info)
            feed_html = info['feed_html']
            # Deduplicate the ids while keeping their order of appearance
            ids = orderedSet(m.group(1) for m in re.finditer(r'"/watch\?v=(.*?)["&]', feed_html))
            for video_id in ids:
                feed_entries.append(self.url_result(video_id, 'Youtube'))
            # A null 'paging' value marks the last page
            if info['paging'] is None:
                break
            page_index += 1
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1750
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the subscriptions feed (URL or ":ytsubs" / ":ytsubscriptions" keyword)."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
1756
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the recommended feed (URL or ":ytrec" / ":ytrecommended" keyword)."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
c626a3d9 1762
43ba5456
JMF
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the watch later list (URL or ":ytwatchlater" keyword)."""
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    # Overrides of the base class defaults: larger paging step, personal feed action
    _PERSONAL_FEED = True
    _PAGING_STEP = 100
c626a3d9
JMF
1770
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """
    Extractor for the favourites page (URL or ":ytfav" / ":ytfavorites" /
    ":ytfavourites" keyword); hands the playlist id found there to the
    playlist extractor.
    """
    _LOGIN_REQUIRED = True
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    IE_NAME = u'youtube:favorites'

    def _real_extract(self, url):
        """Download the favourites page and return a url_result for its playlist id."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The first "list=" parameter on the page carries the playlist id
        list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(list_id, 'YoutubePlaylist')