]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
extractor: youtube: Set extension of AAC audio formats to m4a.
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
e0df6211 3import collections
edf3e38e 4import errno
e0df6211 5import io
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af
PH
9import re
10import socket
e0df6211
PH
11import string
12import struct
13import traceback
055e6f36 14import xml.etree.ElementTree
e0df6211 15import zlib
c5e8d7af 16
b05654f0 17from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 18from .subtitles import SubtitlesInfoExtractor
c5e8d7af 19from ..utils import (
edf3e38e 20 compat_chr,
c5e8d7af
PH
21 compat_http_client,
22 compat_parse_qs,
23 compat_urllib_error,
24 compat_urllib_parse,
25 compat_urllib_request,
7c61bd36 26 compat_urlparse,
c5e8d7af
PH
27 compat_str,
28
29 clean_html,
c38b1e77 30 get_cachedir,
c5e8d7af
PH
31 get_element_by_id,
32 ExtractorError,
33 unescapeHTML,
34 unified_strdate,
04cc9617 35 orderedSet,
edf3e38e 36 write_json_file,
c5e8d7af
PH
37)
38
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def _set_language(self):
        # Hit _LANG_URL so the session's cookies pin the site to English/US;
        # later scraping relies on English page text. Best-effort: a network
        # failure only produces a warning and returns False.
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return False
        return True

    def _login(self):
        """Log in via the Google ServiceLogin form.

        Returns True on apparent success, False when no credentials are
        configured or the login failed.  Raises ExtractorError when
        _LOGIN_REQUIRED is set and no credentials were provided.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return False

        # Scrape the hidden GALX / dsh form tokens out of the login page;
        # Google rejects the POST without them.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'dsh': dsh,
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present in the response, the
            # credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return False
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return False
        return True

    def _confirm_age(self):
        """POST the age-verification confirmation form.

        Raises ExtractorError on any network failure; returns True otherwise.
        """
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
        return True

    def _real_initialize(self):
        # Session setup: language first, then optional login, then age gate.
        # Each step short-circuits the rest on failure.
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
c5e8d7af 145
8377574c 146
de7f3446 147class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
0f818663 148 IE_DESC = u'YouTube.com'
c5e8d7af
PH
149 _VALID_URL = r"""^
150 (
151 (?:https?://)? # http(s):// (optional)
f4b05232 152 (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
e69ae5b9
JMF
153 tube\.majestyc\.net/|
154 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
155 (?:.*?\#/)? # handle anchor (#/) redirect urls
156 (?: # the various things that can precede the ID:
157 (?:(?:v|embed|e)/) # v/ or embed/ or e/
158 |(?: # or the v= param in all its forms
d741e55a 159 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
160 (?:\?|\#!?) # the params delimiter ? or # or #!
161 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
162 v=
163 )
f4b05232
JMF
164 ))
165 |youtu\.be/ # just youtu.be/xxxx
166 )
c5e8d7af 167 )? # all until now is optional -> you can pass the naked ID
8963d9c2 168 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
c5e8d7af
PH
169 (?(1).+)? # if we found the ID, everything can follow
170 $"""
c5e8d7af 171 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
c5e8d7af 172 # Listed in order of quality
bdc6b3fc 173 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
96fb5605 174 # Apple HTTP Live Streaming
bdc6b3fc 175 '96', '95', '94', '93', '92', '132', '151',
939fbd26
JMF
176 # 3D
177 '85', '84', '102', '83', '101', '82', '100',
178 # Dash video
179 '138', '137', '248', '136', '247', '135', '246',
180 '245', '244', '134', '243', '133', '242', '160',
181 # Dash audio
182 '141', '172', '140', '171', '139',
1d043b93 183 ]
bdc6b3fc 184 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
96fb5605 185 # Apple HTTP Live Streaming
bdc6b3fc
AZ
186 '96', '95', '94', '93', '92', '132', '151',
187 # 3D
86fe61c8 188 '85', '102', '84', '101', '83', '100', '82',
939fbd26
JMF
189 # Dash video
190 '138', '248', '137', '247', '136', '246', '245',
191 '244', '135', '243', '134', '242', '133', '160',
192 # Dash audio
193 '172', '141', '171', '140', '139',
1d043b93 194 ]
bdc6b3fc
AZ
195 _video_formats_map = {
196 'flv': ['35', '34', '6', '5'],
197 '3gp': ['36', '17', '13'],
198 'mp4': ['38', '37', '22', '18'],
199 'webm': ['46', '45', '44', '43'],
200 }
c5e8d7af
PH
201 _video_extensions = {
202 '13': '3gp',
bdc6b3fc 203 '17': '3gp',
c5e8d7af
PH
204 '18': 'mp4',
205 '22': 'mp4',
bdc6b3fc 206 '36': '3gp',
c5e8d7af 207 '37': 'mp4',
d69cf69a 208 '38': 'mp4',
c5e8d7af
PH
209 '43': 'webm',
210 '44': 'webm',
211 '45': 'webm',
212 '46': 'webm',
1d043b93 213
86fe61c8
AZ
214 # 3d videos
215 '82': 'mp4',
216 '83': 'mp4',
217 '84': 'mp4',
218 '85': 'mp4',
219 '100': 'webm',
220 '101': 'webm',
221 '102': 'webm',
836a086c 222
96fb5605 223 # Apple HTTP Live Streaming
1d043b93
JMF
224 '92': 'mp4',
225 '93': 'mp4',
226 '94': 'mp4',
227 '95': 'mp4',
228 '96': 'mp4',
229 '132': 'mp4',
230 '151': 'mp4',
836a086c
AZ
231
232 # Dash mp4
233 '133': 'mp4',
234 '134': 'mp4',
235 '135': 'mp4',
236 '136': 'mp4',
237 '137': 'mp4',
238 '138': 'mp4',
239 '139': 'mp4',
16f36a6f
RB
240 '140': 'm4a',
241 '141': 'm4a',
242 '160': 'm4a',
836a086c
AZ
243
244 # Dash webm
245 '171': 'webm',
246 '172': 'webm',
247 '242': 'webm',
248 '243': 'webm',
249 '244': 'webm',
250 '245': 'webm',
251 '246': 'webm',
252 '247': 'webm',
253 '248': 'webm',
c5e8d7af
PH
254 }
255 _video_dimensions = {
256 '5': '240x400',
257 '6': '???',
258 '13': '???',
259 '17': '144x176',
260 '18': '360x640',
261 '22': '720x1280',
262 '34': '360x640',
263 '35': '480x854',
bdc6b3fc 264 '36': '240x320',
c5e8d7af
PH
265 '37': '1080x1920',
266 '38': '3072x4096',
267 '43': '360x640',
268 '44': '480x854',
269 '45': '720x1280',
270 '46': '1080x1920',
86fe61c8
AZ
271 '82': '360p',
272 '83': '480p',
273 '84': '720p',
274 '85': '1080p',
1d043b93
JMF
275 '92': '240p',
276 '93': '360p',
277 '94': '480p',
278 '95': '720p',
279 '96': '1080p',
86fe61c8
AZ
280 '100': '360p',
281 '101': '480p',
836a086c 282 '102': '720p',
1d043b93
JMF
283 '132': '240p',
284 '151': '72p',
836a086c
AZ
285 '133': '240p',
286 '134': '360p',
287 '135': '480p',
288 '136': '720p',
289 '137': '1080p',
290 '138': '>1080p',
291 '139': '48k',
292 '140': '128k',
293 '141': '256k',
294 '160': '192p',
295 '171': '128k',
296 '172': '256k',
297 '242': '240p',
298 '243': '360p',
299 '244': '480p',
300 '245': '480p',
301 '246': '480p',
302 '247': '720p',
303 '248': '1080p',
c5e8d7af 304 }
836a086c
AZ
305 _special_itags = {
306 '82': '3D',
307 '83': '3D',
308 '84': '3D',
309 '85': '3D',
310 '100': '3D',
311 '101': '3D',
312 '102': '3D',
313 '133': 'DASH Video',
314 '134': 'DASH Video',
315 '135': 'DASH Video',
316 '136': 'DASH Video',
317 '137': 'DASH Video',
318 '138': 'DASH Video',
319 '139': 'DASH Audio',
320 '140': 'DASH Audio',
321 '141': 'DASH Audio',
322 '160': 'DASH Video',
323 '171': 'DASH Audio',
324 '172': 'DASH Audio',
325 '242': 'DASH Video',
326 '243': 'DASH Video',
327 '244': 'DASH Video',
328 '245': 'DASH Video',
329 '246': 'DASH Video',
330 '247': 'DASH Video',
331 '248': 'DASH Video',
c5e8d7af 332 }
836a086c 333
c5e8d7af 334 IE_NAME = u'youtube'
2eb88d95
PH
335 _TESTS = [
336 {
0e853ca4
PH
337 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
338 u"file": u"BaW_jenozKc.mp4",
339 u"info_dict": {
340 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
341 u"uploader": u"Philipp Hagemeister",
342 u"uploader_id": u"phihag",
343 u"upload_date": u"20121002",
344 u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
2eb88d95 345 }
0e853ca4
PH
346 },
347 {
348 u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
349 u"file": u"1ltcDfZMA3U.flv",
350 u"note": u"Test VEVO video (#897)",
351 u"info_dict": {
352 u"upload_date": u"20070518",
353 u"title": u"Maps - It Will Find You",
354 u"description": u"Music video by Maps performing It Will Find You.",
355 u"uploader": u"MuteUSA",
356 u"uploader_id": u"MuteUSA"
2eb88d95 357 }
0e853ca4
PH
358 },
359 {
360 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
361 u"file": u"UxxajLWwzqY.mp4",
362 u"note": u"Test generic use_cipher_signature video (#897)",
363 u"info_dict": {
364 u"upload_date": u"20120506",
365 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
2dc59299 366 u"description": u"md5:5b292926389560516e384ac437c0ec07",
45ed795c 367 u"uploader": u"Icona Pop",
0e853ca4 368 u"uploader_id": u"IconaPop"
2eb88d95 369 }
c108eb73
JMF
370 },
371 {
372 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
373 u"file": u"07FYdnEawAQ.mp4",
374 u"note": u"Test VEVO video with age protection (#956)",
375 u"info_dict": {
376 u"upload_date": u"20130703",
377 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
378 u"description": u"md5:64249768eec3bc4276236606ea996373",
379 u"uploader": u"justintimberlakeVEVO",
380 u"uploader_id": u"justintimberlakeVEVO"
381 }
382 },
2eb88d95
PH
383 ]
384
c5e8d7af
PH
385
386 @classmethod
387 def suitable(cls, url):
388 """Receives a URL and returns True if suitable for this IE."""
e3ea4790 389 if YoutubePlaylistIE.suitable(url): return False
c5e8d7af
PH
390 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
391
e0df6211
PH
    def __init__(self, *args, **kwargs):
        super(YoutubeIE, self).__init__(*args, **kwargs)
        # In-memory cache: (player_url, signature_length) -> decipher function,
        # populated lazily by _decrypt_signature.
        self._player_cache = {}
e0df6211 395
c5e8d7af
PH
    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)
399
    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)
403
c5e8d7af
PH
    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)
407
    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available for this video."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))
411
    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')
415
c4417ddb
PH
    def _extract_signature_function(self, video_id, player_url, slen):
        """Build (or load from cache) the signature-decipher function.

        The player URL determines whether the cipher lives in a JS or SWF
        player; slen is the length of the scrambled signature (the cipher
        differs per length).  Returns a callable mapping the scrambled
        signature string to the deciphered one.
        """
        id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
                        player_url)
        player_type = id_m.group('ext')
        player_id = id_m.group('id')

        # Read from filesystem cache
        func_id = '%s_%s_%d' % (player_type, player_id, slen)
        # func_id becomes part of a filename; make sure it cannot escape
        # the cache directory.
        assert os.path.basename(func_id) == func_id
        cache_dir = get_cachedir(self._downloader.params)

        cache_enabled = cache_dir is not None
        if cache_enabled:
            cache_fn = os.path.join(os.path.expanduser(cache_dir),
                                    u'youtube-sigfuncs',
                                    func_id + '.json')
            try:
                with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                    cache_spec = json.load(cachef)
                # Cached form is just the output character permutation.
                return lambda s: u''.join(s[i] for i in cache_spec)
            except IOError:
                pass  # No cache available

        # Cache miss: download the player and extract the cipher from it.
        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        if cache_enabled:
            # Persist the permutation by running the function on a probe
            # string of distinct characters; best-effort, failures only warn.
            try:
                test_string = u''.join(map(compat_chr, range(slen)))
                cache_res = res(test_string)
                cache_spec = [ord(c) for c in cache_res]
                try:
                    os.makedirs(os.path.dirname(cache_fn))
                except OSError as ose:
                    if ose.errno != errno.EEXIST:
                        raise
                write_json_file(cache_spec, cache_fn)
            except Exception:
                tb = traceback.format_exc()
                self._downloader.report_warning(
                    u'Writing cache to %r failed: %s' % (cache_fn, tb))

        return res
472
edf3e38e
PH
    def _print_sig_code(self, func, slen):
        """Print Python source equivalent to the extracted decipher function.

        Runs func on a probe string of slen distinct characters to recover
        its output permutation, then compresses runs of consecutive indices
        into slice expressions for readability.
        """
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                # Render s[start:end+step:step] with the parts elided when
                # they match Python's defaults.
                starts = u'' if start == 0 else str(start)
                ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
                steps = u'' if step == 1 else (u':%d' % step)
                return u's[%s%s%s]' % (starts, ends, steps)

            step = None
            start = '(Never used)'  # Quelch pyflakes warnings - start will be
                                    # set as soon as step is set
            # Walk adjacent index pairs, emitting a slice whenever a run of
            # step +1/-1 ends, and a single s[i] for isolated indices.
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield u's[%d]' % prev
            # Flush the final element / open run.
            if step is None:
                yield u's[%d]' % i
            else:
                yield _genslice(start, i, step)

        test_string = u''.join(map(compat_chr, range(slen)))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = u' + '.join(gen_sig_code(cache_spec))
        code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
        self.to_screen(u'Extracted signature function:\n' + code)
edf3e38e 508
e0df6211
PH
509 def _parse_sig_js(self, jscode):
510 funcname = self._search_regex(
511 r'signature=([a-zA-Z]+)', jscode,
512 u'Initial JS player signature function name')
513
514 functions = {}
515
516 def argidx(varname):
517 return string.lowercase.index(varname)
518
519 def interpret_statement(stmt, local_vars, allow_recursion=20):
520 if allow_recursion < 0:
0ca96d48 521 raise ExtractorError(u'Recursion limit reached')
e0df6211
PH
522
523 if stmt.startswith(u'var '):
524 stmt = stmt[len(u'var '):]
525 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
526 r'=(?P<expr>.*)$', stmt)
527 if ass_m:
528 if ass_m.groupdict().get('index'):
529 def assign(val):
530 lvar = local_vars[ass_m.group('out')]
531 idx = interpret_expression(ass_m.group('index'),
532 local_vars, allow_recursion)
533 assert isinstance(idx, int)
534 lvar[idx] = val
535 return val
536 expr = ass_m.group('expr')
537 else:
538 def assign(val):
539 local_vars[ass_m.group('out')] = val
540 return val
541 expr = ass_m.group('expr')
542 elif stmt.startswith(u'return '):
543 assign = lambda v: v
544 expr = stmt[len(u'return '):]
545 else:
546 raise ExtractorError(
547 u'Cannot determine left side of statement in %r' % stmt)
548
549 v = interpret_expression(expr, local_vars, allow_recursion)
550 return assign(v)
551
552 def interpret_expression(expr, local_vars, allow_recursion):
553 if expr.isdigit():
554 return int(expr)
555
556 if expr.isalpha():
557 return local_vars[expr]
558
559 m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
560 if m:
561 member = m.group('member')
562 val = local_vars[m.group('in')]
563 if member == 'split("")':
564 return list(val)
565 if member == 'join("")':
566 return u''.join(val)
567 if member == 'length':
568 return len(val)
569 if member == 'reverse()':
570 return val[::-1]
571 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
572 if slice_m:
573 idx = interpret_expression(
574 slice_m.group('idx'), local_vars, allow_recursion-1)
575 return val[idx:]
576
577 m = re.match(
578 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
579 if m:
580 val = local_vars[m.group('in')]
581 idx = interpret_expression(m.group('idx'), local_vars,
582 allow_recursion-1)
583 return val[idx]
584
585 m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
586 if m:
587 a = interpret_expression(m.group('a'),
588 local_vars, allow_recursion)
589 b = interpret_expression(m.group('b'),
590 local_vars, allow_recursion)
591 return a % b
592
593 m = re.match(
594 r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
595 if m:
596 fname = m.group('func')
597 if fname not in functions:
598 functions[fname] = extract_function(fname)
599 argvals = [int(v) if v.isdigit() else local_vars[v]
600 for v in m.group('args').split(',')]
601 return functions[fname](argvals)
602 raise ExtractorError(u'Unsupported JS expression %r' % expr)
603
604 def extract_function(funcname):
605 func_m = re.search(
606 r'function ' + re.escape(funcname) +
607 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
608 jscode)
609 argnames = func_m.group('args').split(',')
610
611 def resf(args):
612 local_vars = dict(zip(argnames, args))
613 for stmt in func_m.group('code').split(';'):
614 res = interpret_statement(stmt, local_vars)
615 return res
616 return resf
617
618 initial_function = extract_function(funcname)
619 return lambda s: initial_function([s])
620
    def _parse_sig_swf(self, file_contents):
        """Extract the signature-decipher function from the Flash player SWF.

        Decompresses the SWF, walks its tag stream to the DoABC tag (code 82),
        parses enough of the ABC (AVM2 bytecode) constant pool, class, and
        method-body structures to locate the ``SignatureDecipher`` class, then
        interprets its ``decipher`` method with a minimal AVM2 stack-machine
        emulator.  Returns a callable mapping the scrambled signature string
        to the deciphered one.
        """
        if file_contents[1:3] != b'WS':
            raise ExtractorError(
                u'Not an SWF file; header is %r' % file_contents[:3])
        if file_contents[:1] == b'C':
            # 'CWS' header: body is zlib-compressed after the 8-byte header
            content = zlib.decompress(file_contents[8:])
        else:
            raise NotImplementedError(u'Unsupported compression format %r' %
                                      file_contents[:1])

        def extract_tags(content):
            # Yield (tag_code, tag_body) for each SWF tag; a 6-bit length of
            # 0x3f signals an extended 32-bit length field.
            pos = 0
            while pos < len(content):
                header16 = struct.unpack('<H', content[pos:pos+2])[0]
                pos += 2
                tag_code = header16 >> 6
                tag_len = header16 & 0x3f
                if tag_len == 0x3f:
                    tag_len = struct.unpack('<I', content[pos:pos+4])[0]
                    pos += 4
                assert pos+tag_len <= len(content)
                yield (tag_code, content[pos:pos+tag_len])
                pos += tag_len

        code_tag = next(tag
                        for tag_code, tag in extract_tags(content)
                        if tag_code == 82)
        # Skip the DoABC flags (4 bytes) and the NUL-terminated name
        p = code_tag.index(b'\0', 4) + 1
        code_reader = io.BytesIO(code_tag[p:])

        # Parse ABC (AVM2 ByteCode)
        def read_int(reader=None):
            # Variable-length u30/u32: little-endian base-128, max 5 bytes
            if reader is None:
                reader = code_reader
            res = 0
            shift = 0
            for _ in range(5):
                buf = reader.read(1)
                assert len(buf) == 1
                b = struct.unpack('<B', buf)[0]
                res = res | ((b & 0x7f) << shift)
                if b & 0x80 == 0:
                    break
                shift += 7
            return res

        def u30(reader=None):
            res = read_int(reader)
            assert res & 0xf0000000 == 0
            return res
        u32 = read_int

        def s32(reader=None):
            # Signed variant: reinterpret the 32-bit two's complement value
            v = read_int(reader)
            if v & 0x80000000 != 0:
                v = - ((v ^ 0xffffffff) + 1)
            return v

        def read_string(reader=None):
            if reader is None:
                reader = code_reader
            slen = u30(reader)
            resb = reader.read(slen)
            assert len(resb) == slen
            return resb.decode('utf-8')

        def read_bytes(count, reader=None):
            if reader is None:
                reader = code_reader
            resb = reader.read(count)
            assert len(resb) == count
            return resb

        def read_byte(reader=None):
            resb = read_bytes(1, reader=reader)
            res = struct.unpack('<B', resb)[0]
            return res

        # minor_version + major_version
        read_bytes(2 + 2)

        # Constant pool -- ABC counts are "count", entries indexed from 1,
        # so each loop below runs count-1 times.
        int_count = u30()
        for _c in range(1, int_count):
            s32()
        uint_count = u30()
        for _c in range(1, uint_count):
            u32()
        double_count = u30()
        read_bytes((double_count-1) * 8)
        string_count = u30()
        constant_strings = [u'']
        for _c in range(1, string_count):
            s = read_string()
            constant_strings.append(s)
        namespace_count = u30()
        for _c in range(1, namespace_count):
            read_bytes(1)  # kind
            u30()  # name
        ns_set_count = u30()
        for _c in range(1, ns_set_count):
            count = u30()
            for _c2 in range(count):
                u30()
        multiname_count = u30()
        # Number of trailing u30 fields to skip per multiname kind
        MULTINAME_SIZES = {
            0x07: 2,  # QName
            0x0d: 2,  # QNameA
            0x0f: 1,  # RTQName
            0x10: 1,  # RTQNameA
            0x11: 0,  # RTQNameL
            0x12: 0,  # RTQNameLA
            0x09: 2,  # Multiname
            0x0e: 2,  # MultinameA
            0x1b: 1,  # MultinameL
            0x1c: 1,  # MultinameLA
        }
        multinames = [u'']
        for _c in range(1, multiname_count):
            kind = u30()
            assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
            if kind == 0x07:
                # Only QNames resolve to real names; everything else just
                # gets a placeholder entry to keep the indices aligned.
                u30()  # namespace_idx
                name_idx = u30()
                multinames.append(constant_strings[name_idx])
            else:
                multinames.append('[MULTINAME kind: %d]' % kind)
                for _c2 in range(MULTINAME_SIZES[kind]):
                    u30()

        # Methods
        method_count = u30()
        MethodInfo = collections.namedtuple(
            'MethodInfo',
            ['NEED_ARGUMENTS', 'NEED_REST'])
        method_infos = []
        for method_id in range(method_count):
            param_count = u30()
            u30()  # return type
            for _ in range(param_count):
                u30()  # param type
            u30()  # name index (always 0 for youtube)
            flags = read_byte()
            if flags & 0x08 != 0:
                # Options present
                option_count = u30()
                for c in range(option_count):
                    u30()  # val
                    read_bytes(1)  # kind
            if flags & 0x80 != 0:
                # Param names present
                for _ in range(param_count):
                    u30()  # param name
            mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
            method_infos.append(mi)

        # Metadata
        metadata_count = u30()
        for _c in range(metadata_count):
            u30()  # name
            item_count = u30()
            for _c2 in range(item_count):
                u30()  # key
                u30()  # value

        def parse_traits_info():
            # Parse one trait entry, returning any method-name mappings found
            trait_name_idx = u30()
            kind_full = read_byte()
            kind = kind_full & 0x0f
            attrs = kind_full >> 4
            methods = {}
            if kind in [0x00, 0x06]:  # Slot or Const
                u30()  # Slot id
                u30()  # type_name_idx
                vindex = u30()
                if vindex != 0:
                    read_byte()  # vkind
            elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
                u30()  # disp_id
                method_idx = u30()
                methods[multinames[trait_name_idx]] = method_idx
            elif kind == 0x04:  # Class
                u30()  # slot_id
                u30()  # classi
            elif kind == 0x05:  # Function
                u30()  # slot_id
                function_idx = u30()
                methods[function_idx] = multinames[trait_name_idx]
            else:
                raise ExtractorError(u'Unsupported trait kind %d' % kind)

            if attrs & 0x4 != 0:  # Metadata present
                metadata_count = u30()
                for _c3 in range(metadata_count):
                    u30()  # metadata index

            return methods

        # Classes
        TARGET_CLASSNAME = u'SignatureDecipher'
        searched_idx = multinames.index(TARGET_CLASSNAME)
        searched_class_id = None
        class_count = u30()
        for class_id in range(class_count):
            name_idx = u30()
            if name_idx == searched_idx:
                # We found the class we're looking for!
                searched_class_id = class_id
            u30()  # super_name idx
            flags = read_byte()
            if flags & 0x08 != 0:  # Protected namespace is present
                u30()  # protected_ns_idx
            intrf_count = u30()
            for _c2 in range(intrf_count):
                u30()
            u30()  # iinit
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        if searched_class_id is None:
            raise ExtractorError(u'Target class %r not found' %
                                 TARGET_CLASSNAME)

        # Second class table: collect the target class's method name/index maps
        method_names = {}
        method_idxs = {}
        for class_id in range(class_count):
            u30()  # cinit
            trait_count = u30()
            for _c2 in range(trait_count):
                trait_methods = parse_traits_info()
                if class_id == searched_class_id:
                    method_names.update(trait_methods.items())
                    method_idxs.update(dict(
                        (idx, name)
                        for name, idx in trait_methods.items()))

        # Scripts
        script_count = u30()
        for _c in range(script_count):
            u30()  # init
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        # Method bodies
        method_body_count = u30()
        Method = collections.namedtuple('Method', ['code', 'local_count'])
        methods = {}
        for _c in range(method_body_count):
            method_idx = u30()
            u30()  # max_stack
            local_count = u30()
            u30()  # init_scope_depth
            u30()  # max_scope_depth
            code_length = u30()
            code = read_bytes(code_length)
            if method_idx in method_idxs:
                # Keep only the bodies belonging to the target class
                m = Method(code, local_count)
                methods[method_idxs[method_idx]] = m
            exception_count = u30()
            for _c2 in range(exception_count):
                u30()  # from
                u30()  # to
                u30()  # target
                u30()  # exc_type
                u30()  # var_name
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        assert p + code_reader.tell() == len(code_tag)
        assert len(methods) == len(method_idxs)

        method_pyfunctions = {}

        def extract_function(func_name):
            # Wrap the named method's bytecode in a Python callable that
            # emulates the subset of AVM2 opcodes YouTube's cipher uses.
            if func_name in method_pyfunctions:
                return method_pyfunctions[func_name]
            if func_name not in methods:
                raise ExtractorError(u'Cannot find function %r' % func_name)
            m = methods[func_name]

            def resfunc(args):
                registers = ['(this)'] + list(args) + [None] * m.local_count
                stack = []
                coder = io.BytesIO(m.code)
                while True:
                    opcode = struct.unpack('!B', coder.read(1))[0]
                    if opcode == 36:  # pushbyte
                        v = struct.unpack('!B', coder.read(1))[0]
                        stack.append(v)
                    elif opcode == 44:  # pushstring
                        idx = u30(coder)
                        stack.append(constant_strings[idx])
                    elif opcode == 48:  # pushscope
                        # We don't implement the scope register, so we'll just
                        # ignore the popped value
                        stack.pop()
                    elif opcode == 70:  # callproperty
                        index = u30(coder)
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        obj = stack.pop()
                        if mname == u'split':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, compat_str)
                            if args[0] == u'':
                                res = list(obj)
                            else:
                                res = obj.split(args[0])
                            stack.append(res)
                        elif mname == u'slice':
                            assert len(args) == 1
                            assert isinstance(args[0], int)
                            assert isinstance(obj, list)
                            res = obj[args[0]:]
                            stack.append(res)
                        elif mname == u'join':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, list)
                            res = args[0].join(obj)
                            stack.append(res)
                        elif mname in method_pyfunctions:
                            stack.append(method_pyfunctions[mname](args))
                        else:
                            raise NotImplementedError(
                                u'Unsupported property %r on %r'
                                % (mname, obj))
                    elif opcode == 72:  # returnvalue
                        res = stack.pop()
                        return res
                    elif opcode == 79:  # callpropvoid
                        index = u30(coder)
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        obj = stack.pop()
                        if mname == u'reverse':
                            assert isinstance(obj, list)
                            obj.reverse()
                        else:
                            raise NotImplementedError(
                                u'Unsupported (void) property %r on %r'
                                % (mname, obj))
                    elif opcode == 93:  # findpropstrict
                        index = u30(coder)
                        mname = multinames[index]
                        res = extract_function(mname)
                        stack.append(res)
                    elif opcode == 97:  # setproperty
                        index = u30(coder)
                        value = stack.pop()
                        idx = stack.pop()
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        assert isinstance(idx, int)
                        obj[idx] = value
                    elif opcode == 98:  # getlocal
                        index = u30(coder)
                        stack.append(registers[index])
                    elif opcode == 99:  # setlocal
                        index = u30(coder)
                        value = stack.pop()
                        registers[index] = value
                    elif opcode == 102:  # getproperty
                        index = u30(coder)
                        pname = multinames[index]
                        if pname == u'length':
                            obj = stack.pop()
                            assert isinstance(obj, list)
                            stack.append(len(obj))
                        else:  # Assume attribute access
                            idx = stack.pop()
                            assert isinstance(idx, int)
                            obj = stack.pop()
                            assert isinstance(obj, list)
                            stack.append(obj[idx])
                    elif opcode == 128:  # coerce
                        u30(coder)
                    elif opcode == 133:  # coerce_s
                        assert isinstance(stack[-1], (type(None), compat_str))
                    elif opcode == 164:  # modulo
                        value2 = stack.pop()
                        value1 = stack.pop()
                        res = value1 % value2
                        stack.append(res)
                    elif opcode == 208:  # getlocal_0
                        stack.append(registers[0])
                    elif opcode == 209:  # getlocal_1
                        stack.append(registers[1])
                    elif opcode == 210:  # getlocal_2
                        stack.append(registers[2])
                    elif opcode == 211:  # getlocal_3
                        stack.append(registers[3])
                    elif opcode == 214:  # setlocal_2
                        registers[2] = stack.pop()
                    elif opcode == 215:  # setlocal_3
                        registers[3] = stack.pop()
                    else:
                        raise NotImplementedError(
                            u'Unsupported opcode %d' % opcode)

            method_pyfunctions[func_name] = resfunc
            return resfunc

        initial_function = extract_function(u'decipher')
        return lambda s: initial_function([s])
1034
    def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
        """Turn the encrypted s field into a working signature.

        Tries a player-specific deciphering function first (extracted from
        the JS/SWF player and cached), and falls back to the hard-coded
        static algorithm table on any failure.
        """

        if player_url is not None:
            try:
                # Cache key is (player URL, signature length): the same player
                # deciphers every signature of a given length the same way.
                player_id = (player_url, len(s))
                if player_id not in self._player_cache:
                    func = self._extract_signature_function(
                        video_id, player_url, len(s)
                    )
                    self._player_cache[player_id] = func
                func = self._player_cache[player_id]
                if self._downloader.params.get('youtube_print_sig_code'):
                    self._print_sig_code(func, len(s))
                return func(s)
            except Exception:
                # Deliberately broad: any error in dynamic extraction must
                # not abort the download — warn and use the static fallback.
                tb = traceback.format_exc()
                self._downloader.report_warning(
                    u'Automatic signature extraction failed: ' + tb)

        self._downloader.report_warning(
            u'Warning: Falling back to static signature algorithm')

        return self._static_decrypt_signature(
            s, video_id, player_url, age_gate)
2f2ffea9 1061 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
e0df6211
PH
1062 if age_gate:
1063 # The videos with age protection use another player, so the
1064 # algorithms can be different.
1065 if len(s) == 86:
1066 return s[2:63] + s[82] + s[64:82] + s[63]
1067
bc4b9008 1068 if len(s) == 93:
1069 return s[86:29:-1] + s[88] + s[28:5:-1]
1070 elif len(s) == 92:
444b1165 1071 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
38d025b3
JMF
1072 elif len(s) == 91:
1073 return s[84:27:-1] + s[86] + s[26:5:-1]
444b1165
JMF
1074 elif len(s) == 90:
1075 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
8a9d86a2 1076 elif len(s) == 89:
1077 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
444b1165 1078 elif len(s) == 88:
3e223834 1079 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
be547e1d 1080 elif len(s) == 87:
3a725669 1081 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
be547e1d 1082 elif len(s) == 86:
f2c327fd 1083 return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
be547e1d 1084 elif len(s) == 85:
6ae8ee3f 1085 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
be547e1d 1086 elif len(s) == 84:
6f56389b 1087 return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
be547e1d 1088 elif len(s) == 83:
920de7a2 1089 return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
be547e1d 1090 elif len(s) == 82:
c21315f2 1091 return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
be547e1d 1092 elif len(s) == 81:
aedd6bb9 1093 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
066090dd
JMF
1094 elif len(s) == 80:
1095 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
5c468ca8
JMF
1096 elif len(s) == 79:
1097 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
be547e1d
PH
1098
1099 else:
1100 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
c5e8d7af 1101
de7f3446 1102 def _get_available_subtitles(self, video_id):
de7f3446 1103 try:
7fad1c63
JMF
1104 sub_list = self._download_webpage(
1105 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1106 video_id, note=False)
1107 except ExtractorError as err:
de7f3446
JMF
1108 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
1109 return {}
1110 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
1111
1112 sub_lang_list = {}
1113 for l in lang_list:
1114 lang = l[1]
1115 params = compat_urllib_parse.urlencode({
1116 'lang': lang,
1117 'v': video_id,
1118 'fmt': self._downloader.params.get('subtitlesformat'),
a34c2faa 1119 'name': l[0],
de7f3446
JMF
1120 })
1121 url = u'http://www.youtube.com/api/timedtext?' + params
1122 sub_lang_list[lang] = url
1123 if not sub_lang_list:
1124 self._downloader.report_warning(u'video doesn\'t have subtitles')
1125 return {}
1126 return sub_lang_list
1127
    def _get_available_automatic_caption(self, video_id, webpage):
        """Return a dict mapping language code -> automatic-caption URL.

        We need the webpage for getting the captions url, pass it as an
        argument to speed up the process.
        Returns an empty dict (after a warning) on any failure.
        """
        sub_format = self._downloader.params.get('subtitlesformat')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        # The caption base URL and timestamp live in the player config JSON.
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for %s' % video_id
        if mobj is None:
            self._downloader.report_warning(err_msg)
            return {}
        player_config = json.loads(mobj.group(1))
        try:
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            list_page = self._download_webpage(list_url, video_id)
            caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
            original_lang_node = caption_list.find('track')
            # NOTE(review): `not original_lang_node` relies on Element
            # truthiness (number of children), not on None-ness — confirm
            # this matches the intended "no <track> element" check.
            if not original_lang_node or original_lang_node.attrib.get('kind') != 'asr':
                self._downloader.report_warning(u'Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']

            sub_lang_list = {}
            # Each <target> is a language the ASR track can be translated to.
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                    'tlang': sub_lang,
                    'fmt': sub_format,
                    'ts': timestamp,
                    'kind': 'asr',
                })
                sub_lang_list[sub_lang] = caption_url + '&' + params
            return sub_lang_list
        # An extractor error can be raise by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
1175
c5e8d7af
PH
1176 def _print_formats(self, formats):
1177 print('Available formats:')
1178 for x in formats:
03cc7c20
JMF
1179 print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
1180 self._video_dimensions.get(x, '???'),
836a086c 1181 ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
c5e8d7af
PH
1182
1183 def _extract_id(self, url):
1184 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1185 if mobj is None:
1186 raise ExtractorError(u'Invalid URL: %s' % url)
1187 video_id = mobj.group(2)
1188 return video_id
1189
1d043b93
JMF
    def _get_video_url_list(self, url_map):
        """
        Transform a dictionary in the format {itag:url} to a list of (itag, url)
        with the requested formats.

        Returns None (after printing the table) when --list-formats is set;
        raises ExtractorError when nothing in url_map matches the request.
        """
        req_format = self._downloader.params.get('format', None)
        format_limit = self._downloader.params.get('format_limit', None)
        # Preference-ordered itag table; a different ordering is used when
        # free formats are preferred.
        available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
        if format_limit is not None and format_limit in available_formats:
            # Drop everything better than the requested cap.
            format_list = available_formats[available_formats.index(format_limit):]
        else:
            format_list = available_formats
        # Itags actually offered for this video, best first.
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            raise ExtractorError(u'no known formats available for video')
        if self._downloader.params.get('listformats', None):
            self._print_formats(existing_formats)
            return  # falsy result tells the caller there is nothing to download
        if req_format is None or req_format == 'best':
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
        elif req_format in ('-1', 'all'):
            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
        else:
            # Specific formats. We pick the first in a slash-delimeted sequence.
            # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
            # available in the specified format. For example,
            # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
            # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
            # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
            req_formats = req_format.split('/')
            video_url_list = None
            for rf in req_formats:
                if rf in url_map:
                    video_url_list = [(rf, url_map[rf])]
                    break
                if rf in self._video_formats_map:
                    # rf names a container (e.g. 'mp4'): try its itags in
                    # quality order.
                    for srf in self._video_formats_map[rf]:
                        if srf in url_map:
                            video_url_list = [(srf, url_map[srf])]
                            break
                    else:
                        # No itag of this container available: try next rf.
                        continue
                    # Inner loop broke out (found one): stop searching.
                    break
            if video_url_list is None:
                raise ExtractorError(u'requested format not available')
        return video_url_list
1238
1239 def _extract_from_m3u8(self, manifest_url, video_id):
1240 url_map = {}
1241 def _get_urls(_manifest):
1242 lines = _manifest.split('\n')
1243 urls = filter(lambda l: l and not l.startswith('#'),
1244 lines)
1245 return urls
1246 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1247 formats_urls = _get_urls(manifest)
1248 for format_url in formats_urls:
890f62e8 1249 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1d043b93
JMF
1250 url_map[itag] = format_url
1251 return url_map
1252
1fb07d10
JG
1253 def _extract_annotations(self, video_id):
1254 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1255 return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
1256
c5e8d7af
PH
    def _real_extract(self, url):
        """Extract the video metadata and download URLs for a watch URL.

        Returns a list of info dicts, one per selected format.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Unescape the JS-escaped URL (\/ -> /).
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        self.report_video_info_webpage_download(video_id)
        if re.search(r'player-age-gate-content">', video_webpage) is not None:
            self.report_age_confirmation()
            age_gate = True
            # We simulate the access to the video from www.youtube.com/v/{video_id}
            # this can be viewed without login into Youtube
            data = compat_urllib_parse.urlencode({'video_id': video_id,
                                                  'el': 'embedded',
                                                  'gl': 'US',
                                                  'hl': 'en',
                                                  'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                                                  'asv': 3,
                                                  'sts':'1588',
                                                  })
            video_info_url = 'https://www.youtube.com/get_video_info?' + data
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
        else:
            age_gate = False
            # Try several 'el' values until one returns a token.
            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                        % (video_id, el_type))
                video_info_webpage = self._download_webpage(video_info_url, video_id,
                                        note=False,
                                        errnote='unable to download video info webpage')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
            else:
                raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' in video_info:
            video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
        else:
            self._downloader.report_warning(u'Unable to extract video title')
            video_title = u'_'

        # thumbnail image
        # We try first to get a high quality image:
        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                            video_webpage, re.DOTALL)
        if m_thumb is not None:
            video_thumbnail = m_thumb.group(1)
        elif 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = None
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalize separators to spaces before parsing the date.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            # Fall back to the meta description tag.
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, video_webpage)

        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, video_webpage)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # annotations
        video_annotations = None
        if self._downloader.params.get('writeannotations', False):
            video_annotations = self._extract_annotations(video_id)

        # Decide which formats to download

        try:
            mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
            if not mobj:
                raise ValueError('Could not find vevo ID')
            info = json.loads(mobj.group(1))
            args = info['args']
            # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
            # this signatures are encrypted
            if 'url_encoded_fmt_stream_map' not in args:
                raise ValueError(u'No stream_map present')  # caught below
            m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
            if m_s is not None:
                self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
                # Prefer the webpage's stream map over get_video_info's.
                video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
            m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
            if m_s is not None:
                if 'url_encoded_fmt_stream_map' in video_info:
                    video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
                else:
                    video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
            elif 'adaptive_fmts' in video_info:
                if 'url_encoded_fmt_stream_map' in video_info:
                    video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
                else:
                    video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']
        except ValueError:
            # Missing/unparsable player config: keep whatever video_info has.
            pass

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
                raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
            url_map = {}
            for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    url = url_data['url'][0]
                    if 'sig' in url_data:
                        # Signature already decrypted by the server.
                        url += '&signature=' + url_data['sig'][0]
                    elif 's' in url_data:
                        encrypted_sig = url_data['s'][0]
                        if self._downloader.params.get('verbose'):
                            # Describe which player would decipher this
                            # signature, for bug reports.
                            if age_gate:
                                if player_url is None:
                                    player_version = 'unknown'
                                else:
                                    player_version = self._search_regex(
                                        r'-(.+)\.swf$', player_url,
                                        u'flash player', fatal=False)
                                player_desc = 'flash player %s' % player_version
                            else:
                                player_version = self._search_regex(
                                    r'html5player-(.+?)\.js', video_webpage,
                                    'html5 player', fatal=False)
                                player_desc = u'html5 player %s' % player_version

                            parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                            self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                                (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                        if not age_gate:
                            # Use the HTML5 player referenced by the webpage.
                            jsplayer_url_json = self._search_regex(
                                r'"assets":.+?"js":\s*("[^"]+")',
                                video_webpage, u'JS player URL')
                            player_url = json.loads(jsplayer_url_json)

                        signature = self._decrypt_signature(
                            encrypted_sig, video_id, player_url, age_gate)
                        url += '&signature=' + signature
                    if 'ratebypass' not in url:
                        url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url
            video_url_list = self._get_video_url_list(url_map)
            if not video_url_list:
                return
        elif video_info.get('hlsvp'):
            # Live/HLS streams: formats come from the m3u8 manifest.
            manifest_url = video_info['hlsvp'][0]
            url_map = self._extract_from_m3u8(manifest_url, video_id)
            video_url_list = self._get_video_url_list(url_map)
            if not video_url_list:
                return

        else:
            raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
                                                 self._video_dimensions.get(format_param, '???'),
                                                 ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration,
                'age_limit':    18 if age_gate else 0,
                'annotations':  video_annotations
            })
        return results
1510
class YoutubePlaylistIE(InfoExtractor):
    """Extractor for YouTube playlists (PL/EC/UU/FL ids) via the GData API."""
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so it must be matched with it.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result('https://www.youtube.com/watch?v=' + video_id, 'Youtube')
            else:
                # Bug fix: playlist_id already carries its prefix (PL/EC/UU/FL),
                # so the message must not hard-code 'PL' in front of it.
                self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        # Download playlist videos from API
        videos = []

        for page_num in itertools.count(1):
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            if start_index >= 1000:
                # The GData API refuses start-index values of 1000 or more.
                self._downloader.report_warning(u'Max number of results reached')
                break
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            for entry in response['feed']['entry']:
                index = entry['yt$position']['$t']
                # Entries without media$group/yt$videoid (e.g. deleted
                # videos) are skipped.
                if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
                    videos.append((
                        index,
                        'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']
                    ))

        # Restore playlist order, then drop the position index.
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1588
1589
class YoutubeChannelIE(InfoExtractor):
    """Extractor for YouTube channel pages: collects every video id on the
    channel, following the ajax 'load more' pagination."""
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids referenced in *page*, first occurrence first."""
        found_ids = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = match.group(1)
            if video_id not in found_ids:
                found_ids.append(video_id)
        return found_ids

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        channel_id = mobj.group(1)

        # First page is plain HTML.
        pagenum = 1
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)
        video_ids = self.extract_videos_from_page(page)

        # Subsequent pages come from the JSON-based channel_ajax endpoint.
        if self._MORE_PAGES_INDICATOR in page:
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)
                page = json.loads(page)
                video_ids.extend(self.extract_videos_from_page(page['content_html']))
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        watch_urls = ['http://www.youtube.com/watch?v=%s' % vid for vid in video_ids]
        url_entries = [self.url_result(eurl, 'Youtube') for eurl in watch_urls]
        return [self.playlist_result(url_entries, channel_id)]
1644
1645
class YoutubeUserIE(InfoExtractor):
    """Extractor for a user's uploads, paged through the GData API."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    # Negative lookaheads keep this from matching attribution_link/watch
    # URLs and feed pages that belong to other extractors.
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted with other youtube
        # extractors: this regex is too permissive and would match their URLs
        # too. Checks every *IE class visible in this module's globals().
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies): return False
        else: return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []

        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            # Extract video identifiers
            ids_in_page = []
            for entry in response['feed']['entry']:
                # The id field is a URL whose last path component is the video id.
                ids_in_page.append(entry['id']['$t'].split('/')[-1])
            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
b05654f0
PH
1710
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor ('ytsearch' keyword) backed by the GData search API."""
    IE_DESC = u'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        video_ids = []
        limit = n
        pagenum = 0

        # Each API page yields at most 50 results; stop once we have enough
        # or the API reports fewer total items.
        while 50 * pagenum < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), 50 * pagenum + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            video_ids.extend(video['id'] for video in api_response['items'])
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # Trim any overshoot from the final page.
        del video_ids[n:]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % vid, 'Youtube') for vid in video_ids]
        return self.playlist_result(videos, query)
75dff0ee
JMF
1752
1753
class YoutubeShowIE(InfoExtractor):
    """Extractor for multi-season YouTube shows: one playlist per season."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # Each season of the show is exposed on the page as its own playlist link.
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_paths)))
        return [self.url_result('https://www.youtube.com' + path, 'YoutubePlaylist')
                for path in season_paths]
04cc9617
JMF
1767
1768
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    _PAGING_STEP = 30
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        # Personal feeds (e.g. "watch later") live behind a different ajax action.
        action = 'action_load_personal_feed' if self._PERSONAL_FEED else 'action_load_system_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        # Feeds are account-specific, so authenticate up front.
        self._login()

    def _real_extract(self, url):
        entries = []
        page_idx = 0
        while True:
            paging = page_idx * self._PAGING_STEP
            info = self._download_webpage(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % page_idx)
            info = json.loads(info)
            feed_html = info['feed_html']
            # Pull video ids out of the rendered feed HTML, de-duplicated
            # while keeping first-seen order.
            id_matches = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            ids = orderedSet(m.group(1) for m in id_matches)
            entries.extend(self.url_result(video_id, 'Youtube') for video_id in ids)
            if info['paging'] is None:
                break
            page_idx += 1
        return self.playlist_result(entries, playlist_title=self._PLAYLIST_TITLE)
1810
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    # System feed of the authenticated user's channel subscriptions;
    # also reachable via the ":ytsubs" / ":ytsubscriptions" pseudo-URLs.
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
1816
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # System feed of videos YouTube recommends to the logged-in account;
    # also reachable via the ":ytrec" / ":ytrecommended" pseudo-URLs.
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
c626a3d9 1822
43ba5456
JMF
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    # The "watch later" list is a personal feed, so it uses the
    # action_load_personal_feed ajax action and a larger paging step
    # than the system feeds (100 instead of the default 30).
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    _PAGING_STEP = 100
    _PERSONAL_FEED = True
c626a3d9
JMF
1830
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor for the logged-in user's favourite videos page."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds the id of its backing playlist;
        # delegate the actual extraction to the playlist extractor.
        page = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_list_id = self._search_regex(r'list=(.+?)["&]', page, u'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')
15870e90
PH
1841
1842
class YoutubeTruncatedURLIE(InfoExtractor):
    # Catch-all for watch URLs whose "&v=..." part was eaten by the shell
    # (an unquoted "&" terminates the argument), so the user gets a helpful
    # error instead of a confusing extraction failure.
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$'

    def _real_extract(self, url):
        # Always an error: there is no video id left to extract.
        raise ExtractorError(
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like youtube-dl '
            u'\'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\''
            u' (or simply youtube-dl BaW_jenozKc ).',
            expected=True)