# coding: utf-8

import collections
import errno
import io
import itertools
import json
import os.path
import re
import socket
import string
import struct
import traceback
import zlib

from .common import InfoExtractor, SearchInfoExtractor
from .subtitles import SubtitlesInfoExtractor
from ..utils import (
    compat_chr,
    compat_http_client,
    compat_parse_qs,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_request,
    compat_urlparse,
    compat_str,

    clean_html,
    get_cachedir,
    get_element_by_id,
    get_element_by_attribute,
    ExtractorError,
    unescapeHTML,
    unified_strdate,
    orderedSet,
    write_json_file,
)

class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def _set_language(self):
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return False
        return True

    def _login(self):
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return False
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return False
        return True

    def _confirm_age(self):
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
        return True

    def _real_initialize(self):
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()


class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)?                                    # http(s):// or protocol-independent URL (optional)
                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)?    # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/                                          # just youtu.be/xxxx
                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
                          # Apple HTTP Live Streaming
                          '96', '95', '94', '93', '92', '132', '151',
                          # 3D
                          '85', '84', '102', '83', '101', '82', '100',
                          # Dash video
                          '138', '137', '248', '136', '247', '135', '246',
                          '245', '244', '134', '243', '133', '242', '160',
                          # Dash audio
                          '141', '172', '140', '171', '139',
                          ]
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
                                      # Apple HTTP Live Streaming
                                      '96', '95', '94', '93', '92', '132', '151',
                                      # 3D
                                      '85', '102', '84', '101', '83', '100', '82',
                                      # Dash video
                                      '138', '248', '137', '247', '136', '246', '245',
                                      '244', '135', '243', '134', '242', '133', '160',
                                      # Dash audio
                                      '172', '141', '171', '140', '139',
                                      ]
    _video_formats_map = {
        'flv': ['35', '34', '6', '5'],
        '3gp': ['36', '17', '13'],
        'mp4': ['38', '37', '22', '18'],
        'webm': ['46', '45', '44', '43'],
    }
    _video_extensions = {
        '13': '3gp',
        '17': '3gp',
        '18': 'mp4',
        '22': 'mp4',
        '36': '3gp',
        '37': 'mp4',
        '38': 'mp4',
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',

        # 3d videos
        '82': 'mp4',
        '83': 'mp4',
        '84': 'mp4',
        '85': 'mp4',
        '100': 'webm',
        '101': 'webm',
        '102': 'webm',

        # Apple HTTP Live Streaming
        '92': 'mp4',
        '93': 'mp4',
        '94': 'mp4',
        '95': 'mp4',
        '96': 'mp4',
        '132': 'mp4',
        '151': 'mp4',

        # Dash mp4
        '133': 'mp4',
        '134': 'mp4',
        '135': 'mp4',
        '136': 'mp4',
        '137': 'mp4',
        '138': 'mp4',
        '160': 'mp4',

        # Dash mp4 audio
        '139': 'm4a',
        '140': 'm4a',
        '141': 'm4a',

        # Dash webm
        '171': 'webm',
        '172': 'webm',
        '242': 'webm',
        '243': 'webm',
        '244': 'webm',
        '245': 'webm',
        '246': 'webm',
        '247': 'webm',
        '248': 'webm',
    }
    _video_dimensions = {
        '5': '400x240',
        '6': '???',
        '13': '???',
        '17': '176x144',
        '18': '640x360',
        '22': '1280x720',
        '34': '640x360',
        '35': '854x480',
        '36': '320x240',
        '37': '1920x1080',
        '38': '4096x3072',
        '43': '640x360',
        '44': '854x480',
        '45': '1280x720',
        '46': '1920x1080',
        '82': '360p',
        '83': '480p',
        '84': '720p',
        '85': '1080p',
        '92': '240p',
        '93': '360p',
        '94': '480p',
        '95': '720p',
        '96': '1080p',
        '100': '360p',
        '101': '480p',
        '102': '720p',
        '132': '240p',
        '151': '72p',
        '133': '240p',
        '134': '360p',
        '135': '480p',
        '136': '720p',
        '137': '1080p',
        '138': '>1080p',
        '139': '48k',
        '140': '128k',
        '141': '256k',
        '160': '192p',
        '171': '128k',
        '172': '256k',
        '242': '240p',
        '243': '360p',
        '244': '480p',
        '245': '480p',
        '246': '480p',
        '247': '720p',
        '248': '1080p',
    }
    _special_itags = {
        '82': '3D',
        '83': '3D',
        '84': '3D',
        '85': '3D',
        '100': '3D',
        '101': '3D',
        '102': '3D',
        '133': 'DASH Video',
        '134': 'DASH Video',
        '135': 'DASH Video',
        '136': 'DASH Video',
        '137': 'DASH Video',
        '138': 'DASH Video',
        '139': 'DASH Audio',
        '140': 'DASH Audio',
        '141': 'DASH Audio',
        '160': 'DASH Video',
        '171': 'DASH Audio',
        '172': 'DASH Audio',
        '242': 'DASH Video',
        '243': 'DASH Video',
        '244': 'DASH Video',
        '245': 'DASH Video',
        '246': 'DASH Video',
        '247': 'DASH Video',
        '248': 'DASH Video',
    }

    IE_NAME = u'youtube'
    _TESTS = [
        {
            u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
            u"file": u"BaW_jenozKc.mp4",
            u"info_dict": {
                u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
                u"uploader": u"Philipp Hagemeister",
                u"uploader_id": u"phihag",
                u"upload_date": u"20121002",
                u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
            u"file": u"UxxajLWwzqY.mp4",
            u"note": u"Test generic use_cipher_signature video (#897)",
            u"info_dict": {
                u"upload_date": u"20120506",
                u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
                u"description": u"md5:5b292926389560516e384ac437c0ec07",
                u"uploader": u"Icona Pop",
                u"uploader_id": u"IconaPop"
            }
        },
        {
            u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
            u"file": u"07FYdnEawAQ.mp4",
            u"note": u"Test VEVO video with age protection (#956)",
            u"info_dict": {
                u"upload_date": u"20130703",
                u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
                u"description": u"md5:64249768eec3bc4276236606ea996373",
                u"uploader": u"justintimberlakeVEVO",
                u"uploader_id": u"justintimberlakeVEVO"
            }
        },
        {
            u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
            u"file": u"yZIXLfi8CZQ.mp4",
            u"note": u"Embed-only video (#1746)",
            u"info_dict": {
                u"upload_date": u"20120608",
                u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
                u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
                u"uploader": u"SET India",
                u"uploader_id": u"setindia"
            }
        },
    ]


    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url) is not None

    def __init__(self, *args, **kwargs):
        super(YoutubeIE, self).__init__(*args, **kwargs)
        self._player_cache = {}

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

    def _extract_signature_function(self, video_id, player_url, slen):
        id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
                        player_url)
        player_type = id_m.group('ext')
        player_id = id_m.group('id')

        # Read from filesystem cache
        func_id = '%s_%s_%d' % (player_type, player_id, slen)
        assert os.path.basename(func_id) == func_id
        cache_dir = get_cachedir(self._downloader.params)

        cache_enabled = cache_dir is not None
        if cache_enabled:
            cache_fn = os.path.join(os.path.expanduser(cache_dir),
                                    u'youtube-sigfuncs',
                                    func_id + '.json')
            try:
                with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                    cache_spec = json.load(cachef)
                    return lambda s: u''.join(s[i] for i in cache_spec)
            except IOError:
                pass  # No cache available

        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        if cache_enabled:
            try:
                test_string = u''.join(map(compat_chr, range(slen)))
                cache_res = res(test_string)
                cache_spec = [ord(c) for c in cache_res]
                try:
                    os.makedirs(os.path.dirname(cache_fn))
                except OSError as ose:
                    if ose.errno != errno.EEXIST:
                        raise
                write_json_file(cache_spec, cache_fn)
            except Exception:
                tb = traceback.format_exc()
                self._downloader.report_warning(
                    u'Writing cache to %r failed: %s' % (cache_fn, tb))

        return res

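    # Note on the cache format: what gets cached is not code but a plain list of
    # indices. The freshly extracted function is run once over a probe string
    # chr(0)..chr(slen-1); the ord() of every output character then records which
    # input position it came from, so the cached lambda above can replay the
    # transformation as a pure reordering (u''.join(s[i] for i in cache_spec)).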
    def _print_sig_code(self, func, slen):
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                starts = u'' if start == 0 else str(start)
                ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
                steps = u'' if step == 1 else (u':%d' % step)
                return u's[%s%s%s]' % (starts, ends, steps)

            step = None
            start = '(Never used)'  # Quelch pyflakes warnings - start will be
                                    # set as soon as step is set
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield u's[%d]' % prev
            if step is None:
                yield u's[%d]' % i
            else:
                yield _genslice(start, i, step)

        test_string = u''.join(map(compat_chr, range(slen)))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = u' + '.join(gen_sig_code(cache_spec))
        code = u'if len(s) == %d:\n    return %s\n' % (slen, expr_code)
        self.to_screen(u'Extracted signature function:\n' + code)

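    # When the youtube_print_sig_code option is set, the method above prints a
    # ready-made branch for _static_decrypt_signature, e.g. something like:
    #   if len(s) == 87:
    #       return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
    # (runs of consecutive indices are collapsed into slices by gen_sig_code).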
    def _parse_sig_js(self, jscode):
        funcname = self._search_regex(
            r'signature=([a-zA-Z]+)', jscode,
            u'Initial JS player signature function name')

        functions = {}

        def argidx(varname):
            return string.lowercase.index(varname)

        def interpret_statement(stmt, local_vars, allow_recursion=20):
            if allow_recursion < 0:
                raise ExtractorError(u'Recursion limit reached')

            if stmt.startswith(u'var '):
                stmt = stmt[len(u'var '):]
            ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
                             r'=(?P<expr>.*)$', stmt)
            if ass_m:
                if ass_m.groupdict().get('index'):
                    def assign(val):
                        lvar = local_vars[ass_m.group('out')]
                        idx = interpret_expression(ass_m.group('index'),
                                                   local_vars, allow_recursion)
                        assert isinstance(idx, int)
                        lvar[idx] = val
                        return val
                    expr = ass_m.group('expr')
                else:
                    def assign(val):
                        local_vars[ass_m.group('out')] = val
                        return val
                    expr = ass_m.group('expr')
            elif stmt.startswith(u'return '):
                assign = lambda v: v
                expr = stmt[len(u'return '):]
            else:
                raise ExtractorError(
                    u'Cannot determine left side of statement in %r' % stmt)

            v = interpret_expression(expr, local_vars, allow_recursion)
            return assign(v)

        def interpret_expression(expr, local_vars, allow_recursion):
            if expr.isdigit():
                return int(expr)

            if expr.isalpha():
                return local_vars[expr]

            m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
            if m:
                member = m.group('member')
                val = local_vars[m.group('in')]
                if member == 'split("")':
                    return list(val)
                if member == 'join("")':
                    return u''.join(val)
                if member == 'length':
                    return len(val)
                if member == 'reverse()':
                    return val[::-1]
                slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
                if slice_m:
                    idx = interpret_expression(
                        slice_m.group('idx'), local_vars, allow_recursion-1)
                    return val[idx:]

            m = re.match(
                r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
            if m:
                val = local_vars[m.group('in')]
                idx = interpret_expression(m.group('idx'), local_vars,
                                           allow_recursion-1)
                return val[idx]

            m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
            if m:
                a = interpret_expression(m.group('a'),
                                         local_vars, allow_recursion)
                b = interpret_expression(m.group('b'),
                                         local_vars, allow_recursion)
                return a % b

            m = re.match(
                r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
            if m:
                fname = m.group('func')
                if fname not in functions:
                    functions[fname] = extract_function(fname)
                argvals = [int(v) if v.isdigit() else local_vars[v]
                           for v in m.group('args').split(',')]
                return functions[fname](argvals)
            raise ExtractorError(u'Unsupported JS expression %r' % expr)

        def extract_function(funcname):
            func_m = re.search(
                r'function ' + re.escape(funcname) +
                r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
                jscode)
            argnames = func_m.group('args').split(',')

            def resf(args):
                local_vars = dict(zip(argnames, args))
                for stmt in func_m.group('code').split(';'):
                    res = interpret_statement(stmt, local_vars)
                return res
            return resf

        initial_function = extract_function(funcname)
        return lambda s: initial_function([s])

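    # _parse_sig_js is a deliberately tiny JS interpreter: it only understands
    # the few constructs the player's signature routine actually uses (split/join,
    # slice, reverse, length, indexing, the % operator and calls to sibling
    # helper functions), e.g. statements roughly of the form a=a.split("") or
    # a[0]=a[52%a.length]; anything else raises ExtractorError.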
    def _parse_sig_swf(self, file_contents):
        if file_contents[1:3] != b'WS':
            raise ExtractorError(
                u'Not an SWF file; header is %r' % file_contents[:3])
        if file_contents[:1] == b'C':
            content = zlib.decompress(file_contents[8:])
        else:
            raise NotImplementedError(u'Unsupported compression format %r' %
                                      file_contents[:1])

        def extract_tags(content):
            pos = 0
            while pos < len(content):
                header16 = struct.unpack('<H', content[pos:pos+2])[0]
                pos += 2
                tag_code = header16 >> 6
                tag_len = header16 & 0x3f
                if tag_len == 0x3f:
                    tag_len = struct.unpack('<I', content[pos:pos+4])[0]
                    pos += 4
                assert pos+tag_len <= len(content)
                yield (tag_code, content[pos:pos+tag_len])
                pos += tag_len

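        # Each SWF tag starts with a 16-bit RECORDHEADER: the upper 10 bits are
        # the tag code and the lower 6 bits the length; a length of 0x3f means
        # the real length follows as a 32-bit little-endian integer. Tag code 82
        # (DoABC) is the one carrying the AVM2 bytecode we are after.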
        code_tag = next(tag
                        for tag_code, tag in extract_tags(content)
                        if tag_code == 82)
        p = code_tag.index(b'\0', 4) + 1
        code_reader = io.BytesIO(code_tag[p:])

        # Parse ABC (AVM2 ByteCode)
        def read_int(reader=None):
            if reader is None:
                reader = code_reader
            res = 0
            shift = 0
            for _ in range(5):
                buf = reader.read(1)
                assert len(buf) == 1
                b = struct.unpack('<B', buf)[0]
                res = res | ((b & 0x7f) << shift)
                if b & 0x80 == 0:
                    break
                shift += 7
            return res

        def u30(reader=None):
            res = read_int(reader)
            assert res & 0xf0000000 == 0
            return res
        u32 = read_int

        def s32(reader=None):
            v = read_int(reader)
            if v & 0x80000000 != 0:
                v = - ((v ^ 0xffffffff) + 1)
            return v

        def read_string(reader=None):
            if reader is None:
                reader = code_reader
            slen = u30(reader)
            resb = reader.read(slen)
            assert len(resb) == slen
            return resb.decode('utf-8')

        def read_bytes(count, reader=None):
            if reader is None:
                reader = code_reader
            resb = reader.read(count)
            assert len(resb) == count
            return resb

        def read_byte(reader=None):
            resb = read_bytes(1, reader=reader)
            res = struct.unpack('<B', resb)[0]
            return res

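        # ABC integers are variable-length: each byte contributes 7 payload bits
        # (least significant group first) and the high bit flags that more bytes
        # follow, so e.g. the byte sequence 0x96 0x01 decodes to
        # 0x16 | (0x01 << 7) = 150.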
        # minor_version + major_version
        read_bytes(2 + 2)

        # Constant pool
        int_count = u30()
        for _c in range(1, int_count):
            s32()
        uint_count = u30()
        for _c in range(1, uint_count):
            u32()
        double_count = u30()
        read_bytes((double_count-1) * 8)
        string_count = u30()
        constant_strings = [u'']
        for _c in range(1, string_count):
            s = read_string()
            constant_strings.append(s)
        namespace_count = u30()
        for _c in range(1, namespace_count):
            read_bytes(1)  # kind
            u30()  # name
        ns_set_count = u30()
        for _c in range(1, ns_set_count):
            count = u30()
            for _c2 in range(count):
                u30()
        multiname_count = u30()
        MULTINAME_SIZES = {
            0x07: 2,  # QName
            0x0d: 2,  # QNameA
            0x0f: 1,  # RTQName
            0x10: 1,  # RTQNameA
            0x11: 0,  # RTQNameL
            0x12: 0,  # RTQNameLA
            0x09: 2,  # Multiname
            0x0e: 2,  # MultinameA
            0x1b: 1,  # MultinameL
            0x1c: 1,  # MultinameLA
        }
        multinames = [u'']
        for _c in range(1, multiname_count):
            kind = u30()
            assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
            if kind == 0x07:
                u30()  # namespace_idx
                name_idx = u30()
                multinames.append(constant_strings[name_idx])
            else:
                multinames.append('[MULTINAME kind: %d]' % kind)
                for _c2 in range(MULTINAME_SIZES[kind]):
                    u30()

        # Methods
        method_count = u30()
        MethodInfo = collections.namedtuple(
            'MethodInfo',
            ['NEED_ARGUMENTS', 'NEED_REST'])
        method_infos = []
        for method_id in range(method_count):
            param_count = u30()
            u30()  # return type
            for _ in range(param_count):
                u30()  # param type
            u30()  # name index (always 0 for youtube)
            flags = read_byte()
            if flags & 0x08 != 0:
                # Options present
                option_count = u30()
                for c in range(option_count):
                    u30()  # val
                    read_bytes(1)  # kind
            if flags & 0x80 != 0:
                # Param names present
                for _ in range(param_count):
                    u30()  # param name
            mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
            method_infos.append(mi)

        # Metadata
        metadata_count = u30()
        for _c in range(metadata_count):
            u30()  # name
            item_count = u30()
            for _c2 in range(item_count):
                u30()  # key
                u30()  # value

        def parse_traits_info():
            trait_name_idx = u30()
            kind_full = read_byte()
            kind = kind_full & 0x0f
            attrs = kind_full >> 4
            methods = {}
            if kind in [0x00, 0x06]:  # Slot or Const
                u30()  # Slot id
                u30()  # type_name_idx
                vindex = u30()
                if vindex != 0:
                    read_byte()  # vkind
            elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
                u30()  # disp_id
                method_idx = u30()
                methods[multinames[trait_name_idx]] = method_idx
            elif kind == 0x04:  # Class
                u30()  # slot_id
                u30()  # classi
            elif kind == 0x05:  # Function
                u30()  # slot_id
                function_idx = u30()
                methods[function_idx] = multinames[trait_name_idx]
            else:
                raise ExtractorError(u'Unsupported trait kind %d' % kind)

            if attrs & 0x4 != 0:  # Metadata present
                metadata_count = u30()
                for _c3 in range(metadata_count):
                    u30()  # metadata index

            return methods

        # Classes
        TARGET_CLASSNAME = u'SignatureDecipher'
        searched_idx = multinames.index(TARGET_CLASSNAME)
        searched_class_id = None
        class_count = u30()
        for class_id in range(class_count):
            name_idx = u30()
            if name_idx == searched_idx:
                # We found the class we're looking for!
                searched_class_id = class_id
            u30()  # super_name idx
            flags = read_byte()
            if flags & 0x08 != 0:  # Protected namespace is present
                u30()  # protected_ns_idx
            intrf_count = u30()
            for _c2 in range(intrf_count):
                u30()
            u30()  # iinit
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        if searched_class_id is None:
            raise ExtractorError(u'Target class %r not found' %
                                 TARGET_CLASSNAME)

        method_names = {}
        method_idxs = {}
        for class_id in range(class_count):
            u30()  # cinit
            trait_count = u30()
            for _c2 in range(trait_count):
                trait_methods = parse_traits_info()
                if class_id == searched_class_id:
                    method_names.update(trait_methods.items())
                    method_idxs.update(dict(
                        (idx, name)
                        for name, idx in trait_methods.items()))

        # Scripts
        script_count = u30()
        for _c in range(script_count):
            u30()  # init
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        # Method bodies
        method_body_count = u30()
        Method = collections.namedtuple('Method', ['code', 'local_count'])
        methods = {}
        for _c in range(method_body_count):
            method_idx = u30()
            u30()  # max_stack
            local_count = u30()
            u30()  # init_scope_depth
            u30()  # max_scope_depth
            code_length = u30()
            code = read_bytes(code_length)
            if method_idx in method_idxs:
                m = Method(code, local_count)
                methods[method_idxs[method_idx]] = m
            exception_count = u30()
            for _c2 in range(exception_count):
                u30()  # from
                u30()  # to
                u30()  # target
                u30()  # exc_type
                u30()  # var_name
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        assert p + code_reader.tell() == len(code_tag)
        assert len(methods) == len(method_idxs)

        method_pyfunctions = {}

        def extract_function(func_name):
            if func_name in method_pyfunctions:
                return method_pyfunctions[func_name]
            if func_name not in methods:
                raise ExtractorError(u'Cannot find function %r' % func_name)
            m = methods[func_name]

            def resfunc(args):
                registers = ['(this)'] + list(args) + [None] * m.local_count
                stack = []
                coder = io.BytesIO(m.code)
                while True:
                    opcode = struct.unpack('!B', coder.read(1))[0]
                    if opcode == 36:  # pushbyte
                        v = struct.unpack('!B', coder.read(1))[0]
                        stack.append(v)
                    elif opcode == 44:  # pushstring
                        idx = u30(coder)
                        stack.append(constant_strings[idx])
                    elif opcode == 48:  # pushscope
                        # We don't implement the scope register, so we'll just
                        # ignore the popped value
                        stack.pop()
                    elif opcode == 70:  # callproperty
                        index = u30(coder)
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        obj = stack.pop()
                        if mname == u'split':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, compat_str)
                            if args[0] == u'':
                                res = list(obj)
                            else:
                                res = obj.split(args[0])
                            stack.append(res)
                        elif mname == u'slice':
                            assert len(args) == 1
                            assert isinstance(args[0], int)
                            assert isinstance(obj, list)
                            res = obj[args[0]:]
                            stack.append(res)
                        elif mname == u'join':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, list)
                            res = args[0].join(obj)
                            stack.append(res)
                        elif mname in method_pyfunctions:
                            stack.append(method_pyfunctions[mname](args))
                        else:
                            raise NotImplementedError(
                                u'Unsupported property %r on %r'
                                % (mname, obj))
                    elif opcode == 72:  # returnvalue
                        res = stack.pop()
                        return res
                    elif opcode == 79:  # callpropvoid
                        index = u30(coder)
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        obj = stack.pop()
                        if mname == u'reverse':
                            assert isinstance(obj, list)
                            obj.reverse()
                        else:
                            raise NotImplementedError(
                                u'Unsupported (void) property %r on %r'
                                % (mname, obj))
                    elif opcode == 93:  # findpropstrict
                        index = u30(coder)
                        mname = multinames[index]
                        res = extract_function(mname)
                        stack.append(res)
                    elif opcode == 97:  # setproperty
                        index = u30(coder)
                        value = stack.pop()
                        idx = stack.pop()
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        assert isinstance(idx, int)
                        obj[idx] = value
                    elif opcode == 98:  # getlocal
                        index = u30(coder)
                        stack.append(registers[index])
                    elif opcode == 99:  # setlocal
                        index = u30(coder)
                        value = stack.pop()
                        registers[index] = value
                    elif opcode == 102:  # getproperty
                        index = u30(coder)
                        pname = multinames[index]
                        if pname == u'length':
                            obj = stack.pop()
                            assert isinstance(obj, list)
                            stack.append(len(obj))
                        else:  # Assume attribute access
                            idx = stack.pop()
                            assert isinstance(idx, int)
                            obj = stack.pop()
                            assert isinstance(obj, list)
                            stack.append(obj[idx])
                    elif opcode == 128:  # coerce
                        u30(coder)
                    elif opcode == 133:  # coerce_s
                        assert isinstance(stack[-1], (type(None), compat_str))
                    elif opcode == 164:  # modulo
                        value2 = stack.pop()
                        value1 = stack.pop()
                        res = value1 % value2
                        stack.append(res)
                    elif opcode == 208:  # getlocal_0
                        stack.append(registers[0])
                    elif opcode == 209:  # getlocal_1
                        stack.append(registers[1])
                    elif opcode == 210:  # getlocal_2
                        stack.append(registers[2])
                    elif opcode == 211:  # getlocal_3
                        stack.append(registers[3])
                    elif opcode == 214:  # setlocal_2
                        registers[2] = stack.pop()
                    elif opcode == 215:  # setlocal_3
                        registers[3] = stack.pop()
                    else:
                        raise NotImplementedError(
                            u'Unsupported opcode %d' % opcode)

            method_pyfunctions[func_name] = resfunc
            return resfunc

        initial_function = extract_function(u'decipher')
        return lambda s: initial_function([s])

    def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
        """Turn the encrypted s field into a working signature"""

        if player_url is not None:
            if player_url.startswith(u'//'):
                player_url = u'https:' + player_url
            try:
                player_id = (player_url, len(s))
                if player_id not in self._player_cache:
                    func = self._extract_signature_function(
                        video_id, player_url, len(s)
                    )
                    self._player_cache[player_id] = func
                func = self._player_cache[player_id]
                if self._downloader.params.get('youtube_print_sig_code'):
                    self._print_sig_code(func, len(s))
                return func(s)
            except Exception:
                tb = traceback.format_exc()
                self._downloader.report_warning(
                    u'Automatic signature extraction failed: ' + tb)

            self._downloader.report_warning(
                u'Warning: Falling back to static signature algorithm')

        return self._static_decrypt_signature(
            s, video_id, player_url, age_gate)

    def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
        if age_gate:
            # The videos with age protection use another player, so the
            # algorithms can be different.
            if len(s) == 86:
                return s[2:63] + s[82] + s[64:82] + s[63]

        if len(s) == 93:
            return s[86:29:-1] + s[88] + s[28:5:-1]
        elif len(s) == 92:
            return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
        elif len(s) == 91:
            return s[84:27:-1] + s[86] + s[26:5:-1]
        elif len(s) == 90:
            return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
        elif len(s) == 89:
            return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
        elif len(s) == 88:
            return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
        elif len(s) == 87:
            return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
        elif len(s) == 86:
            return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
        elif len(s) == 85:
            return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
        elif len(s) == 84:
            return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
        elif len(s) == 83:
            return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
        elif len(s) == 82:
            return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
        elif len(s) == 81:
            return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
        elif len(s) == 80:
            return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
        elif len(s) == 79:
            return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]

        else:
            raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))

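    # Each branch above hard-codes one hand-derived reordering for a given
    # signature length; e.g. for an 83-character signature the result is the
    # reverse of s[64:81], then s[0], then the reverse of s[1:63], then s[63].
    # New player versions are normally handled by the automatic extractor (or by
    # pasting the youtube_print_sig_code output here).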
    def _get_available_subtitles(self, video_id, webpage):
        try:
            sub_list = self._download_webpage(
                'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
                video_id, note=False)
        except ExtractorError as err:
            self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
            return {}
        lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)

        sub_lang_list = {}
        for l in lang_list:
            lang = l[1]
            params = compat_urllib_parse.urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
                'name': l[0].encode('utf-8'),
            })
            url = u'http://www.youtube.com/api/timedtext?' + params
            sub_lang_list[lang] = url
        if not sub_lang_list:
            self._downloader.report_warning(u'video doesn\'t have subtitles')
            return {}
        return sub_lang_list

    def _get_available_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
           argument to speed up the process."""
        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for %s' % video_id
        if mobj is None:
            self._downloader.report_warning(err_msg)
            return {}
        player_config = json.loads(mobj.group(1))
        try:
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            original_lang_node = caption_list.find('track')
            if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr':
                self._downloader.report_warning(u'Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']

            sub_lang_list = {}
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                    'tlang': sub_lang,
                    'fmt': sub_format,
                    'ts': timestamp,
                    'kind': 'asr',
                })
                sub_lang_list[sub_lang] = caption_url + '&' + params
            return sub_lang_list
        # An extractor error can be raised by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}

    def _print_formats(self, formats):
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]%s' % (x, self._video_extensions.get(x, 'flv'),
                                         self._video_dimensions.get(x, '???'),
                                         ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))

    def _extract_id(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(2)
        return video_id

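    # _extract_id returns group 2 of _VALID_URL, so (per the pattern above) all
    # of the usual spellings map to the same 11-character ID, e.g.
    #   https://www.youtube.com/watch?v=BaW_jenozKc
    #   https://youtu.be/BaW_jenozKc
    #   https://www.youtube.com/embed/BaW_jenozKc
    #   BaW_jenozKc                (the naked ID is accepted as well)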
    def _get_video_url_list(self, url_map):
        """
        Transform a dictionary in the format {itag:url} to a list of (itag, url)
        with the requested formats.
        """
        req_format = self._downloader.params.get('format', None)
        format_limit = self._downloader.params.get('format_limit', None)
        available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
        if format_limit is not None and format_limit in available_formats:
            format_list = available_formats[available_formats.index(format_limit):]
        else:
            format_list = available_formats
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            raise ExtractorError(u'no known formats available for video')
        if self._downloader.params.get('listformats', None):
            self._print_formats(existing_formats)
            return
        if req_format is None or req_format == 'best':
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])]  # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])]  # worst quality
        elif req_format in ('-1', 'all'):
            video_url_list = [(f, url_map[f]) for f in existing_formats]  # All formats
        else:
            # Specific formats. We pick the first in a slash-delimited sequence.
            # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
            # available in the specified format. For example,
            # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
            # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
            # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
            req_formats = req_format.split('/')
            video_url_list = None
            for rf in req_formats:
                if rf in url_map:
                    video_url_list = [(rf, url_map[rf])]
                    break
                if rf in self._video_formats_map:
                    for srf in self._video_formats_map[rf]:
                        if srf in url_map:
                            video_url_list = [(srf, url_map[srf])]
                            break
                    else:
                        continue
                    break
        if video_url_list is None:
            raise ExtractorError(u'requested format not available')
        return video_url_list

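    # Format selection in a nutshell: with url_map = {'22': ..., '43': ...},
    # a requested format of '18/22' picks '22' (the first requested itag that is
    # present), 'mp4' expands via _video_formats_map to ['38', '37', '22', '18']
    # and also ends up at '22', while 'best'/'worst' take the first/last entry of
    # the quality-ordered _available_formats list that exists in url_map.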
    def _extract_from_m3u8(self, manifest_url, video_id):
        url_map = {}
        def _get_urls(_manifest):
            lines = _manifest.split('\n')
            urls = filter(lambda l: l and not l.startswith('#'),
                          lines)
            return urls
        manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
        formats_urls = _get_urls(manifest)
        for format_url in formats_urls:
            itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
            url_map[itag] = format_url
        return url_map

    def _extract_annotations(self, video_id):
        url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
        return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        self.report_video_info_webpage_download(video_id)
        if re.search(r'player-age-gate-content">', video_webpage) is not None:
            self.report_age_confirmation()
            age_gate = True
            # We simulate the access to the video from www.youtube.com/v/{video_id},
            # which can be viewed without logging into Youtube
            data = compat_urllib_parse.urlencode({'video_id': video_id,
                                                  'el': 'player_embedded',
                                                  'gl': 'US',
                                                  'hl': 'en',
                                                  'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                                                  'asv': 3,
                                                  'sts': '1588',
                                                  })
            video_info_url = 'https://www.youtube.com/get_video_info?' + data
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                                        note=False,
                                                        errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
        else:
            age_gate = False
            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                                  % (video_id, el_type))
                video_info_webpage = self._download_webpage(video_info_url, video_id,
                                                            note=False,
                                                            errnote='unable to download video info webpage')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
            else:
                raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        if 'view_count' in video_info:
            view_count = int(video_info['view_count'][0])
        else:
            view_count = None

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' in video_info:
            video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
        else:
            self._downloader.report_warning(u'Unable to extract video title')
            video_title = u'_'

        # thumbnail image
        # We try first to get a high quality image:
        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                            video_webpage, re.DOTALL)
        if m_thumb is not None:
            video_thumbnail = m_thumb.group(1)
        elif 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = None
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, video_webpage)

        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, video_webpage)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # annotations
        video_annotations = None
        if self._downloader.params.get('writeannotations', False):
            video_annotations = self._extract_annotations(video_id)

        # Decide which formats to download

        try:
            mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
            if not mobj:
                raise ValueError('Could not find vevo ID')
            info = json.loads(mobj.group(1))
            args = info['args']
            # Easy way to know if the 's' value is in url_encoded_fmt_stream_map;
            # if it is, the signatures are encrypted
            if 'url_encoded_fmt_stream_map' not in args:
                raise ValueError(u'No stream_map present')  # caught below
            re_signature = re.compile(r'[&,]s=')
            m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
            if m_s is not None:
                self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
                video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
            m_s = re_signature.search(args.get('adaptive_fmts', u''))
            if m_s is not None:
                if 'adaptive_fmts' in video_info:
                    video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
                else:
                    video_info['adaptive_fmts'] = [args['adaptive_fmts']]
        except ValueError:
            pass

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
            encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
            if 'rtmpe%3Dyes' in encoded_url_map:
                raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
            url_map = {}
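            # url_encoded_fmt_stream_map / adaptive_fmts is a comma-separated
            # list of URL-encoded entries (itag=...&url=...&sig=... or &s=...);
            # each entry is parsed with compat_parse_qs and keyed by its itag.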
            for url_data_str in encoded_url_map.split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    url = url_data['url'][0]
                    if 'sig' in url_data:
                        url += '&signature=' + url_data['sig'][0]
                    elif 's' in url_data:
                        encrypted_sig = url_data['s'][0]
                        if self._downloader.params.get('verbose'):
                            if age_gate:
                                if player_url is None:
                                    player_version = 'unknown'
                                else:
                                    player_version = self._search_regex(
                                        r'-(.+)\.swf$', player_url,
                                        u'flash player', fatal=False)
                                player_desc = 'flash player %s' % player_version
                            else:
                                player_version = self._search_regex(
                                    r'html5player-(.+?)\.js', video_webpage,
                                    'html5 player', fatal=False)
                                player_desc = u'html5 player %s' % player_version

                            parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                            self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                                           (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                        if not age_gate:
                            jsplayer_url_json = self._search_regex(
                                r'"assets":.+?"js":\s*("[^"]+")',
                                video_webpage, u'JS player URL')
                            player_url = json.loads(jsplayer_url_json)

                        signature = self._decrypt_signature(
                            encrypted_sig, video_id, player_url, age_gate)
                        url += '&signature=' + signature
                    if 'ratebypass' not in url:
                        url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url
            video_url_list = self._get_video_url_list(url_map)
            if not video_url_list:
                return
        elif video_info.get('hlsvp'):
            manifest_url = video_info['hlsvp'][0]
            url_map = self._extract_from_m3u8(manifest_url, video_id)
            video_url_list = self._get_video_url_list(url_map)
            if not video_url_list:
                return

        else:
            raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

        results = []
        for itag, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(itag, 'flv')

            video_format = '{0} - {1}{2}'.format(itag if itag else video_extension,
                                                 self._video_dimensions.get(itag, '???'),
                                                 ' ('+self._special_itags[itag]+')' if itag in self._special_itags else '')

            results.append({
                'id': video_id,
                'url': video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'format_id': itag,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration,
                'age_limit': 18 if age_gate else 0,
                'annotations': video_annotations,
                'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
                'view_count': view_count,
            })
        return results

class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                           |  p/
                        )
                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_initialize(self):
        self._login()

    def _ids_to_results(self, ids):
        return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
                for vid_id in ids]

    def _extract_mix(self, playlist_id):
        # The mixes are generated from a single video;
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[2:], playlist_id)
        webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
        title_span = (get_element_by_attribute('class', 'title long-title', webpage) or
                      get_element_by_attribute('class', 'title ', webpage))
        title = clean_html(title_span)
        video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s' % re.escape(playlist_id)
        ids = orderedSet(re.findall(video_re, webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

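    # Mix playlist IDs are 'RD' followed by the 11-character seed video ID
    # (13 characters in total), which is how _real_extract below recognizes them
    # and routes the URL through _extract_mix instead of the paginated pages.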
c5e8d7af
PH
1559 def _real_extract(self, url):
1560 # Extract playlist id
1561 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1562 if mobj is None:
1563 raise ExtractorError(u'Invalid URL: %s' % url)
47192f92
FV
1564 playlist_id = mobj.group(1) or mobj.group(2)
1565
1566 # Check if it's a video-specific URL
7c61bd36 1567 query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
47192f92
FV
1568 if 'v' in query_dict:
1569 video_id = query_dict['v'][0]
1570 if self._downloader.params.get('noplaylist'):
1571 self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
7012b23c 1572 return self.url_result(video_id, 'Youtube', video_id=video_id)
47192f92
FV
1573 else:
1574 self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
c5e8d7af 1575
652cdaa2
JMF
1576 if len(playlist_id) == 13: # 'RD' + 11 characters for the video id
1577 # Mixes require a custom extraction process
1578 return self._extract_mix(playlist_id)
1579
dcbb4580
JMF
1580 # Extract the video ids from the playlist pages
1581 ids = []
c5e8d7af 1582
755eb032 1583 for page_num in itertools.count(1):
dcbb4580 1584 url = self._TEMPLATE_URL % (playlist_id, page_num)
c5e8d7af 1585 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
6e47b51e
JMF
1586 matches = re.finditer(self._VIDEO_RE, page)
1587 # We remove the duplicates and the link with index 0
1588 # (it's not the first video of the playlist)
1589 new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
dcbb4580 1590 ids.extend(new_ids)
c5e8d7af 1591
dcbb4580 1592 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
c5e8d7af
PH
1593 break
1594
dcbb4580 1595 playlist_title = self._og_search_title(page)
c5e8d7af 1596
652cdaa2 1597 url_results = self._ids_to_results(ids)
dcbb4580 1598 return self.playlist_result(url_results, playlist_id, playlist_title)
c5e8d7af
PH
1599
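# Editor-added sketch of the pagination pattern used by _real_extract above:
# fetch numbered playlist pages until the "next" indicator disappears.
# fetch_page is a hypothetical stand-in for self._download_webpage; the
# regexes mirror _VIDEO_RE and _MORE_PAGES_INDICATOR.
import itertools
import re

def collect_playlist_ids(fetch_page, playlist_id,
                         template_url='https://www.youtube.com/playlist?list=%s&page=%s'):
    video_re = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    ids = []
    for page_num in itertools.count(1):
        page = fetch_page(template_url % (playlist_id, page_num))
        for m in re.finditer(video_re, page):
            # skip duplicates and the index=0 link, as the extractor does
            if m.group('index') != '0' and m.group('id') not in ids:
                ids.append(m.group('id'))
        if re.search(r'data-link-type="next"', page) is None:
            break
    return ids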
1600
1601class YoutubeChannelIE(InfoExtractor):
0f818663 1602 IE_DESC = u'YouTube.com channels'
c5e8d7af 1603 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
c5e8d7af 1604 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
252580c5 1605 _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
c5e8d7af
PH
1606 IE_NAME = u'youtube:channel'
1607
1608 def extract_videos_from_page(self, page):
1609 ids_in_page = []
1610 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1611 if mobj.group(1) not in ids_in_page:
1612 ids_in_page.append(mobj.group(1))
1613 return ids_in_page
1614
1615 def _real_extract(self, url):
1616 # Extract channel id
1617 mobj = re.match(self._VALID_URL, url)
1618 if mobj is None:
1619 raise ExtractorError(u'Invalid URL: %s' % url)
1620
1621 # Download channel page
1622 channel_id = mobj.group(1)
1623 video_ids = []
b9643eed
JMF
1624 url = 'https://www.youtube.com/channel/%s/videos' % channel_id
1625 channel_page = self._download_webpage(url, channel_id)
1626 if re.search(r'channel-header-autogenerated-label', channel_page) is not None:
1627 autogenerated = True
1628 else:
1629 autogenerated = False
c5e8d7af 1630
b9643eed
JMF
1631 if autogenerated:
1632 # The videos are contained in a single page;
1633 # the ajax pages can't be used, as they are empty
1634 video_ids = self.extract_videos_from_page(channel_page)
1635 else:
1636 # Download all channel pages using the json-based channel_ajax query
1637 for pagenum in itertools.count(1):
1638 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1639 page = self._download_webpage(url, channel_id,
1640 u'Downloading page #%s' % pagenum)
1641
1642 page = json.loads(page)
1643
1644 ids_in_page = self.extract_videos_from_page(page['content_html'])
1645 video_ids.extend(ids_in_page)
1646
1647 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1648 break
c5e8d7af
PH
1649
1650 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1651
7012b23c
PH
1652 url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
1653 for video_id in video_ids]
1654 return self.playlist_result(url_entries, channel_id)
c5e8d7af
PH
1655
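# Editor-added illustration of the ajax response shape the channel loop above
# relies on; the field names come from the code, the sample values are made up.
sample_channel_ajax_response = {
    # scraped by extract_videos_from_page() for /watch?v=... ids
    'content_html': '<li><a href="/watch?v=XXXXXXXXXXX&feature=plcp">A video</a></li>',
    # pagination stops once this no longer contains 'yt-uix-load-more'
    'load_more_widget_html': '<button class="yt-uix-load-more">Load more</button>',
}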
1656
1657class YoutubeUserIE(InfoExtractor):
0f818663 1658 IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
57da92b7 1659 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
c5e8d7af
PH
1660 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1661 _GDATA_PAGE_SIZE = 50
fd9cf738 1662 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
c5e8d7af
PH
1663 IE_NAME = u'youtube:user'
1664
e3ea4790 1665 @classmethod
f4b05232 1666 def suitable(cls, url):
e3ea4790
JMF
1667 # Don't return True if the url can be extracted with another youtube
1668 # extractor; the regex is too permissive and it would match.
1669 other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
1670 if any(ie.suitable(url) for ie in other_ies): return False
f4b05232
JMF
1671 else: return super(YoutubeUserIE, cls).suitable(url)
1672
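# Editor-added sketch of the deferral pattern in suitable() above: gather the
# other *IE classes defined in this module and let them claim the URL first.
# namespace would be globals() in the real method; this mirrors, not replaces, it.
def claimed_by_another_ie(url, this_cls, namespace):
    other_ies = (klass for name, klass in namespace.items()
                 if name.endswith('IE') and klass is not this_cls)
    return any(ie.suitable(url) for ie in other_ies)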
c5e8d7af
PH
1673 def _real_extract(self, url):
1674 # Extract username
1675 mobj = re.match(self._VALID_URL, url)
1676 if mobj is None:
1677 raise ExtractorError(u'Invalid URL: %s' % url)
1678
1679 username = mobj.group(1)
1680
1681 # Download video ids using the YouTube Data API. Result size per
1682 # query is limited (currently to 50 videos), so we need to query
1683 # page by page until a page returns no video ids - that means we
1684 # got all of them.
1685
1686 video_ids = []
c5e8d7af 1687
755eb032 1688 for pagenum in itertools.count(0):
c5e8d7af
PH
1689 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1690
1691 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1692 page = self._download_webpage(gdata_url, username,
1693 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1694
fd9cf738
JMF
1695 try:
1696 response = json.loads(page)
1697 except ValueError as err:
1698 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
71c82637
JMF
1699 if 'entry' not in response['feed']:
1700 # Number of videos is a multiple of self._GDATA_PAGE_SIZE
1701 break
fd9cf738 1702
c5e8d7af
PH
1703 # Extract video identifiers
1704 ids_in_page = []
fd9cf738
JMF
1705 for entry in response['feed']['entry']:
1706 ids_in_page.append(entry['id']['$t'].split('/')[-1])
c5e8d7af
PH
1707 video_ids.extend(ids_in_page)
1708
1709 # A little optimization: if the current page is not
1710 # "full", i.e. does not contain _GDATA_PAGE_SIZE video ids,
1711 # we can assume that this page is the last one - there
1712 # are no more ids on further pages, so there is no need
1713 # to query again.
1714
1715 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1716 break
1717
7012b23c
PH
1718 url_results = [
1719 self.url_result(video_id, 'Youtube', video_id=video_id)
1720 for video_id in video_ids]
1721 return self.playlist_result(url_results, playlist_title=username)
1722
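# Editor-added illustration of the start-index arithmetic used above: GData
# indices are 1-based, so with a page size of 50, page 0 starts at index 1,
# page 1 at 51, and so on.
def gdata_start_index(pagenum, page_size=50):
    return pagenum * page_size + 1

# gdata_start_index(0) -> 1, gdata_start_index(1) -> 51, gdata_start_index(2) -> 101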
b05654f0
PH
1723
1724class YoutubeSearchIE(SearchInfoExtractor):
0f818663 1725 IE_DESC = u'YouTube.com searches'
b05654f0
PH
1726 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1727 _MAX_RESULTS = 1000
1728 IE_NAME = u'youtube:search'
1729 _SEARCH_KEY = 'ytsearch'
1730
1731 def report_download_page(self, query, pagenum):
1732 """Report attempt to download search page with given number."""
1733 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1734
1735 def _get_n_results(self, query, n):
1736 """Get a specified number of results for a query"""
1737
1738 video_ids = []
1739 pagenum = 0
1740 limit = n
1741
1742 while (50 * pagenum) < limit:
1743 self.report_download_page(query, pagenum+1)
1744 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1745 request = compat_urllib_request.Request(result_url)
1746 try:
1747 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1748 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1749 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1750 api_response = json.loads(data)['data']
1751
1752 if 'items' not in api_response:
1753 raise ExtractorError(u'[youtube] No video results')
1754
1755 new_ids = list(video['id'] for video in api_response['items'])
1756 video_ids += new_ids
1757
1758 limit = min(n, api_response['totalItems'])
1759 pagenum += 1
1760
1761 if len(video_ids) > n:
1762 video_ids = video_ids[:n]
7012b23c
PH
1763 videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
1764 for video_id in video_ids]
b05654f0 1765 return self.playlist_result(videos, query)
75dff0ee 1766
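# Editor-added usage sketch: the search extractor is normally driven through
# its _SEARCH_KEY, e.g. "ytsearch5:some query" asks for the first 5 results
# (and "ytsearchdate5:..." below orders them by upload date). The number of
# 50-result API pages the loop above needs is the ceiling of n / 50:
def search_pages_needed(n, page_size=50):
    return -(-n // page_size)  # ceiling division

# search_pages_needed(5) -> 1, search_pages_needed(120) -> 3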
a3dd9248
CM
1767class YoutubeSearchDateIE(YoutubeSearchIE):
1768 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
1769 _SEARCH_KEY = 'ytsearchdate'
08fb86c4 1770 IE_DESC = u'YouTube.com searches, newest videos first'
75dff0ee
JMF
1771
1772class YoutubeShowIE(InfoExtractor):
0f818663 1773 IE_DESC = u'YouTube.com (multi-season) shows'
75dff0ee
JMF
1774 _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
1775 IE_NAME = u'youtube:show'
1776
1777 def _real_extract(self, url):
1778 mobj = re.match(self._VALID_URL, url)
1779 show_name = mobj.group(1)
1780 webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
1781 # There's one playlist for each season of the show
1782 m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
1783 self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
1784 return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist') for season in m_seasons]
04cc9617
JMF
1785
1786
b2e8bc1b 1787class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
d7ae0639
JMF
1788 """
1789 Base class for extractors that fetch info from
1790 http://www.youtube.com/feed_ajax
1791 Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
1792 """
b2e8bc1b 1793 _LOGIN_REQUIRED = True
43ba5456
JMF
1794 # use action_load_personal_feed instead of action_load_system_feed
1795 _PERSONAL_FEED = False
04cc9617 1796
d7ae0639
JMF
1797 @property
1798 def _FEED_TEMPLATE(self):
43ba5456
JMF
1799 action = 'action_load_system_feed'
1800 if self._PERSONAL_FEED:
1801 action = 'action_load_personal_feed'
1802 return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)
d7ae0639
JMF
1803
1804 @property
1805 def IE_NAME(self):
1806 return u'youtube:%s' % self._FEED_NAME
04cc9617 1807
81f0259b 1808 def _real_initialize(self):
b2e8bc1b 1809 self._login()
81f0259b 1810
04cc9617
JMF
1811 def _real_extract(self, url):
1812 feed_entries = []
0e44d838
JMF
1813 paging = 0
1814 for i in itertools.count(1):
d7ae0639
JMF
1815 info = self._download_webpage(self._FEED_TEMPLATE % paging,
1816 u'%s feed' % self._FEED_NAME,
04cc9617
JMF
1817 u'Downloading page %s' % i)
1818 info = json.loads(info)
1819 feed_html = info['feed_html']
43ba5456 1820 m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
04cc9617 1821 ids = orderedSet(m.group(1) for m in m_ids)
7012b23c
PH
1822 feed_entries.extend(
1823 self.url_result(video_id, 'Youtube', video_id=video_id)
1824 for video_id in ids)
04cc9617
JMF
1825 if info['paging'] is None:
1826 break
0e44d838 1827 paging = info['paging']
d7ae0639
JMF
1828 return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1829
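# Editor-added sketch mirroring the _FEED_TEMPLATE property above, to show the
# URL a subclass ends up requesting (feed_name and the personal flag stand in
# for the subclass attributes; the trailing %s is later filled with the paging
# token returned by the previous ajax response).
def feed_template(feed_name, personal=False):
    action = 'action_load_personal_feed' if personal else 'action_load_system_feed'
    return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, feed_name)

# feed_template('subscriptions')
# -> 'http://www.youtube.com/feed_ajax?action_load_system_feed=1&feed_name=subscriptions&paging=%s'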
1830class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
1831 IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
1832 _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
1833 _FEED_NAME = 'subscriptions'
1834 _PLAYLIST_TITLE = u'Youtube Subscriptions'
1835
1836class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
1837 IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
1838 _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
1839 _FEED_NAME = 'recommended'
1840 _PLAYLIST_TITLE = u'Youtube Recommended videos'
c626a3d9 1841
43ba5456
JMF
1842class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
1843 IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
1844 _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
1845 _FEED_NAME = 'watch_later'
1846 _PLAYLIST_TITLE = u'Youtube Watch Later'
43ba5456 1847 _PERSONAL_FEED = True
c626a3d9 1848
f459d170
JMF
1849class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
1850 IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
1851 _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
1852 _FEED_NAME = 'history'
1853 _PERSONAL_FEED = True
1854 _PLAYLIST_TITLE = u'Youtube Watch History'
1855
c626a3d9
JMF
1856class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
1857 IE_NAME = u'youtube:favorites'
1858 IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
c7a7750d 1859 _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
c626a3d9
JMF
1860 _LOGIN_REQUIRED = True
1861
1862 def _real_extract(self, url):
1863 webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
1864 playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, u'favourites playlist id')
1865 return self.url_result(playlist_id, 'YoutubePlaylist')
15870e90
PH
1866
1867
1868class YoutubeTruncatedURLIE(InfoExtractor):
1869 IE_NAME = 'youtube:truncated_url'
1870 IE_DESC = False # Do not list
1871 _VALID_URL = r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$'
1872
1873 def _real_extract(self, url):
1874 raise ExtractorError(
1875 u'Did you forget to quote the URL? Remember that & is a meta '
1876 u'character in most shells, so you want to put the URL in quotes, '
1877 u'like youtube-dl '
1878 u'\'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\''
1879 u' (or simply youtube-dl BaW_jenozKc ).',
1880 expected=True)